diff --git a/site2/website/versioned_docs/version-2.10.x/about.md b/site2/website/versioned_docs/version-2.10.x/about.md new file mode 100644 index 0000000000000..478ac8dd053e8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/about.md @@ -0,0 +1,56 @@ +--- +slug: / +id: about +title: Welcome to the doc portal! +sidebar_label: "About" +--- + +import BlockLinks from "@site/src/components/BlockLinks"; +import BlockLink from "@site/src/components/BlockLink"; +import { docUrl } from "@site/src/utils/index"; + + +# Welcome to the doc portal! +*** + +This portal holds a variety of support documents to help you work with Pulsar . If you’re a beginner, there are tutorials and explainers to help you understand Pulsar and how it works. + +If you’re an experienced coder, review this page to learn the easiest way to access the specific content you’re looking for. + +## Get Started Now + + + + + + + + + +## Navigation +*** + +There are several ways to get around in the doc portal. The index navigation pane is a table of contents for the entire archive. The archive is divided into sections, like chapters in a book. Click the title of the topic to view it. + +In-context links provide an easy way to immediately reference related topics. Click the underlined term to view the topic. + +Links to related topics can be found at the bottom of each topic page. Click the link to view the topic. + +![Page Linking](/assets/page-linking.png) + +## Continuous Improvement +*** +As you probably know, we are working on a new user experience for our documentation portal that will make learning about and building on top of Apache Pulsar a much better experience. Whether you need overview concepts, how-to procedures, curated guides or quick references, we’re building content to support it. This welcome page is just the first step. We will be providing updates every month. + +## Help Improve These Documents +*** + +You’ll notice an Edit button at the bottom and top of each page. 
Click it to open a landing page with instructions for requesting changes to posted documents. These are your resources. Participation is not only welcomed – it’s essential! + +## Join the Community! +*** + +The Pulsar community on GitHub is active, passionate, and knowledgeable. Join discussions, voice opinions, suggest features, and dive into the code itself. Find your Pulsar family here at [apache/pulsar](https://github.com/apache/pulsar). + +An equally passionate community can be found in the [Pulsar Slack channel](https://apache-pulsar.slack.com/). You’ll need an invitation to join, but many GitHub Pulsar community members are Slack members too. Join, hang out, learn, and make some new friends. + diff --git a/site2/website/versioned_docs/version-2.10.x/adaptors-kafka.md b/site2/website/versioned_docs/version-2.10.x/adaptors-kafka.md new file mode 100644 index 0000000000000..e738f9d94b6a9 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/adaptors-kafka.md @@ -0,0 +1,276 @@ +--- +id: adaptors-kafka +title: Pulsar adaptor for Apache Kafka +sidebar_label: "Kafka client wrapper" +original_id: adaptors-kafka +--- + + +Pulsar provides an easy option for applications that are currently written using the [Apache Kafka](http://kafka.apache.org) Java client API. + +## Using the Pulsar Kafka compatibility wrapper + +In an existing application, change the regular Kafka client dependency and replace it with the Pulsar Kafka wrapper. Remove the following dependency in `pom.xml`: + +```xml + + + org.apache.kafka + kafka-clients + 0.10.2.1 + + +``` + +Then include this dependency for the Pulsar Kafka wrapper: + +```xml + + + org.apache.pulsar + pulsar-client-kafka + @pulsar:version@ + + +``` + +With the new dependency, the existing code works without any changes. You need to adjust the configuration, and make sure it points the +producers and consumers to Pulsar service rather than Kafka, and uses a particular +Pulsar topic. 
+ +## Using the Pulsar Kafka compatibility wrapper together with existing Kafka client + +When migrating from Kafka to Pulsar, the application might use the original Kafka client +and the Pulsar Kafka wrapper together during migration. You should consider using the +unshaded Pulsar Kafka client wrapper. + +```xml + + + org.apache.pulsar + pulsar-client-kafka-original + @pulsar:version@ + + +``` + +When using this dependency, construct producers using `org.apache.kafka.clients.producer.PulsarKafkaProducer` +instead of `org.apache.kafka.clients.producer.KafkaProducer`, and construct consumers using `org.apache.kafka.clients.producer.PulsarKafkaConsumer`. + +## Producer example + +```java + +// Topic needs to be a regular Pulsar topic +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); + +props.put("key.serializer", IntegerSerializer.class.getName()); +props.put("value.serializer", StringSerializer.class.getName()); + +Producer producer = new KafkaProducer(props); + +for (int i = 0; i < 10; i++) { + producer.send(new ProducerRecord(topic, i, "hello-" + i)); + log.info("Message {} sent successfully", i); +} + +producer.close(); + +``` + +## Consumer example + +```java + +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); +props.put("group.id", "my-subscription-name"); +props.put("enable.auto.commit", "false"); +props.put("key.deserializer", IntegerDeserializer.class.getName()); +props.put("value.deserializer", StringDeserializer.class.getName()); + +Consumer consumer = new KafkaConsumer(props); +consumer.subscribe(Arrays.asList(topic)); + +while (true) { + ConsumerRecords records = consumer.poll(100); + records.forEach(record -> { + log.info("Received record: {}", record); + }); + + // Commit last offset + 
consumer.commitSync(); +} + +``` + +## Complete Examples + +You can find the complete producer and consumer examples [here](https://github.com/apache/pulsar-adapters/tree/master/pulsar-client-kafka-compat/pulsar-client-kafka-tests/src/test/java/org/apache/pulsar/client/kafka/compat/examples). + +## Compatibility matrix + +Currently the Pulsar Kafka wrapper supports most of the operations offered by the Kafka API. + +### Producer + +APIs: + +| Producer Method | Supported | Notes | +|:------------------------------------------------------------------------------|:----------|:-------------------------------------------------------------------------| +| `Future send(ProducerRecord record)` | Yes | | +| `Future send(ProducerRecord record, Callback callback)` | Yes | | +| `void flush()` | Yes | | +| `List partitionsFor(String topic)` | No | | +| `Map metrics()` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | + +Properties: + +| Config property | Supported | Notes | +|:----------------------------------------|:----------|:------------------------------------------------------------------------------| +| `acks` | Ignored | Durability and quorum writes are configured at the namespace level | +| `auto.offset.reset` | Yes | It uses a default value of `earliest` if you do not give a specific setting. | +| `batch.size` | Ignored | | +| `bootstrap.servers` | Yes | | +| `buffer.memory` | Ignored | | +| `client.id` | Ignored | | +| `compression.type` | Yes | Allows `gzip` and `lz4`. No `snappy`. 
| +| `connections.max.idle.ms` | Yes | Only support up to 2,147,483,647,000(Integer.MAX_VALUE * 1000) ms of idle time| +| `interceptor.classes` | Yes | | +| `key.serializer` | Yes | | +| `linger.ms` | Yes | Controls the group commit time when batching messages | +| `max.block.ms` | Ignored | | +| `max.in.flight.requests.per.connection` | Ignored | In Pulsar ordering is maintained even with multiple requests in flight | +| `max.request.size` | Ignored | | +| `metric.reporters` | Ignored | | +| `metrics.num.samples` | Ignored | | +| `metrics.sample.window.ms` | Ignored | | +| `partitioner.class` | Yes | | +| `receive.buffer.bytes` | Ignored | | +| `reconnect.backoff.ms` | Ignored | | +| `request.timeout.ms` | Ignored | | +| `retries` | Ignored | Pulsar client retries with exponential backoff until the send timeout expires. | +| `send.buffer.bytes` | Ignored | | +| `timeout.ms` | Yes | | +| `value.serializer` | Yes | | + + +### Consumer + +The following table lists consumer APIs. + +| Consumer Method | Supported | Notes | +|:--------------------------------------------------------------------------------------------------------|:----------|:------| +| `Set assignment()` | No | | +| `Set subscription()` | Yes | | +| `void subscribe(Collection topics)` | Yes | | +| `void subscribe(Collection topics, ConsumerRebalanceListener callback)` | No | | +| `void assign(Collection partitions)` | No | | +| `void subscribe(Pattern pattern, ConsumerRebalanceListener callback)` | No | | +| `void unsubscribe()` | Yes | | +| `ConsumerRecords poll(long timeoutMillis)` | Yes | | +| `void commitSync()` | Yes | | +| `void commitSync(Map offsets)` | Yes | | +| `void commitAsync()` | Yes | | +| `void commitAsync(OffsetCommitCallback callback)` | Yes | | +| `void commitAsync(Map offsets, OffsetCommitCallback callback)` | Yes | | +| `void seek(TopicPartition partition, long offset)` | Yes | | +| `void seekToBeginning(Collection partitions)` | Yes | | +| `void seekToEnd(Collection partitions)` 
| Yes | | +| `long position(TopicPartition partition)` | Yes | | +| `OffsetAndMetadata committed(TopicPartition partition)` | Yes | | +| `Map metrics()` | No | | +| `List partitionsFor(String topic)` | No | | +| `Map> listTopics()` | No | | +| `Set paused()` | No | | +| `void pause(Collection partitions)` | No | | +| `void resume(Collection partitions)` | No | | +| `Map offsetsForTimes(Map timestampsToSearch)` | No | | +| `Map beginningOffsets(Collection partitions)` | No | | +| `Map endOffsets(Collection partitions)` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | +| `void wakeup()` | No | | + +Properties: + +| Config property | Supported | Notes | +|:--------------------------------|:----------|:------------------------------------------------------| +| `group.id` | Yes | Maps to a Pulsar subscription name | +| `max.poll.records` | Yes | | +| `max.poll.interval.ms` | Ignored | Messages are "pushed" from broker | +| `session.timeout.ms` | Ignored | | +| `heartbeat.interval.ms` | Ignored | | +| `bootstrap.servers` | Yes | Needs to point to a single Pulsar service URL | +| `enable.auto.commit` | Yes | | +| `auto.commit.interval.ms` | Ignored | With auto-commit, acks are sent immediately to broker | +| `partition.assignment.strategy` | Ignored | | +| `auto.offset.reset` | Yes | Only support earliest and latest. | +| `fetch.min.bytes` | Ignored | | +| `fetch.max.bytes` | Ignored | | +| `fetch.max.wait.ms` | Ignored | | +| `interceptor.classes` | Yes | | +| `metadata.max.age.ms` | Ignored | | +| `max.partition.fetch.bytes` | Ignored | | +| `send.buffer.bytes` | Ignored | | +| `receive.buffer.bytes` | Ignored | | +| `client.id` | Ignored | | + + +## Customize Pulsar configurations + +You can configure Pulsar authentication provider directly from the Kafka properties. 
+ +### Pulsar client properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.authentication.class`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-org.apache.pulsar.client.api.Authentication-) | | Configure to auth provider. For example, `org.apache.pulsar.client.impl.auth.AuthenticationTls`.| +| [`pulsar.authentication.params.map`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.util.Map-) | | Map which represents parameters for the Authentication-Plugin. | +| [`pulsar.authentication.params.string`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.lang.String-) | | String which represents parameters for the Authentication-Plugin, for example, `key1:val1,key2:val2`. | +| [`pulsar.use.tls`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTls-boolean-) | `false` | Enable TLS transport encryption. | +| [`pulsar.tls.trust.certs.file.path`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsTrustCertsFilePath-java.lang.String-) | | Path for the TLS trust certificate store. | +| [`pulsar.tls.allow.insecure.connection`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsAllowInsecureConnection-boolean-) | `false` | Accept self-signed certificates from brokers. | +| [`pulsar.operation.timeout.ms`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setOperationTimeout-int-java.util.concurrent.TimeUnit-) | `30000` | General operations timeout. | +| [`pulsar.stats.interval.seconds`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setStatsInterval-long-java.util.concurrent.TimeUnit-) | `60` | Pulsar client lib stats printing interval. 
| +| [`pulsar.num.io.threads`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setIoThreads-int-) | `1` | The number of Netty IO threads to use. | +| [`pulsar.connections.per.broker`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConnectionsPerBroker-int-) | `1` | The maximum number of connection to each broker. | +| [`pulsar.use.tcp.nodelay`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTcpNoDelay-boolean-) | `true` | TCP no-delay. | +| [`pulsar.concurrent.lookup.requests`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConcurrentLookupRequest-int-) | `50000` | The maximum number of concurrent topic lookups. | +| [`pulsar.max.number.rejected.request.per.connection`](/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setMaxNumberOfRejectedRequestPerConnection-int-) | `50` | The threshold of errors to forcefully close a connection. | +| [`pulsar.keepalive.interval.ms`](/api/client/org/apache/pulsar/client/api/ClientBuilder.html#keepAliveInterval-int-java.util.concurrent.TimeUnit-)| `30000` | Keep alive interval for each client-broker-connection. | + + +### Pulsar producer properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.producer.name`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setProducerName-java.lang.String-) | | Specify the producer name. | +| [`pulsar.producer.initial.sequence.id`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setInitialSequenceId-long-) | | Specify baseline for sequence ID of this producer. | +| [`pulsar.producer.max.pending.messages`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessages-int-) | `1000` | Set the maximum size of the message queue pending to receive an acknowledgment from the broker. 
| +| [`pulsar.producer.max.pending.messages.across.partitions`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessagesAcrossPartitions-int-) | `50000` | Set the maximum number of pending messages across all the partitions. | +| [`pulsar.producer.batching.enabled`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingEnabled-boolean-) | `true` | Control whether automatic batching of messages is enabled for the producer. | +| [`pulsar.producer.batching.max.messages`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingMaxMessages-int-) | `1000` | The maximum number of messages in a batch. | +| [`pulsar.block.if.producer.queue.full`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBlockIfQueueFull-boolean-) | | Specify the block producer if queue is full. | +| [`pulsar.crypto.reader.factory.class.name`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setCryptoKeyReader-org.apache.pulsar.client.api.CryptoKeyReader-) | | Specify the CryptoReader-Factory(`CryptoKeyReaderFactory`) classname which allows producer to create CryptoKeyReader. | + + +### Pulsar consumer Properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.consumer.name`](/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setConsumerName-java.lang.String-) | | Specify the consumer name. | +| [`pulsar.consumer.receiver.queue.size`](/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setReceiverQueueSize-int-) | 1000 | Set the size of the consumer receiver queue. 
| +| [`pulsar.consumer.acknowledgments.group.time.millis`](/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#acknowledgmentGroupTime-long-java.util.concurrent.TimeUnit-) | 100 | Set the maximum amount of group time for consumers to send the acknowledgments to the broker. | +| [`pulsar.consumer.total.receiver.queue.size.across.partitions`](/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setMaxTotalReceiverQueueSizeAcrossPartitions-int-) | 50000 | Set the maximum size of the total receiver queue across partitions. | +| [`pulsar.consumer.subscription.topics.mode`](/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#subscriptionTopicsMode-Mode-) | PersistentOnly | Set the subscription topic mode for consumers. | +| [`pulsar.crypto.reader.factory.class.name`](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setCryptoKeyReader-org.apache.pulsar.client.api.CryptoKeyReader-) | | Specify the CryptoReader-Factory(`CryptoKeyReaderFactory`) classname which allows consumer to create CryptoKeyReader. | diff --git a/site2/website/versioned_docs/version-2.10.x/adaptors-spark.md b/site2/website/versioned_docs/version-2.10.x/adaptors-spark.md new file mode 100644 index 0000000000000..e14f13b5d4b07 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/adaptors-spark.md @@ -0,0 +1,91 @@ +--- +id: adaptors-spark +title: Pulsar adaptor for Apache Spark +sidebar_label: "Apache Spark" +original_id: adaptors-spark +--- + +## Spark Streaming receiver +The Spark Streaming receiver for Pulsar is a custom receiver that enables Apache [Spark Streaming](https://spark.apache.org/streaming/) to receive raw data from Pulsar. + +An application can receive data in [Resilient Distributed Dataset](https://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds) (RDD) format via the Spark Streaming receiver and can process it in a variety of ways. 
+ +### Prerequisites + +To use the receiver, include a dependency for the `pulsar-spark` library in your Java configuration. + +#### Maven + +If you're using Maven, add this to your `pom.xml`: + +```xml + + +@pulsar:version@ + + + + org.apache.pulsar + pulsar-spark + ${pulsar.version} + + +``` + +#### Gradle + +If you're using Gradle, add this to your `build.gradle` file: + +```groovy + +def pulsarVersion = "@pulsar:version@" + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-spark', version: pulsarVersion +} + +``` + +### Usage + +Pass an instance of `SparkStreamingPulsarReceiver` to the `receiverStream` method in `JavaStreamingContext`: + +```java + + String serviceUrl = "pulsar://localhost:6650/"; + String topic = "persistent://public/default/test_src"; + String subs = "test_sub"; + + SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("Pulsar Spark Example"); + + JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(60)); + + ConsumerConfigurationData pulsarConf = new ConsumerConfigurationData(); + + Set set = new HashSet(); + set.add(topic); + pulsarConf.setTopicNames(set); + pulsarConf.setSubscriptionName(subs); + + SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver( + serviceUrl, + pulsarConf, + new AuthenticationDisabled()); + + JavaReceiverInputDStream lineDStream = jsc.receiverStream(pulsarReceiver); + +``` + +For a complete example, click [here](https://github.com/apache/pulsar-adapters/blob/master/examples/spark/src/main/java/org/apache/spark/streaming/receiver/example/SparkStreamingPulsarReceiverExample.java). In this example, the number of messages that contain the string "Pulsar" in received messages is counted. + +Note that if needed, other Pulsar authentication classes can be used. 
For example, in order to use a token during authentication the following parameters for the `SparkStreamingPulsarReceiver` constructor can be set: + +```java + +SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver( + serviceUrl, + pulsarConf, + new AuthenticationToken("token:")); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/adaptors-storm.md b/site2/website/versioned_docs/version-2.10.x/adaptors-storm.md new file mode 100644 index 0000000000000..76d507164777d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/adaptors-storm.md @@ -0,0 +1,96 @@ +--- +id: adaptors-storm +title: Pulsar adaptor for Apache Storm +sidebar_label: "Apache Storm" +original_id: adaptors-storm +--- + +Pulsar Storm is an adaptor for integrating with [Apache Storm](http://storm.apache.org/) topologies. It provides core Storm implementations for sending and receiving data. + +An application can inject data into a Storm topology via a generic Pulsar spout, as well as consume data from a Storm topology via a generic Pulsar bolt. + +## Using the Pulsar Storm Adaptor + +Include dependency for Pulsar Storm Adaptor: + +```xml + + + org.apache.pulsar + pulsar-storm + ${pulsar.version} + + +``` + +## Pulsar Spout + +The Pulsar Spout allows for the data published on a topic to be consumed by a Storm topology. It emits a Storm tuple based on the message received and the `MessageToValuesMapper` provided by the client. + +The tuples that fail to be processed by the downstream bolts will be re-injected by the spout with an exponential backoff, within a configurable timeout (the default is 60 seconds) or a configurable number of retries, whichever comes first, after which it is acknowledged by the consumer. 
Here's an example construction of a spout: + +```java + +MessageToValuesMapper messageToValuesMapper = new MessageToValuesMapper() { + + @Override + public Values toValues(Message msg) { + return new Values(new String(msg.getData())); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + // declare the output fields + declarer.declare(new Fields("string")); + } +}; + +// Configure a Pulsar Spout +PulsarSpoutConfiguration spoutConf = new PulsarSpoutConfiguration(); +spoutConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650"); +spoutConf.setTopic("persistent://my-property/usw/my-ns/my-topic1"); +spoutConf.setSubscriptionName("my-subscriber-name1"); +spoutConf.setMessageToValuesMapper(messageToValuesMapper); + +// Create a Pulsar Spout +PulsarSpout spout = new PulsarSpout(spoutConf); + +``` + +For a complete example, click [here](https://github.com/apache/pulsar-adapters/blob/master/pulsar-storm/src/test/java/org/apache/pulsar/storm/PulsarSpoutTest.java). + +## Pulsar Bolt + +The Pulsar bolt allows data in a Storm topology to be published on a topic. It publishes messages based on the Storm tuple received and the `TupleToMessageMapper` provided by the client. + +A partitioned topic can also be used to publish messages on different topics. In the implementation of the `TupleToMessageMapper`, a "key" will need to be provided in the message which will send the messages with the same key to the same topic. 
Here's an example bolt: + +```java + +TupleToMessageMapper tupleToMessageMapper = new TupleToMessageMapper() { + + @Override + public TypedMessageBuilder toMessage(TypedMessageBuilder msgBuilder, Tuple tuple) { + String receivedMessage = tuple.getString(0); + // message processing + String processedMsg = receivedMessage + "-processed"; + return msgBuilder.value(processedMsg.getBytes()); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + // declare the output fields + } +}; + +// Configure a Pulsar Bolt +PulsarBoltConfiguration boltConf = new PulsarBoltConfiguration(); +boltConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650"); +boltConf.setTopic("persistent://my-property/usw/my-ns/my-topic2"); +boltConf.setTupleToMessageMapper(tupleToMessageMapper); + +// Create a Pulsar Bolt +PulsarBolt bolt = new PulsarBolt(boltConf); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-brokers.md b/site2/website/versioned_docs/version-2.10.x/admin-api-brokers.md new file mode 100644 index 0000000000000..2674c7da875f9 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-brokers.md @@ -0,0 +1,286 @@ +--- +id: admin-api-brokers +title: Managing Brokers +sidebar_label: "Brokers" +original_id: admin-api-brokers +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more information, see [Pulsar admin doc](/tools/pulsar-admin/). +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). 
+ +Pulsar brokers consist of two components: + +1. An HTTP server exposing a {@inject: rest:REST:/} interface for administration and [topic](reference-terminology.md#topic) lookup. +2. A dispatcher that handles all Pulsar [message](reference-terminology.md#message) transfers. + +[Brokers](reference-terminology.md#broker) can be managed via: + +* The `brokers` command of the [`pulsar-admin`](/tools/pulsar-admin/) tool +* The `/admin/v2/brokers` endpoint of the admin {@inject: rest:REST:/} API +* The `brokers` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md) + +In addition to being configurable when you start them up, brokers can also be [dynamically configured](#dynamic-broker-configuration). + +> See the [Configuration](reference-configuration.md#broker) page for a full listing of broker-specific configuration parameters. + +## Brokers resources + +### List active brokers + +Fetch all available active brokers that are serving traffic in the specified cluster. + +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers list use + +``` + +``` + +broker1.use.org.com:8080 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/:cluster|operation/getActiveBrokers?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getActiveBrokers(clusterName) + +``` + + + + +```` + +### Get the information of the leader broker + +Fetch the information of the leader broker, for example, the service URL. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers leader-broker + +``` + +``` + +BrokerInfo(serviceUrl=broker1.use.org.com:8080) + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/leaderBroker|operation/getLeaderBroker?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getLeaderBroker() + +``` + +For details of the code above, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/BrokersImpl.java#L80). + + + + +```` + +#### List of namespaces owned by a given broker + +It finds all namespaces which are owned and served by a given broker. + +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers namespaces use \ + --url broker1.use.org.com:8080 + +``` + +```json + +{ + "my-property/use/my-ns/0x00000000_0xffffffff": { + "broker_assignment": "shared", + "is_controlled": false, + "is_active": true + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/:cluster/:broker/ownedNamespaces|operation/getOwnedNamespaes?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getOwnedNamespaces(cluster,brokerUrl); + +``` + + + + +```` + +### Dynamic broker configuration + +One way to configure a Pulsar [broker](reference-terminology.md#broker) is to supply a [configuration](reference-configuration.md#broker) when the broker is [started up](reference-cli-tools.md#pulsar-broker). + +But since all broker configuration in Pulsar is stored in ZooKeeper, configuration values can also be dynamically updated *while the broker is running*. When you update broker configuration dynamically, ZooKeeper will notify the broker of the change and the broker will then override any existing configuration values. 
+ +* The [`brokers`](reference-pulsar-admin.md#brokers) command for the [`pulsar-admin`](reference-pulsar-admin.md) tool has a variety of subcommands that enable you to manipulate a broker's configuration dynamically, enabling you to [update config values](#update-dynamic-configuration) and more. +* In the Pulsar admin {@inject: rest:REST:/} API, dynamic configuration is managed through the `/admin/v2/brokers/configuration` endpoint. + +### Update dynamic configuration + +````mdx-code-block + + + +The [`update-dynamic-config`](reference-pulsar-admin.md#brokers-update-dynamic-config) subcommand will update existing configuration. It takes two arguments: the name of the parameter and the new value using the `config` and `value` flag respectively. Here's an example for the [`brokerShutdownTimeoutMs`](reference-configuration.md#broker-brokerShutdownTimeoutMs) parameter: + +```shell + +$ pulsar-admin brokers update-dynamic-config --config brokerShutdownTimeoutMs --value 100 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/brokers/configuration/:configName/:configValue|operation/updateDynamicConfiguration?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().updateDynamicConfiguration(configName, configValue); + +``` + + + + +```` + +### List updated values + +Fetch a list of all potentially updatable configuration parameters. +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers list-dynamic-config +brokerShutdownTimeoutMs + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/configuration|operation/getDynamicConfigurationName?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getDynamicConfigurationNames(); + +``` + + + + +```` + +### List all + +Fetch a list of all parameters that have been dynamically updated. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers get-all-dynamic-config +brokerShutdownTimeoutMs:100 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/configuration/values|operation/getAllDynamicConfigurations?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getAllDynamicConfigurations(); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-clusters.md b/site2/website/versioned_docs/version-2.10.x/admin-api-clusters.md new file mode 100644 index 0000000000000..53cd43187e069 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-clusters.md @@ -0,0 +1,318 @@ +--- +id: admin-api-clusters +title: Managing Clusters +sidebar_label: "Clusters" +original_id: admin-api-clusters +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +Pulsar clusters consist of one or more Pulsar [brokers](reference-terminology.md#broker), one or more [BookKeeper](reference-terminology.md#bookkeeper) +servers (aka [bookies](reference-terminology.md#bookie)), and a [ZooKeeper](https://zookeeper.apache.org) cluster that provides configuration and coordination management. 
+ +Clusters can be managed via: + +* The `clusters` command of the [`pulsar-admin`](/tools/pulsar-admin/) tool +* The `/admin/v2/clusters` endpoint of the admin {@inject: rest:REST:/} API +* The `clusters` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md) + +## Clusters resources + +### Provision + +New clusters can be provisioned using the admin interface. + +> Please note that this operation requires superuser privileges. + +````mdx-code-block + + + +You can provision a new cluster using the [`create`](reference-pulsar-admin.md#clusters-create) subcommand. Here's an example: + +```shell + +$ pulsar-admin clusters create cluster-1 \ + --url http://my-cluster.org.com:8080 \ + --broker-url pulsar://my-cluster.org.com:6650 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/clusters/:cluster|operation/createCluster?version=@pulsar:version_number@} + + + + +```java + +ClusterData clusterData = new ClusterData( + serviceUrl, + serviceUrlTls, + brokerServiceUrl, + brokerServiceUrlTls +); +admin.clusters().createCluster(clusterName, clusterData); + +``` + + + + +```` + +### Initialize cluster metadata + +When provisioning a new cluster, you need to initialize that cluster's [metadata](concepts-architecture-overview.md#metadata-store). When initializing cluster metadata, you need to specify all of the following: + +* The name of the cluster +* The local metadata store connection string for the cluster +* The configuration store connection string for the entire instance +* The web service URL for the cluster +* A broker service URL enabling interaction with the [brokers](reference-terminology.md#broker) in the cluster + +You must initialize cluster metadata *before* starting up any [brokers](admin-api-brokers.md) that will belong to the cluster. 
+ +> **No cluster metadata initialization through the REST API or the Java admin API** +> +> Unlike most other admin functions in Pulsar, cluster metadata initialization cannot be performed via the admin REST API +> or the admin Java client, as metadata initialization involves communicating with ZooKeeper directly. +> Instead, you can use the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool, in particular +> the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command. + +Here's an example cluster metadata initialization command: + +```shell + +bin/pulsar initialize-cluster-metadata \ + --cluster us-west \ + --metadata-store zk:zk1.us-west.example.com:2181,zk2.us-west.example.com:2181/my-chroot-path \ + --configuration-metadata-store zk:zk1.us-west.example.com:2181,zk2.us-west.example.com:2181/my-chroot-path \ + --web-service-url http://pulsar.us-west.example.com:8080/ \ + --web-service-url-tls https://pulsar.us-west.example.com:8443/ \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/ + +``` + +You'll need to use `--*-tls` flags only if you're using [TLS authentication](security-tls-authentication.md) in your instance. + +### Get configuration + +You can fetch the [configuration](reference-configuration.md) for an existing cluster at any time. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#clusters-get) subcommand and specify the name of the cluster. 
Here's an example: + +```shell + +$ pulsar-admin clusters get cluster-1 +{ + "serviceUrl": "http://my-cluster.org.com:8080/", + "serviceUrlTls": null, + "brokerServiceUrl": "pulsar://my-cluster.org.com:6650/", + "brokerServiceUrlTls": null, + "peerClusterNames": null +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/clusters/:cluster|operation/getCluster?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().getCluster(clusterName); + +``` + + + + +```` + +### Update + +You can update the configuration for an existing cluster at any time. + +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#clusters-update) subcommand and specify new configuration values using flags. + +```shell + +$ pulsar-admin clusters update cluster-1 \ + --url http://my-cluster.org.com:4081 \ + --broker-url pulsar://my-cluster.org.com:3350 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/clusters/:cluster|operation/updateCluster?version=@pulsar:version_number@} + + + + +```java + +ClusterData clusterData = new ClusterData( + serviceUrl, + serviceUrlTls, + brokerServiceUrl, + brokerServiceUrlTls +); +admin.clusters().updateCluster(clusterName, clusterData); + +``` + + + + +```` + +### Delete + +Clusters can be deleted from a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#clusters-delete) subcommand and specify the name of the cluster. + +``` + +$ pulsar-admin clusters delete cluster-1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/clusters/:cluster|operation/deleteCluster?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().deleteCluster(clusterName); + +``` + + + + +```` + +### List + +You can fetch a list of all clusters in a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#clusters-list) subcommand. 
+ +```shell + +$ pulsar-admin clusters list +cluster-1 +cluster-2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/clusters|operation/getClusters?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().getClusters(); + +``` + + + + +```` + +### Update peer-cluster data + +Peer clusters can be configured for a given cluster in a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`update-peer-clusters`](reference-pulsar-admin.md#clusters-update-peer-clusters) subcommand and specify the list of peer-cluster names. + +``` + +$ pulsar-admin clusters update-peer-clusters cluster-1 --peer-clusters cluster-2 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/clusters/:cluster/peers|operation/setPeerClusterNames?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().updatePeerClusterNames(clusterName, peerClusterList); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-functions.md b/site2/website/versioned_docs/version-2.10.x/admin-api-functions.md new file mode 100644 index 0000000000000..8274a21d68008 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-functions.md @@ -0,0 +1,830 @@ +--- +id: admin-api-functions +title: Manage Functions +sidebar_label: "Functions" +original_id: admin-api-functions +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. 
+> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics +* apply a user-supplied processing logic to each message +* publish the results of the computation to another topic + +Functions can be managed via the following methods. + +Method | Description +---|--- +**Admin CLI** | The `functions` command of the [`pulsar-admin`](/tools/pulsar-admin/) tool. +**REST API** |The `/admin/v3/functions` endpoint of the admin {@inject: rest:REST:/} API. +**Java Admin API**| The `functions` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md). + +## Function resources + +You can perform the following operations on functions. + +### Create a function + +You can create a Pulsar function in cluster mode (deploy it on a Pulsar cluster) using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#functions-create) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --inputs test-input-topic \ + --output persistent://public/default/test-output-topic \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --jar /examples/api-examples.jar + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName|operation/registerFunction?version=@pulsar:version_number@} + + + + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +functionConfig.setProcessingGuarantees(FunctionConfig.ProcessingGuarantees.ATLEAST_ONCE); +functionConfig.setTopicsPattern(sourceTopicPattern); +functionConfig.setSubName(subscriptionName); +functionConfig.setAutoAck(true); +functionConfig.setOutput(sinkTopic); +admin.functions().createFunction(functionConfig, fileName); + +``` + + + + +```` + +### Update a function + +You can update a Pulsar function that has been deployed to a Pulsar cluster using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#functions-update) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions update \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --output persistent://public/default/update-output-topic \ + # other options + +``` + + + + +{@inject: endpoint|PUT|/admin/v3/functions/:tenant/:namespace/:functionName|operation/updateFunction?version=@pulsar:version_number@} + + + + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +UpdateOptions updateOptions = new UpdateOptions(); +updateOptions.setUpdateAuthData(updateAuthData); +admin.functions().updateFunction(functionConfig, userCodeFile, updateOptions); + +``` + + + + +```` + +### Start an instance of a function + +You can start a stopped function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. + +```shell + +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/start|operation/startFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().startFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Start all instances of a function + +You can start all stopped function instances using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/start|operation/startFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().startFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Stop an instance of a function + +You can stop a function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/stop|operation/stopFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().stopFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Stop all instances of a function + +You can stop all function instances using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/stop|operation/stopFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().stopFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Restart an instance of a function + +Restart a function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/restart|operation/restartFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().restartFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Restart all instances of a function + +You can restart all function instances using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/restart|operation/restartFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().restartFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### List all functions + +You can list all Pulsar functions running under a specific tenant and namespace using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#functions-list) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions list \ + --tenant public \ + --namespace default + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace|operation/listFunctions?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctions(tenant, namespace); + +``` + + + + +```` + +### Delete a function + +You can delete a Pulsar function that is running on a Pulsar cluster using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#functions-delete) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions delete \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|DELETE|/admin/v3/functions/:tenant/:namespace/:functionName|operation/deregisterFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().deleteFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Get info about a function + +You can get information about a Pulsar function currently running in cluster mode using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#functions-get) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName|operation/getFunctionInfo?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Get status of an instance of a function + +You can get the current status of a Pulsar function instance with `instance-id` using Admin CLI, REST API or Java Admin API. +````mdx-code-block + + + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/status|operation/getFunctionInstanceStatus?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStatus(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Get status of all instances of a function + +You can get the current status of a Pulsar function instance using Admin CLI, REST API or Java Admin API. 
+ +````mdx-code-block + + + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/status|operation/getFunctionStatus?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStatus(tenant, namespace, functionName); + +``` + + + + +```` + +### Get stats of an instance of a function + +You can get the current stats of a Pulsar Function instance with `instance-id` using Admin CLI, REST API or Java admin API. +````mdx-code-block + + + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/stats|operation/getFunctionInstanceStats?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStats(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Get stats of all instances of a function + +You can get the current stats of a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/stats|operation/getFunctionStats?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStats(tenant, namespace, functionName); + +``` + + + + +```` + +### Trigger a function + +You can trigger a specified Pulsar function with a supplied value using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`trigger`](reference-pulsar-admin.md#functions-trigger) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --topic (the name of input topic) \ + --trigger-value \"hello pulsar\" + # or --trigger-file (the path of trigger file) + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/trigger|operation/triggerFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().triggerFunction(tenant, namespace, functionName, topic, triggerValue, triggerFile); + +``` + + + + +```` + +### Put state associated with a function + +You can put the state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`putstate`](reference-pulsar-admin.md#functions-putstate) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions putstate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --state "{\"key\":\"pulsar\", \"stringValue\":\"hello pulsar\"}" + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/state/:key|operation/putFunctionState?version=@pulsar:version_number@} + + + + +```java + +TypeReference typeRef = new TypeReference() {}; +FunctionState stateRepr = ObjectMapperFactory.getThreadLocal().readValue(state, typeRef); +admin.functions().putFunctionState(tenant, namespace, functionName, stateRepr); + +``` + + + + +```` + +### Fetch state associated with a function + +You can fetch the current state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`querystate`](reference-pulsar-admin.md#functions-querystate) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions querystate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --key (the key of state) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/state/:key|operation/getFunctionState?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionState(tenant, namespace, functionName, key); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-namespaces.md b/site2/website/versioned_docs/version-2.10.x/admin-api-namespaces.md new file mode 100644 index 0000000000000..eb8017a1142d0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-namespaces.md @@ -0,0 +1,1267 @@ +--- +id: admin-api-namespaces +title: Managing Namespaces +sidebar_label: "Namespaces" +original_id: admin-api-namespaces +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some 
frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more information, see [Pulsar admin doc](/tools/pulsar-admin/). +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +Pulsar [namespaces](reference-terminology.md#namespace) are logical groupings of [topics](reference-terminology.md#topic). + +Namespaces can be managed via: + +* The `namespaces` command of the [`pulsar-admin`](/tools/pulsar-admin/) tool +* The `/admin/v2/namespaces` endpoint of the admin {@inject: rest:REST:/} API +* The `namespaces` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md) + +## Namespaces resources + +### Create namespaces + +You can create new namespaces under a given [tenant](reference-terminology.md#tenant). + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#namespaces-create) subcommand and specify the namespace by name: + +```shell + +$ pulsar-admin namespaces create test-tenant/test-namespace + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace|operation/createNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().createNamespace(namespace); + +``` + + + + +```` + +### Get policies + +You can fetch the current policies associated with a namespace at any time. 
+ +````mdx-code-block + + + +Use the [`policies`](reference-pulsar-admin.md#namespaces-policies) subcommand and specify the namespace: + +```shell + +$ pulsar-admin namespaces policies test-tenant/test-namespace +{ + "auth_policies": { + "namespace_auth": {}, + "destination_auth": {} + }, + "replication_clusters": [], + "bundles_activated": true, + "bundles": { + "boundaries": [ + "0x00000000", + "0xffffffff" + ], + "numBundles": 1 + }, + "backlog_quota_map": {}, + "persistence": null, + "latency_stats_sample_rate": {}, + "message_ttl_in_seconds": 0, + "retention_policies": null, + "deleted": false +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace|operation/getPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPolicies(namespace); + +``` + + + + +```` + +### List namespaces + +You can list all namespaces within a given Pulsar [tenant](reference-terminology.md#tenant). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#namespaces-list) subcommand and specify the tenant: + +```shell + +$ pulsar-admin namespaces list test-tenant +test-tenant/ns1 +test-tenant/ns2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant|operation/getTenantNamespaces?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaces(tenant); + +``` + + + + +```` + +### Delete namespaces + +You can delete existing namespaces from a tenant. 
+ +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#namespaces-delete) subcommand and specify the namespace: + +```shell + +$ pulsar-admin namespaces delete test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace|operation/deleteNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().deleteNamespace(namespace); + +``` + + + + +```` + +### Configure replication clusters + +#### Set replication cluster + +You can set replication clusters for a namespace to enable Pulsar to internally replicate the published messages from one colocation facility to another. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-clusters test-tenant/ns1 \ + --clusters cl1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/replication|operation/setNamespaceReplicationClusters?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceReplicationClusters(namespace, clusters); + +``` + + + + +```` + +#### Get replication cluster + +You can get the list of replication clusters for a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-clusters test-tenant/cl1/ns1 + +``` + +``` + +cl2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/replication|operation/getNamespaceReplicationClusters?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceReplicationClusters(namespace) + +``` + + + + +```` + +### Configure backlog quota policies + +#### Set backlog quota policies + +Backlog quota helps the broker to restrict bandwidth/storage of a namespace once it reaches a certain threshold limit. Admin can set the limit and take corresponding action after the limit is reached. + + 1. producer_request_hold: broker holds but not persists produce request payload + + 2. producer_exception: broker disconnects with the client by giving an exception + + 3. 
consumer_backlog_eviction: broker starts discarding backlog messages + +Backlog quota restriction can be taken care by defining restriction of backlog-quota-type: destination_storage. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-backlog-quota --limit 10G --limitTime 36000 --policy producer_request_hold test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/setBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setBacklogQuota(namespace, new BacklogQuota(limit, limitTime, policy)) + +``` + + + + +```` + +#### Get backlog quota policies + +You can get a configured backlog quota for a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-backlog-quotas test-tenant/ns1 + +``` + +```json + +{ + "destination_storage": { + "limit": 10, + "policy": "producer_request_hold" + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getBacklogQuotaMap(namespace); + +``` + + + + +```` + +#### Remove backlog quota policies + +You can remove backlog quota policies for a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-backlog-quota test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/removeBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeBacklogQuota(namespace, backlogQuotaType) + +``` + + + + +```` + +### Configure persistence policies + +#### Set persistence policies + +Persistence policies allow users to configure persistency-level for all topic messages under a given namespace. 
+ + - Bookkeeper-ack-quorum: Number of acks (guaranteed copies) to wait for each entry, default: 0 + + - Bookkeeper-ensemble: Number of bookies to use for a topic, default: 0 + + - Bookkeeper-write-quorum: How many writes to make of each entry, default: 0 + + - Ml-mark-delete-max-rate: Throttling rate of mark-delete operation (0 means no throttle), default: 0.0 + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-persistence --bookkeeper-ack-quorum 2 --bookkeeper-ensemble 3 --bookkeeper-write-quorum 2 --ml-mark-delete-max-rate 0 test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/setPersistence?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setPersistence(namespace,new PersistencePolicies(bookkeeperEnsemble, bookkeeperWriteQuorum,bookkeeperAckQuorum,managedLedgerMaxMarkDeleteRate)) + +``` + + + + +```` + +#### Get persistence policies + +You can get the configured persistence policies of a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-persistence test-tenant/ns1 + +``` + +```json + +{ + "bookkeeperEnsemble": 3, + "bookkeeperWriteQuorum": 2, + "bookkeeperAckQuorum": 2, + "managedLedgerMaxMarkDeleteRate": 0 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/getPersistence?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPersistence(namespace) + +``` + + + + +```` + +### Configure namespace bundles + +#### Unload namespace bundles + +The namespace bundle is a virtual group of topics which belong to the same namespace. If the broker gets overloaded with the number of bundles, this command can help unload a bundle from that broker, so it can be served by some other less-loaded brokers. The namespace bundle ID ranges from 0x00000000 to 0xffffffff. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces unload --bundle 0x00000000_0xffffffff test-tenant/ns1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/:bundle/unload|operation/unloadNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().unloadNamespaceBundle(namespace, bundle) + +``` + + + + +```` + +#### Split namespace bundles + +One namespace bundle can contain multiple topics but can be served by only one broker. If a single bundle is creating an excessive load on a broker, an admin can split the bundle using the command below, permitting one or more of the new bundles to be unloaded, thus balancing the load across the brokers. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces split-bundle --bundle 0x00000000_0xffffffff test-tenant/ns1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/:bundle/split|operation/splitNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().splitNamespaceBundle(namespace, bundle) + +``` + + + + +```` + +### Configure message TTL + +#### Set message-ttl + +You can configure the time to live (in seconds) duration for messages. In the example below, the message-ttl is set as 100s. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-message-ttl --messageTTL 100 test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/setNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceMessageTTL(namespace, messageTTL) + +``` + + + + +```` + +#### Get message-ttl + +When the message-ttl for a namespace is set, you can use the command below to get the configured value. This example continues the example of the command `set-message-ttl`, so the returned value is 100(s). 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-message-ttl test-tenant/ns1 + +``` + +``` + +100 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/getNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceMessageTTL(namespace) + +``` + +``` + +100 + +``` + + + + +```` + +#### Remove message-ttl + +Remove the message TTL configured for a namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-message-ttl test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/removeNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeNamespaceMessageTTL(namespace) + +``` + + + + +```` + + +### Clear backlog + +#### Clear namespace backlog + +It clears all message backlog for all the topics that belong to a specific namespace. You can also clear the backlog for a specific subscription. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces clear-backlog --sub my-subscription test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/clearBacklog|operation/clearNamespaceBacklogForSubscription?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().clearNamespaceBacklogForSubscription(namespace, subscription) + +``` + + + + +```` + +#### Clear bundle backlog + +It clears all message backlog for all the topics that belong to a specific NamespaceBundle. You can also clear the backlog for a specific subscription. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces clear-backlog --bundle 0x00000000_0xffffffff --sub my-subscription test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/:bundle/clearBacklog|operation/clearNamespaceBundleBacklogForSubscription?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().clearNamespaceBundleBacklogForSubscription(namespace, bundle, subscription) + +``` + + + + +```` + +### Configure retention + +#### Set retention + +Each namespace contains multiple topics and the retention size (storage size) of each topic should not exceed a specific threshold or it should be stored for a certain period. This command helps configure the retention size and time of topics in a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-retention --size 100 --time 10 test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/retention|operation/setRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setRetention(namespace, new RetentionPolicies(retentionTimeInMin, retentionSizeInMB)) + +``` + + + + +```` + +#### Get retention + +It shows retention information of a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-retention test-tenant/ns1 + +``` + +```json + +{ + "retentionTimeInMinutes": 10, + "retentionSizeInMB": 100 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/retention|operation/getRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getRetention(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for topics + +#### Set dispatch throttling for topics + +It sets message dispatch rate for all the topics under a given namespace. 
+The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X seconds (`byte-dispatch-rate`). +The period X is specified in seconds and can be configured with `dispatch-rate-period`. The default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1, which +disables the throttling. + +:::note + +- If neither `clusterDispatchRate` nor `topicDispatchRate` is configured, dispatch throttling is disabled. +- If `topicDispatchRate` is not configured, `clusterDispatchRate` takes effect. +- If `topicDispatchRate` is configured, `topicDispatchRate` takes effect. + +::: + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/dispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for topics + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/dispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getDispatchRate(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for subscription + +#### Set dispatch throttling for subscription + +It sets the message dispatch rate for all the subscriptions of topics under a given namespace. 
+The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X seconds (`byte-dispatch-rate`). +The period X is specified in seconds and can be configured with `dispatch-rate-period`. The default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1, which +disables the throttling. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-subscription-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/subscriptionDispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setSubscriptionDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for subscription + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-subscription-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/subscriptionDispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getSubscriptionDispatchRate(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for replicator + +#### Set dispatch throttling for replicator + +It sets the message dispatch rate for all the replicators between replication clusters under a given namespace. +The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X seconds (`byte-dispatch-rate`). 
+The period X is specified in seconds and can be configured with `dispatch-rate-period`. The default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1, which +disables the throttling. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-replicator-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/replicatorDispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setReplicatorDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for replicator + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-replicator-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/replicatorDispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getReplicatorDispatchRate(namespace) + +``` + + + + +```` + +### Configure deduplication snapshot interval + +#### Get deduplication snapshot interval + +It shows configured `deduplicationSnapshotInterval` for a namespace (Each topic under the namespace will take a deduplication snapshot according to this interval) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-deduplication-snapshot-interval test-tenant/ns1 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/getDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + 
+admin.namespaces().getDeduplicationSnapshotInterval(namespace) + +``` + + + + +```` + +#### Set deduplication snapshot interval + +Set configured `deduplicationSnapshotInterval` for a namespace. Each topic under the namespace will take a deduplication snapshot according to this interval. +`brokerDeduplicationEnabled` must be set to `true` for this property to take effect. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-deduplication-snapshot-interval test-tenant/ns1 --interval 1000 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/setDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setDeduplicationSnapshotInterval(namespace, 1000) + +``` + + + + +```` + +#### Remove deduplication snapshot interval + +Remove configured `deduplicationSnapshotInterval` of a namespace (Each topic under the namespace will take a deduplication snapshot according to this interval) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-deduplication-snapshot-interval test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/deleteDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeDeduplicationSnapshotInterval(namespace) + +``` + + + + +```` + +### Namespace isolation + +You can use the [Pulsar isolation policy](administration-isolation.md) to allocate resources (broker and bookie) for a namespace. + +### Unload namespaces from a broker + +You can unload a namespace, or a [namespace bundle](reference-terminology.md#namespace-bundle), from the Pulsar [broker](reference-terminology.md#broker) that is currently responsible for it. + +#### pulsar-admin + +Use the [`unload`](reference-pulsar-admin.md#unload) subcommand of the [`namespaces`](reference-pulsar-admin.md#namespaces) command. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin namespaces unload my-tenant/my-ns + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/unload|operation/unloadNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().unload(namespace) + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-non-partitioned-topics.md b/site2/website/versioned_docs/version-2.10.x/admin-api-non-partitioned-topics.md new file mode 100644 index 0000000000000..e6347bb8c363a --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-non-partitioned-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-non-partitioned-topics +title: Managing non-partitioned topics +sidebar_label: "Non-partitioned topics" +original_id: admin-api-non-partitioned-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-non-persistent-topics.md b/site2/website/versioned_docs/version-2.10.x/admin-api-non-persistent-topics.md new file mode 100644 index 0000000000000..3126a6494c715 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-non-persistent-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-non-persistent-topics +title: Managing non-persistent topics +sidebar_label: "Non-Persistent topics" +original_id: admin-api-non-persistent-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-overview.md b/site2/website/versioned_docs/version-2.10.x/admin-api-overview.md new file mode 100644 index 0000000000000..408f1943fff18 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-overview.md @@ -0,0 +1,144 @@ +--- +id: admin-api-overview +title: Pulsar admin interface +sidebar_label: "Overview" +original_id: admin-api-overview +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +The Pulsar admin interface enables you to manage all important entities in a Pulsar instance, such as tenants, topics, and namespaces. + +You can interact with the admin interface via: + +- The `pulsar-admin` CLI tool, which is available in the `bin` folder of your Pulsar installation: + + ```shell + + bin/pulsar-admin + + ``` + + > **Important** + > + > For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more information, see [Pulsar admin doc](/tools/pulsar-admin/). + +- HTTP calls, which are made against the admin {@inject: rest:REST:/} API provided by Pulsar brokers. For some RESTful APIs, they might be redirected to the owner brokers for serving with [`307 Temporary Redirect`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/307), hence the HTTP callers should handle `307 Temporary Redirect`. If you use `curl` commands, you should specify `-L` to handle redirections. + + > **Important** + > + > For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. + +- A Java client interface. + + > **Important** + > + > For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +> **The REST API is the admin interface**. 
Both the `pulsar-admin` CLI tool and the Java client use the REST API. If you implement your own admin interface client, you should use the REST API. + +## Admin setup + +Each of the three admin interfaces (the `pulsar-admin` CLI tool, the {@inject: rest:REST:/} API, and the [Java admin API](/api/admin)) requires some special setup if you have enabled authentication in your Pulsar instance. + +````mdx-code-block + + + +If you have enabled authentication, you need to provide an auth configuration to use the `pulsar-admin` tool. By default, the configuration for the `pulsar-admin` tool is in the [`conf/client.conf`](reference-configuration.md#client) file. The following are the available parameters: + +|Name|Description|Default| +|----|-----------|-------| +|webServiceUrl|The web URL for the cluster.|http://localhost:8080/| +|brokerServiceUrl|The Pulsar protocol URL for the cluster.|pulsar://localhost:6650/| +|authPlugin|The authentication plugin.| | +|authParams|The authentication parameters for the cluster, as a comma-separated string.| | +|useTls|Whether or not TLS authentication will be enforced in the cluster.|false| +|tlsAllowInsecureConnection|Accept untrusted TLS certificate from client.|false| +|tlsTrustCertsFilePath|Path for the trusted TLS certificate file.| | + + + + +You can find details for the REST API exposed by Pulsar brokers in this {@inject: rest:document:/}. + + + + +To use the Java admin API, instantiate a {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object, and specify a URL for a Pulsar broker and a {@inject: javadoc:PulsarAdminBuilder:/admin/org/apache/pulsar/client/admin/PulsarAdminBuilder}. 
The following is a minimal example using `localhost`: + +```java + +String url = "http://localhost:8080"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); + +``` + +If you use multiple brokers, you can use multi-host like Pulsar service. For example, + +```java + +String url = "http://localhost:8080,localhost:8081,localhost:8082"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); + +``` + + + + +```` + +## How to define Pulsar resource names when running Pulsar in Kubernetes +If you run Pulsar Functions or connectors on Kubernetes, you need to follow Kubernetes naming convention to define the names of your Pulsar resources, whichever admin interface you use. + +Kubernetes requires a name that can be used as a DNS subdomain name as defined in [RFC 1123](https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names). Pulsar supports more legal characters than Kubernetes naming convention. 
If you create a Pulsar resource name with special characters that are not supported by Kubernetes (for example, including colons in a Pulsar namespace name), Kubernetes runtime translates the Pulsar object names into Kubernetes resource labels which are in RFC 1123-compliant forms. Consequently, you can run functions or connectors using Kubernetes runtime. The rules for translating Pulsar object names into Kubernetes resource labels are as below: + +- Truncate to 63 characters + +- Replace the following characters with dashes (-): + + - Non-alphanumeric characters + + - Underscores (_) + + - Dots (.) + +- Replace beginning and ending non-alphanumeric characters with 0 + +:::tip + +- If you get an error in translating Pulsar object names into Kubernetes resource labels (for example, you may have a naming collision if your Pulsar object name is too long) or want to customize the translating rules, see [customize Kubernetes runtime](functions-runtime.md#customize-kubernetes-runtime). +- For how to configure Kubernetes runtime, see [here](functions-runtime.md#configure-kubernetes-runtime). + +::: + diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-packages.md b/site2/website/versioned_docs/version-2.10.x/admin-api-packages.md new file mode 100644 index 0000000000000..608dfb7587daf --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-packages.md @@ -0,0 +1,390 @@ +--- +id: admin-api-packages +title: Manage packages +sidebar_label: "Packages" +original_id: admin-api-packages +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](/tools/pulsar-admin/). 
+> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +Package managers or package-management systems automatically manage packages in a consistent manner. These tools simplify the installation tasks, upgrade process, and deletion operations for users. A package is a minimal unit that a package manager deals with. In Pulsar, packages are organized at the tenant- and namespace-level to manage Pulsar Functions and Pulsar IO connectors (i.e., source and sink). + +## What is a package? + +A package is a set of elements that the user would like to reuse in later operations. In Pulsar, a package can be a group of functions, sources, and sinks. You can define a package according to your needs. + +The package management system in Pulsar stores the data and metadata of each package (as shown in the table below) and tracks the package versions. + +|Metadata|Description| +|--|--| +|description|The description of the package.| +|contact|The contact information of a package. For example, an email address of the developer team.| +|create_time|The time when the package is created.| +|modification_time|The time when the package is lastly modified.| +|properties|A user-defined key/value map to store other information.| + +## How to use a package + +Packages can efficiently use the same set of functions and IO connectors. For example, you can use the same function, source, and sink in multiple namespaces. The main steps are: + +1. Create a package in the package manager by providing the following information: type, tenant, namespace, package name, and version. 
+ + |Component|Description| + |-|-| + |type|Specify one of the supported package types: function, sink and source.| + |tenant|Specify the tenant where you want to create the package.| + |namespace|Specify the namespace where you want to create the package.| + |name|Specify the complete name of the package, using the format `<tenant>/<namespace>/<package-name>`.| + |version|Specify the version of the package using the format `MajorVersion.MinorVersion` in numerals.| + + The information you provide creates a URL for a package, in the format `<type>://<tenant>/<namespace>/<package-name>@<version>`. + +2. Upload the elements to the package, i.e., the functions, sources, and sinks that you want to use across namespaces. + +3. Apply permissions to this package from various namespaces. + +Now, you can use the elements you defined in the package by calling this package from within the package manager. The package manager locates it by the URL. For example, + +``` + +sink://public/default/mysql-sink@1.0 +function://my-tenant/my-ns/my-function@0.1 +source://my-tenant/my-ns/mysql-cdc-source@2.3 + +``` + +## Package management in Pulsar + +You can use the command line tools, REST API, or the Java client to manage your package resources in Pulsar. More specifically, you can use these tools to [upload](#upload-a-package), [download](#download-a-package), and [delete](#delete-a-package) a package, [get the metadata](#get-the-metadata-of-a-package) and [update the metadata](#update-the-metadata-of-a-package) of a package, [get the versions](#list-all-versions-of-a-package) of a package, and [get all packages of a specific type under a namespace](#list-all-packages-of-a-specific-type-under-a-namespace). + +### Upload a package + +You can use the following commands to upload a package. 
+ +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages upload function://public/default/example@v0.1 --path package-file --description package-description + +``` + + + + +{@inject: endpoint|POST|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/upload?version=@pulsar:version_number@} + + + + +Upload a package to the package management service synchronously. + +```java + + void upload(PackageMetadata metadata, String packageName, String path) throws PulsarAdminException; + +``` + +Upload a package to the package management service asynchronously. + +```java + + CompletableFuture<Void> uploadAsync(PackageMetadata metadata, String packageName, String path); + +``` + + + + +```` + +### Download a package + +You can use the following commands to download a package. + +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages download function://public/default/example@v0.1 --path package-file + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/download?version=@pulsar:version_number@} + + + + +Download a package from the package management service synchronously. + +```java + + void download(String packageName, String path) throws PulsarAdminException; + +``` + +Download a package from the package management service asynchronously. + +```java + + CompletableFuture<Void> downloadAsync(String packageName, String path); + +``` + + + + +```` + +### Delete a package + +You can use the following commands to delete a package. + +````mdx-code-block + + + +The following command deletes a package of version 0.1. + +```shell + +bin/pulsar-admin packages delete function://public/default/example@v0.1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/delete?version=@pulsar:version_number@} + + + + +Delete a specified package synchronously. 
+ +```java + + void delete(String packageName) throws PulsarAdminException; + +``` + +Delete a specified package asynchronously. + +```java + + CompletableFuture<Void> deleteAsync(String packageName); + +``` + + + + +```` + +### Get the metadata of a package + +You can use the following commands to get the metadata of a package. + +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages get-metadata function://public/default/test@v1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version/metadata|operation/getMeta?version=@pulsar:version_number@} + + + + +Get the metadata of a package synchronously. + +```java + + PackageMetadata getMetadata(String packageName) throws PulsarAdminException; + +``` + +Get the metadata of a package asynchronously. + +```java + + CompletableFuture<PackageMetadata> getMetadataAsync(String packageName); + +``` + + + + +```` + +### Update the metadata of a package + +You can use the following commands to update the metadata of a package. + +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages update-metadata function://public/default/example@v0.1 --description update-description + +``` + + + + +{@inject: endpoint|PUT|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version/metadata|operation/updateMeta?version=@pulsar:version_number@} + + + + +Update the metadata of a package synchronously. + +```java + + void updateMetadata(String packageName, PackageMetadata metadata) throws PulsarAdminException; + +``` + +Update the metadata of a package asynchronously. + +```java + + CompletableFuture<Void> updateMetadataAsync(String packageName, PackageMetadata metadata); + +``` + + + + +```` + +### List all versions of a package + +You can use the following commands to list all versions of a package. 
+ +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages list-versions type://tenant/namespace/packageName + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName|operation/listPackageVersion?version=@pulsar:version_number@} + + + + +List all versions of a package synchronously. + +```java + + List<String> listPackageVersions(String packageName) throws PulsarAdminException; + +``` + +List all versions of a package asynchronously. + +```java + + CompletableFuture<List<String>> listPackageVersionsAsync(String packageName); + +``` + + + + +```` + +### List all packages of a specific type under a namespace + +You can use the following commands to list all packages of a specific type under a namespace. + +````mdx-code-block + + + + +```shell + +bin/pulsar-admin packages list --type function public/default + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace|operation/listPackages?version=@pulsar:version_number@} + + + + +List all packages of a specific type under a namespace synchronously. + +```java + + List<PackageMetadata> listPackages(String type, String namespace) throws PulsarAdminException; + +``` + +List all packages of a specific type under a namespace asynchronously. + +```java + + CompletableFuture<List<PackageMetadata>> listPackagesAsync(String type, String namespace); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-partitioned-topics.md b/site2/website/versioned_docs/version-2.10.x/admin-api-partitioned-topics.md new file mode 100644 index 0000000000000..5ce182282e032 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-partitioned-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-partitioned-topics +title: Managing partitioned topics +sidebar_label: "Partitioned topics" +original_id: admin-api-partitioned-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-permissions.md b/site2/website/versioned_docs/version-2.10.x/admin-api-permissions.md new file mode 100644 index 0000000000000..5ace9d573bdaa --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-permissions.md @@ -0,0 +1,189 @@ +--- +id: admin-api-permissions +title: Managing permissions +sidebar_label: "Permissions" +original_id: admin-api-permissions +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](/tools/pulsar-admin/). +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +Pulsar allows you to grant namespace-level or topic-level permission to users. + +- If you grant a namespace-level permission to a user, then the user can access all the topics under the namespace. + +- If you grant a topic-level permission to a user, then the user can access only the topic. + +The chapters below demonstrate how to grant namespace-level permissions to users. For how to grant topic-level permissions to users, see [manage topics](admin-api-topics.md#grant-permission). + +## Grant permissions + +You can grant permissions to specific roles for lists of operations such as `produce` and `consume`. 
+ +````mdx-code-block + + + +Use the [`grant-permission`](reference-pulsar-admin.md#grant-permission) subcommand and specify a namespace, actions using the `--actions` flag, and a role using the `--role` flag: + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role admin10 + +``` + +Wildcard authorization can be performed when `authorizationAllowWildcardsMatching` is set to `true` in `broker.conf`. + +e.g. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role 'my.role.*' + +``` + +Then, roles `my.role.1`, `my.role.2`, `my.role.foo`, `my.role.bar`, etc. can produce and consume. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role '*.role.my' + +``` + +Then, roles `1.role.my`, `2.role.my`, `foo.role.my`, `bar.role.my`, etc. can produce and consume. + +**Note**: A wildcard matching works at **the beginning or end of the role name only**. + +e.g. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role 'my.*.role' + +``` + +In this case, only the role `my.*.role` has permissions. +Roles `my.1.role`, `my.2.role`, `my.foo.role`, `my.bar.role`, etc. **cannot** produce and consume. + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/permissions/:role|operation/grantPermissionOnNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().grantPermissionOnNamespace(namespace, role, getAuthActions(actions)); + +``` + + + + +```` + +## Get permissions + +You can see which permissions have been granted to which roles in a namespace. 
+ +````mdx-code-block + + + +Use the [`permissions`](reference-pulsar-admin#permissions) subcommand and specify a namespace: + +```shell + +$ pulsar-admin namespaces permissions test-tenant/ns1 +{ + "admin10": [ + "produce", + "consume" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/permissions|operation/getPermissions?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPermissions(namespace); + +``` + + + + +```` + +## Revoke permissions + +You can revoke permissions from specific roles, which means that those roles will no longer have access to the specified namespace. + +````mdx-code-block + + + +Use the [`revoke-permission`](reference-pulsar-admin.md#revoke-permission) subcommand and specify a namespace and a role using the `--role` flag: + +```shell + +$ pulsar-admin namespaces revoke-permission test-tenant/ns1 \ + --role admin10 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/permissions/:role|operation/revokePermissionsOnNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().revokePermissionsOnNamespace(namespace, role); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-persistent-topics.md b/site2/website/versioned_docs/version-2.10.x/admin-api-persistent-topics.md new file mode 100644 index 0000000000000..50d135b72f542 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-persistent-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-persistent-topics +title: Managing persistent topics +sidebar_label: "Persistent topics" +original_id: admin-api-persistent-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-schemas.md b/site2/website/versioned_docs/version-2.10.x/admin-api-schemas.md new file mode 100644 index 0000000000000..9ffe21f5b0f75 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-schemas.md @@ -0,0 +1,7 @@ +--- +id: admin-api-schemas +title: Managing Schemas +sidebar_label: "Schemas" +original_id: admin-api-schemas +--- + diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-tenants.md b/site2/website/versioned_docs/version-2.10.x/admin-api-tenants.md new file mode 100644 index 0000000000000..e962ed851e4f0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-tenants.md @@ -0,0 +1,242 @@ +--- +id: admin-api-tenants +title: Managing Tenants +sidebar_label: "Tenants" +original_id: admin-api-tenants +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +Tenants, like namespaces, can be managed using the [admin API](admin-api-overview.md). There are currently two configurable aspects of tenants: + +* Admin roles +* Allowed clusters + +## Tenant resources + +### List + +You can list all of the tenants associated with an [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#tenants-list) subcommand. 
+ +```shell + +$ pulsar-admin tenants list +my-tenant-1 +my-tenant-2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/tenants|operation/getTenants?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().getTenants(); + +``` + + + + +```` + +### Create + +You can create a new tenant. + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#tenants-create) subcommand: + +```shell + +$ pulsar-admin tenants create my-tenant + +``` + +When creating a tenant, you can optionally assign admin roles using the `-r`/`--admin-roles` +flag, and clusters using the `-c`/`--allowed-clusters` flag. You can specify multiple values +as a comma-separated list. Here are some examples: + +```shell + +$ pulsar-admin tenants create my-tenant \ + --admin-roles role1,role2,role3 \ + --allowed-clusters cluster1 + +$ pulsar-admin tenants create my-tenant \ + -r role1 \ + -c cluster1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/tenants/:tenant|operation/createTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().createTenant(tenantName, tenantInfo); + +``` + + + + +```` + +### Get configuration + +You can fetch the [configuration](reference-configuration.md) for an existing tenant at any time. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#tenants-get) subcommand and specify the name of the tenant. Here's an example: + +```shell + +$ pulsar-admin tenants get my-tenant +{ + "adminRoles": [ + "admin1", + "admin2" + ], + "allowedClusters": [ + "cl1", + "cl2" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/tenants/:tenant|operation/getTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().getTenantInfo(tenantName); + +``` + + + + +```` + +### Delete + +Tenants can be deleted from a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#tenants-delete) subcommand and specify the name of the tenant. 
+ +```shell + +$ pulsar-admin tenants delete my-tenant + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/tenants/:tenant|operation/deleteTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().deleteTenant(tenantName); + +``` + + + + +```` + +### Update + +You can update a tenant's configuration. + +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#tenants-update) subcommand. + +```shell + +$ pulsar-admin tenants update my-tenant + +``` + + + + +{@inject: endpoint|POST|/admin/v2/tenants/:tenant|operation/updateTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().updateTenant(tenantName, tenantInfo); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/admin-api-topics.md b/site2/website/versioned_docs/version-2.10.x/admin-api-topics.md new file mode 100644 index 0000000000000..90baa7a120ee6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/admin-api-topics.md @@ -0,0 +1,2472 @@ +--- +id: admin-api-topics +title: Manage topics +sidebar_label: "Topics" +original_id: admin-api-topics +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](/api/admin/). + +Pulsar has persistent and non-persistent topics. A persistent topic is a logical endpoint for publishing and consuming messages. 
The topic name structure for persistent topics is: + +```shell + +persistent://tenant/namespace/topic + +``` + +Non-persistent topics are used in applications that only consume real-time published messages and do not need persistent guarantee. In this way, it reduces message-publish latency by removing overhead of persisting messages. The topic name structure for non-persistent topics is: + +```shell + +non-persistent://tenant/namespace/topic + +``` + +## Manage topic resources +Whether it is persistent or non-persistent topic, you can obtain the topic resources through `pulsar-admin` tool, REST API and Java. + +:::note + +In REST API, `:schema` stands for persistent or non-persistent. `:tenant`, `:namespace`, `:x` are variables, replace them with the real tenant, namespace, and `x` names when using them. +Take {@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} as an example, to get the list of persistent topics in REST API, use `https://pulsar.apache.org/admin/v2/persistent/my-tenant/my-namespace`. To get the list of non-persistent topics in REST API, use `https://pulsar.apache.org/admin/v2/non-persistent/my-tenant/my-namespace`. + +::: + +### List of topics + +You can get the list of topics under a given namespace in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list \ + my-tenant/my-namespace + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} + + + + +```java + +String namespace = "my-tenant/my-namespace"; +admin.topics().getList(namespace); + +``` + + + + +```` + +### Grant permission + +You can grant permissions on a client role to perform specific actions on a given topic in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics grant-permission \ + --actions produce,consume --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/permissions/:role|operation/grantPermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +Set actions = Sets.newHashSet(AuthAction.produce, AuthAction.consume); +admin.topics().grantPermission(topic, role, actions); + +``` + + + + +```` + +### Get permission + +You can fetch permission in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics permissions \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/permissions|operation/getPermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getPermissions(topic); + +``` + + + + +```` + +### Revoke permission + +You can revoke a permission granted on a client role in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics revoke-permission \ + --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic/permissions/:role|operation/revokePermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +admin.topics().revokePermissions(topic, role); + +``` + + + + +```` + +### Delete topic + +You can delete a topic in the following ways. You cannot delete a topic if any active subscription or producer is connected to it. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics delete \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic|operation/deleteTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().delete(topic); + +``` + + + + +```` + +### Unload topic + +You can unload a topic in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics unload \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/unload|operation/unloadTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().unload(topic); + +``` + + + + +```` + +### Get stats + +You can check the following statistics of a given non-partitioned topic. + + - **msgRateIn**: The sum of all local and replication publishers' publish rates (msg/s). + + - **msgThroughputIn**: The sum of all local and replication publishers' publish rates (bytes/s). + + - **msgRateOut**: The sum of all local and replication consumers' dispatch rates(msg/s). + + - **msgThroughputOut**: The sum of all local and replication consumers' dispatch rates (bytes/s). + + - **averageMsgSize**: The average size (in bytes) of messages published within the last interval. + + - **storageSize**: The sum of the ledgers' storage size for this topic. The space used to store the messages for the topic. + + - **earliestMsgPublishTimeInBacklogs**: The publish time of the earliest message in the backlog (ms). + + - **bytesInCounter**: Total bytes published to the topic. + + - **msgInCounter**: Total messages published to the topic. + + - **bytesOutCounter**: Total bytes delivered to consumers. + + - **msgOutCounter**: Total messages delivered to consumers. + + - **msgChunkPublished**: Topic has chunked message published on it. 
+ + - **backlogSize**: Estimated total unconsumed or backlog size (in bytes). + + - **offloadedStorageSize**: Space used to store the offloaded messages for the topic (in bytes). + + - **waitingPublishers**: The number of publishers waiting in a queue in exclusive access mode. + + - **deduplicationStatus**: The status of message deduplication for the topic. + + - **topicEpoch**: The topic epoch or empty if not set. + + - **nonContiguousDeletedMessagesRanges**: The number of non-contiguous deleted messages ranges. + + - **nonContiguousDeletedMessagesRangesSerializedSize**: The serialized size of non-contiguous deleted messages ranges. + + - **publishers**: The list of all local publishers into the topic. The list ranges from zero to thousands. + + - **accessMode**: The type of access to the topic that the producer requires. + + - **msgRateIn**: The total rate of messages (msg/s) published by this publisher. + + - **msgThroughputIn**: The total throughput (bytes/s) of the messages published by this publisher. + + - **averageMsgSize**: The average message size in bytes from this publisher within the last interval. + + - **chunkedMessageRate**: The total rate of chunked messages published by this publisher. + + - **producerId**: The internal identifier for this producer on this topic. + + - **producerName**: The internal identifier for this producer, generated by the client library. + + - **address**: The IP address and source port for the connection of this producer. + + - **connectedSince**: The timestamp when this producer is created or reconnected last time. + + - **clientVersion**: The client library version of this producer. + + - **metadata**: Metadata (key/value strings) associated with this publisher. + + - **subscriptions**: The list of all local subscriptions to the topic. + + - **my-subscription**: The name of this subscription. It is defined by the client. + + - **msgRateOut**: The total rate of messages (msg/s) delivered on this subscription. 
+ + - **msgThroughputOut**: The total throughput (bytes/s) delivered on this subscription. + + - **msgBacklog**: The number of messages in the subscription backlog. + + - **type**: The subscription type. + + - **msgRateExpired**: The rate at which messages were discarded instead of dispatched from this subscription due to TTL. + + - **lastExpireTimestamp**: The timestamp of the last message expire execution. + + - **lastConsumedFlowTimestamp**: The timestamp of the last flow command received. + + - **lastConsumedTimestamp**: The latest timestamp of all the consumed timestamp of the consumers. + + - **lastAckedTimestamp**: The latest timestamp of all the acked timestamp of the consumers. + + - **bytesOutCounter**: Total bytes delivered to consumer. + + - **msgOutCounter**: Total messages delivered to consumer. + + - **msgRateRedeliver**: Total rate of messages redelivered on this subscription (msg/s). + + - **chunkedMessageRate**: Chunked message dispatch rate. + + - **backlogSize**: Size of backlog for this subscription (in bytes). + + - **earliestMsgPublishTimeInBacklog**: The publish time of the earliest message in the backlog for the subscription (ms). + + - **msgBacklogNoDelayed**: Number of messages in the subscription backlog that do not contain the delay messages. + + - **blockedSubscriptionOnUnackedMsgs**: Flag to verify if a subscription is blocked due to reaching threshold of unacked messages. + + - **msgDelayed**: Number of delayed messages currently being tracked. + + - **unackedMessages**: Number of unacknowledged messages for the subscription, where an unacknowledged message is one that has been sent to a consumer but not yet acknowledged. This field is only meaningful when using a subscription that tracks individual message acknowledgement. + + - **activeConsumerName**: The name of the consumer that is active for single active consumer subscriptions. For example, failover or exclusive. 
+ + - **totalMsgExpired**: Total messages expired on this subscription. + + - **lastMarkDeleteAdvancedTimestamp**: Last MarkDelete position advanced timestamp. + + - **durable**: Whether the subscription is durable or ephemeral (for example, from a reader). + + - **replicated**: Mark that the subscription state is kept in sync across different regions. + + - **allowOutOfOrderDelivery**: Whether out of order delivery is allowed on the Key_Shared subscription. + + - **keySharedMode**: Whether the Key_Shared subscription mode is AUTO_SPLIT or STICKY. + + - **consumersAfterMarkDeletePosition**: This is for Key_Shared subscription to get the recentJoinedConsumers in the Key_Shared subscription. + + - **nonContiguousDeletedMessagesRanges**: The number of non-contiguous deleted messages ranges. + + - **nonContiguousDeletedMessagesRangesSerializedSize**: The serialized size of non-contiguous deleted messages ranges. + + - **consumers**: The list of connected consumers for this subscription. + + - **msgRateOut**: The total rate of messages (msg/s) delivered to the consumer. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered to the consumer. + + - **consumerName**: The internal identifier for this consumer, generated by the client library. + + - **availablePermits**: The number of messages that the consumer has space for in the client library's listen queue. `0` means the client library's queue is full and `receive()` isn't being called. A non-zero value means this consumer is ready for dispatched messages. + + - **unackedMessages**: The number of unacknowledged messages for the consumer, where an unacknowledged message is one that has been sent to the consumer but not yet acknowledged. This field is only meaningful when using a subscription that tracks individual message acknowledgement. + + - **blockedConsumerOnUnackedMsgs**: The flag used to verify if the consumer is blocked due to reaching threshold of the unacknowledged messages. 
+ + - **lastConsumedTimestamp**: The timestamp when the consumer reads a message the last time. + + - **lastAckedTimestamp**: The timestamp when the consumer acknowledges a message the last time. + + - **address**: The IP address and source port for the connection of this consumer. + + - **connectedSince**: The timestamp when this consumer is created or reconnected last time. + + - **clientVersion**: The client library version of this consumer. + + - **bytesOutCounter**: Total bytes delivered to consumer. + + - **msgOutCounter**: Total messages delivered to consumer. + + - **msgRateRedeliver**: Total rate of messages redelivered by this consumer (msg/s). + + - **chunkedMessageRate**: The total rate of chunked messages delivered to this consumer. + + - **avgMessagesPerEntry**: Number of average messages per entry for the consumer consumed. + + - **readPositionWhenJoining**: The read position of the cursor when the consumer joining. + + - **keyHashRanges**: Hash ranges assigned to this consumer if is Key_Shared sub mode. + + - **metadata**: Metadata (key/value strings) associated with this consumer. + + - **replication**: This section gives the stats for cross-colo replication of this topic + + - **msgRateIn**: The total rate (msg/s) of messages received from the remote cluster. + + - **msgThroughputIn**: The total throughput (bytes/s) received from the remote cluster. + + - **msgRateOut**: The total rate of messages (msg/s) delivered to the replication-subscriber. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered to the replication-subscriber. + + - **msgRateExpired**: The total rate of messages (msg/s) expired. + + - **replicationBacklog**: The number of messages pending to be replicated to remote cluster. + + - **connected**: Whether the outbound replicator is connected. + + - **replicationDelayInSeconds**: How long the oldest message has been waiting to be sent through the connection, if connected is `true`. 
+ + - **inboundConnection**: The IP and port of the broker in the remote cluster's publisher connection to this broker. + + - **inboundConnectedSince**: The TCP connection being used to publish messages to the remote cluster. If there are no local publishers connected, this connection is automatically closed after a minute. + + - **outboundConnection**: The address of the outbound replication connection. + + - **outboundConnectedSince**: The timestamp of establishing outbound connection. + +The following is an example of a topic status. + +```json + +{ + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesInCounter" : 504, + "msgInCounter" : 9, + "bytesOutCounter" : 2296, + "msgOutCounter" : 41, + "averageMsgSize" : 0.0, + "msgChunkPublished" : false, + "storageSize" : 504, + "backlogSize" : 0, + "earliestMsgPublishTimeInBacklogs": 0, + "offloadedStorageSize" : 0, + "publishers" : [ { + "accessMode" : "Shared", + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "averageMsgSize" : 0.0, + "chunkedMessageRate" : 0.0, + "producerId" : 0, + "metadata" : { }, + "address" : "/127.0.0.1:65402", + "connectedSince" : "2021-06-09T17:22:55.913+08:00", + "clientVersion" : "2.9.0-SNAPSHOT", + "producerName" : "standalone-1-0" + } ], + "waitingPublishers" : 0, + "subscriptions" : { + "sub-demo" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesOutCounter" : 2296, + "msgOutCounter" : 41, + "msgRateRedeliver" : 0.0, + "chunkedMessageRate" : 0, + "msgBacklog" : 0, + "backlogSize" : 0, + "earliestMsgPublishTimeInBacklog": 0, + "msgBacklogNoDelayed" : 0, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "type" : "Exclusive", + "activeConsumerName" : "20b81", + "msgRateExpired" : 0.0, + "totalMsgExpired" : 0, + "lastExpireTimestamp" : 0, + "lastConsumedFlowTimestamp" : 1623230565356, + "lastConsumedTimestamp" : 1623230583946, + "lastAckedTimestamp" : 1623230584033, + 
"lastMarkDeleteAdvancedTimestamp" : 1623230584033, + "consumers" : [ { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesOutCounter" : 2296, + "msgOutCounter" : 41, + "msgRateRedeliver" : 0.0, + "chunkedMessageRate" : 0.0, + "consumerName" : "20b81", + "availablePermits" : 959, + "unackedMessages" : 0, + "avgMessagesPerEntry" : 314, + "blockedConsumerOnUnackedMsgs" : false, + "lastAckedTimestamp" : 1623230584033, + "lastConsumedTimestamp" : 1623230583946, + "metadata" : { }, + "address" : "/127.0.0.1:65172", + "connectedSince" : "2021-06-09T17:22:45.353+08:00", + "clientVersion" : "2.9.0-SNAPSHOT" + } ], + "allowOutOfOrderDelivery": false, + "consumersAfterMarkDeletePosition" : { }, + "nonContiguousDeletedMessagesRanges" : 0, + "nonContiguousDeletedMessagesRangesSerializedSize" : 0, + "durable" : true, + "replicated" : false + } + }, + "replication" : { }, + "deduplicationStatus" : "Disabled", + "nonContiguousDeletedMessagesRanges" : 0, + "nonContiguousDeletedMessagesRangesSerializedSize" : 0 +} + +``` + +To get the status of a topic, you can use the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/stats|operation/getStats?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getStats(topic); + +``` + + + + +```` + +### Get internal stats + +You can get the detailed statistics of a topic. + + - **entriesAddedCounter**: Messages published since this broker loaded this topic. + + - **numberOfEntries**: The total number of messages being tracked. + + - **totalSize**: The total storage size in bytes of all messages. + + - **currentLedgerEntries**: The count of messages written to the ledger that is currently open for writing. 
+ + - **currentLedgerSize**: The size in bytes of messages written to the ledger that is currently open for writing. + + - **lastLedgerCreatedTimestamp**: The time when the last ledger is created. + + - **lastLedgerCreationFailureTimestamp**: The time when the last ledger creation failed. + + - **waitingCursorsCount**: The number of cursors that are "caught up" and waiting for a new message to be published. + + - **pendingAddEntriesCount**: The number of messages whose (asynchronous) write requests are still awaiting completion. + + - **lastConfirmedEntry**: The ledgerid:entryid of the last message that is written successfully. If the entryid is `-1`, then the ledger is open, yet no entries are written. + + - **state**: The state of this ledger for writing. The state `LedgerOpened` means that a ledger is open for saving published messages. + + - **ledgers**: The ordered list of all ledgers for this topic holding messages. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. + + - **metadata**: The ledger metadata. + + - **schemaLedgers**: The ordered list of all ledgers for this topic schema. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. + + - **metadata**: The ledger metadata. + + - **compactedLedger**: The ledgers holding un-acked messages after topic compaction. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. The value is `false` for the compacted topic ledger. 
+ + - **cursors**: The list of all cursors on this topic. Each subscription in the topic stats has a cursor. + + - **markDeletePosition**: All messages before the markDeletePosition are acknowledged by the subscriber. + + - **readPosition**: The latest position of subscriber for reading message. + + - **waitingReadOp**: This is true when the subscription has read the latest message published to the topic and is waiting for new messages to be published. + + - **pendingReadOps**: The counter for how many outstanding read requests to the BookKeepers in progress. + + - **messagesConsumedCounter**: The number of messages this cursor has acked since this broker loaded this topic. + + - **cursorLedger**: The ledger being used to persistently store the current markDeletePosition. + + - **cursorLedgerLastEntry**: The last entryid used to persistently store the current markDeletePosition. + + - **individuallyDeletedMessages**: If acknowledges are being done out of order, the ranges of messages acknowledged between the markDeletePosition and the read-position shows. + + - **lastLedgerSwitchTimestamp**: The last time the cursor ledger is rolled over. + + - **state**: The state of the cursor ledger: `Open` means you have a cursor ledger for saving updates of the markDeletePosition. + +The following is an example of the detailed statistics of a topic. 
+ +```json + +{ + "entriesAddedCounter":0, + "numberOfEntries":0, + "totalSize":0, + "currentLedgerEntries":0, + "currentLedgerSize":0, + "lastLedgerCreatedTimestamp":"2021-01-22T21:12:14.868+08:00", + "lastLedgerCreationFailureTimestamp":null, + "waitingCursorsCount":0, + "pendingAddEntriesCount":0, + "lastConfirmedEntry":"3:-1", + "state":"LedgerOpened", + "ledgers":[ + { + "ledgerId":3, + "entries":0, + "size":0, + "offloaded":false, + "metadata":null + } + ], + "cursors":{ + "test":{ + "markDeletePosition":"3:-1", + "readPosition":"3:-1", + "waitingReadOp":false, + "pendingReadOps":0, + "messagesConsumedCounter":0, + "cursorLedger":4, + "cursorLedgerLastEntry":1, + "individuallyDeletedMessages":"[]", + "lastLedgerSwitchTimestamp":"2021-01-22T21:12:14.966+08:00", + "state":"Open", + "numberOfEntriesSinceFirstNotAckedMessage":0, + "totalNonContiguousDeletedMessagesRange":0, + "properties":{ + + } + } + }, + "schemaLedgers":[ + { + "ledgerId":1, + "entries":11, + "size":10, + "offloaded":false, + "metadata":null + } + ], + "compactedLedger":{ + "ledgerId":-1, + "entries":-1, + "size":-1, + "offloaded":false, + "metadata":null + } +} + +``` + +To get the internal status of a topic, you can use the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/internalStats|operation/getInternalStats?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getInternalStats(topic); + +``` + + + + +```` + +### Peek messages + +You can peek a number of messages for a specific subscription of a given topic in the following ways. 
+````mdx-code-block + + + +```shell + +$ pulsar-admin topics peek-messages \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +Message ID: 315674752:0 +Properties: { "X-Pulsar-publish-time" : "2015-07-13 17:40:28.451" } +msg-payload + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/position/:messagePosition|operation/peekNthMessage?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.topics().peekMessages(topic, subName, numMessages); + +``` + + + + +```` + +### Get message by ID + +You can fetch the message with the given ledger ID and entry ID in the following ways. + +````mdx-code-block + + + +```shell + +$ ./bin/pulsar-admin topics get-message-by-id \ + persistent://public/default/my-topic \ + -l 10 -e 0 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/ledger/:ledgerId/entry/:entryId|operation/getMessageById?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +long ledgerId = 10; +long entryId = 10; +admin.topics().getMessageById(topic, ledgerId, entryId); + +``` + + + + +```` + +### Examine messages + +You can examine a specific message on a topic by position relative to the earliest or the latest message. 
+ +````mdx-code-block + + + +```shell + +./bin/pulsar-admin topics examine-messages \ + persistent://public/default/my-topic \ + -i latest -m 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/examinemessage?initialPosition=:initialPosition&messagePosition=:messagePosition|operation/examineMessage?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().examineMessage(topic, "latest", 1); + +``` + + + + +```` + +### Get message ID + +You can get the ID of the message published at or just after the given datetime. + +````mdx-code-block + + + +```shell + +./bin/pulsar-admin topics get-message-id \ + persistent://public/default/my-topic \ + -d 2021-06-28T19:01:17Z + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/messageid/:timestamp|operation/getMessageIdByTimestamp?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +long timestamp = System.currentTimeMillis(); +admin.topics().getMessageIdByTimestamp(topic, timestamp); + +``` + + + + +```` + + +### Skip messages + +You can skip a number of messages for a specific subscription of a given topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics skip \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/skip/:numMessages|operation/skipMessages?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.topics().skipMessages(topic, subName, numMessages); + +``` + + + + +```` + +### Skip all messages + +You can skip all the old messages for a specific subscription of a given topic. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics skip-all \ + --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/skip_all|operation/skipAllMessages?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +admin.topics().skipAllMessages(topic, subName); + +``` + + + + +```` + +### Reset cursor + +You can reset a subscription cursor position back to the position which is recorded X minutes before. It essentially calculates time and position of cursor at X minutes before and resets it at that position. You can reset the cursor in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics reset-cursor \ + --subscription my-subscription --time 10 \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/resetcursor/:timestamp|operation/resetCursor?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +long timestamp = 2342343L; +admin.topics().resetCursor(topic, subName, timestamp); + +``` + + + + +```` + +### Look up topic's owner broker + +You can locate the owner broker of the given topic in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics lookup \ + persistent://test-tenant/ns1/tp1 \ + + "pulsar://broker1.org.com:4480" + +``` + + + + +{@inject: endpoint|GET|/lookup/v2/topic/:topic-domain/:tenant/:namespace/:topic|operation/lookupTopicAsync?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().lookupDestination(topic); + +``` + + + + +```` + +### Look up partitioned topic's owner broker + +You can locate the owner broker of the given partitioned topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics partitioned-lookup \ + persistent://test-tenant/ns1/my-topic \ + + "persistent://test-tenant/ns1/my-topic-partition-0 pulsar://localhost:6650" + "persistent://test-tenant/ns1/my-topic-partition-1 pulsar://localhost:6650" + "persistent://test-tenant/ns1/my-topic-partition-2 pulsar://localhost:6650" + "persistent://test-tenant/ns1/my-topic-partition-3 pulsar://localhost:6650" + +``` + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().lookupPartitionedTopic(topic); + +``` + +Lookup the partitioned topics sorted by broker URL + +```shell + +$ pulsar-admin topics partitioned-lookup \ + persistent://test-tenant/ns1/my-topic --sort-by-broker \ + + "pulsar://localhost:6650 [persistent://test-tenant/ns1/my-topic-partition-0, persistent://test-tenant/ns1/my-topic-partition-1, persistent://test-tenant/ns1/my-topic-partition-2, persistent://test-tenant/ns1/my-topic-partition-3]" + +``` + + + + +```` + +### Get bundle + +You can get the range of the bundle that the given topic belongs to in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics bundle-range \ + persistent://test-tenant/ns1/tp1 \ + + "0x00000000_0xffffffff" + +``` + + + + +{@inject: endpoint|GET|/lookup/v2/topic/:topic_domain/:tenant/:namespace/:topic/bundle|operation/getNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().getBundleRange(topic); + +``` + + + + +```` + +### Get subscriptions + +You can check all subscription names for a given topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics subscriptions \ + persistent://test-tenant/ns1/tp1 \ + + my-subscription + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscriptions|operation/getSubscriptions?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getSubscriptions(topic); + +``` + + + + +```` + +### Last Message Id + +You can get the last committed message ID for a persistent topic. It is available since 2.3.0 release. + +````mdx-code-block + + + +```shell + +pulsar-admin topics last-message-id topic-name + +``` + + + + +{@inject: endpoint|Get|/admin/v2/:schema/:tenant/:namespace/:topic/lastMessageId|operation/getLastMessageId?version=@pulsar:version_number@} + + + + +```Java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getLastMessage(topic); + +``` + + + + +```` + +### Get backlog size + +You can get the backlog size of a single partition topic or a non-partitioned topic with a given message ID (in bytes). 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics get-backlog-size \ + -m 1:1 \ + persistent://test-tenant/ns1/tp1-partition-0 \ + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/backlogSize|operation/getBacklogSizeByMessageId?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +MessageId messageId = MessageId.earliest; +admin.topics().getBacklogSizeByMessageId(topic, messageId); + +``` + + + + +```` + + +### Configure deduplication snapshot interval + +#### Get deduplication snapshot interval + +To get the topic-level deduplication snapshot interval, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics get-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/getDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getDeduplicationSnapshotInterval(topic) + +``` + + + + +```` + +#### Set deduplication snapshot interval + +To set the topic-level deduplication snapshot interval, use one of the following methods. + +> **Prerequisite** `brokerDeduplicationEnabled` must be set to `true`. + +````mdx-code-block + + + +``` + +pulsar-admin topics set-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/setDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setDeduplicationSnapshotInterval(topic, 1000) + +``` + + + + +```` + +#### Remove deduplication snapshot interval + +To remove the topic-level deduplication snapshot interval, use one of the following methods. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics remove-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/deleteDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeDeduplicationSnapshotInterval(topic) + +``` + + + + +```` + + +### Configure inactive topic policies + +#### Get inactive topic policies + +To get the topic-level inactive topic policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics get-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/getInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getInactiveTopicPolicies(topic) + +``` + + + + +```` + +#### Set inactive topic policies + +To set the topic-level inactive topic policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics set-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/setInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setInactiveTopicPolicies(topic, inactiveTopicPolicies) + +``` + + + + +```` + +#### Remove inactive topic policies + +To remove the topic-level inactive topic policies, use one of the following methods. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics remove-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/removeInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeInactiveTopicPolicies(topic) + +``` + + + + +```` + + +### Configure offload policies + +#### Get offload policies + +To get the topic-level offload policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics get-offload-policies options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/getOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getOffloadPolicies(topic) + +``` + + + + +```` + +#### Set offload policies + +To set the topic-level offload policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics set-offload-policies options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/setOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setOffloadPolicies(topic, offloadPolicies) + +``` + + + + +```` + +#### Remove offload policies + +To remove the topic-level offload policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics remove-offload-policies options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/removeOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeOffloadPolicies(topic) + +``` + + + + +```` + + +## Manage non-partitioned topics +You can use Pulsar [admin API](admin-api-overview.md) to create, delete and check status of non-partitioned topics. + +### Create +Non-partitioned topics must be explicitly created. 
When creating a new non-partitioned topic, you need to provide a name for the topic. + +By default, 60 seconds after creation, topics are considered inactive and deleted automatically to avoid generating trash data. To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to a specific value. + +For more information about the two parameters, see [here](reference-configuration.md#broker). + +You can create non-partitioned topics in the following ways. +````mdx-code-block + + + +When you create non-partitioned topics with the [`create`](reference-pulsar-admin.md#create-3) command, you need to specify the topic name as an argument. + +```shell + +$ bin/pulsar-admin topics create \ + persistent://my-tenant/my-namespace/my-topic + +``` + +:::note + +When you create a non-partitioned topic with the suffix '-partition-' followed by numeric value like 'xyz-topic-partition-x' for the topic name, if a partitioned topic with same suffix 'xyz-topic-partition-y' exists, then the numeric value(x) for the non-partitioned topic must be larger than the number of partitions(y) of the partitioned topic. Otherwise, you cannot create such a non-partitioned topic. + +::: + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic|operation/createNonPartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createNonPartitionedTopic(topicName); + +``` + + + + +```` + +### Delete +You can delete non-partitioned topics in the following ways. 
+````mdx-code-block + + + +```shell + +$ bin/pulsar-admin topics delete \ + persistent://my-tenant/my-namespace/my-topic + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic|operation/deleteTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().delete(topic); + +``` + + + + +```` + +### List + +You can get the list of topics under a given namespace in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getList(namespace); + +``` + + + + +```` + +### Stats + +You can check the current statistics of a given topic. The following is an example. For description of each stats, refer to [get stats](#get-stats). + +```json + +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers": [] + } + }, + "replication": {} +} + +``` + +You can check the current statistics of a given topic and its connected producers and consumers in the following ways. 
+````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats \ + persistent://test-tenant/namespace/topic \ + --get-precise-backlog + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/stats|operation/getStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getStats(topic, false /* is precise backlog */); + +``` + + + + +```` + +## Manage partitioned topics +You can use Pulsar [admin API](admin-api-overview.md) to create, update, delete and check status of partitioned topics. + +### Create + +Partitioned topics must be explicitly created. When creating a new partitioned topic, you need to provide a name and the number of partitions for the topic. + +By default, 60 seconds after creation, topics are considered inactive and deleted automatically to avoid generating trash data. To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to a specific value. + +For more information about the two parameters, see [here](reference-configuration.md#broker). + +You can create partitioned topics in the following ways. +````mdx-code-block + + + +When you create partitioned topics with the [`create-partitioned-topic`](reference-pulsar-admin.md#create-partitioned-topic) +command, you need to specify the topic name as an argument and the number of partitions using the `-p` or `--partitions` flag. + +```shell + +$ bin/pulsar-admin topics create-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 4 + +``` + +:::note + +If a non-partitioned topic with the suffix '-partition-' followed by a numeric value, like 'xyz-topic-partition-10', exists, you cannot create a partitioned topic with the name 'xyz-topic', because the partitions of the partitioned topic could override the existing non-partitioned topic. To create such a partitioned topic, you have to delete that non-partitioned topic first. 
+ +::: + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/partitions|operation/createPartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +int numPartitions = 4; +admin.topics().createPartitionedTopic(topicName, numPartitions); + +``` + + + + +```` + +### Create missed partitions + +When topic auto-creation is disabled, and you have a partitioned topic without any partitions, you can use the [`create-missed-partitions`](reference-pulsar-admin.md#create-missed-partitions) command to create partitions for the topic. + +````mdx-code-block + + + +You can create missed partitions with the [`create-missed-partitions`](reference-pulsar-admin.md#create-missed-partitions) command and specify the topic name as an argument. + +```shell + +$ bin/pulsar-admin topics create-missed-partitions \ + persistent://my-tenant/my-namespace/my-topic \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic|operation/createMissedPartitions?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createMissedPartitions(topicName); + +``` + + + + +```` + +### Get metadata + +Partitioned topics are associated with metadata, you can view it as a JSON object. The following metadata field is available. + +Field | Description +:-----|:------- +`partitions` | The number of partitions into which the topic is divided. + +````mdx-code-block + + + +You can check the number of partitions in a partitioned topic with the [`get-partitioned-topic-metadata`](reference-pulsar-admin.md#get-partitioned-topic-metadata) subcommand. 
+ +```shell + +$ pulsar-admin topics get-partitioned-topic-metadata \ + persistent://my-tenant/my-namespace/my-topic +{ + "partitions": 4 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/partitions|operation/getPartitionedMetadata?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getPartitionedTopicMetadata(topicName); + +``` + + + + +```` + +### Update + +You can update the number of partitions for an existing partitioned topic *if* the topic is non-global. However, you can only increase the number of partitions. Decreasing the number of partitions would delete the topic, which is not supported in Pulsar. + +Producers and consumers can find the newly created partitions automatically. + +````mdx-code-block + + + +You can update partitioned topics with the [`update-partitioned-topic`](reference-pulsar-admin.md#update-partitioned-topic) command. + +```shell + +$ pulsar-admin topics update-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 8 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:cluster/:namespace/:destination/partitions|operation/updatePartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().updatePartitionedTopic(topic, numPartitions); + +``` + + + + +```` + +### Delete +You can delete partitioned topics with the [`delete-partitioned-topic`](reference-pulsar-admin.md#delete-partitioned-topic) command, REST API and Java. 
+ +````mdx-code-block + + + +```shell + +$ bin/pulsar-admin topics delete-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:topic/:namespace/:destination/partitions|operation/deletePartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().delete(topic); + +``` + + + + +```` + +### List +You can get the list of partitioned topics under a given namespace in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list-partitioned-topics tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getPartitionedTopicList?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getPartitionedTopicList(namespace); + +``` + + + + +```` + +### Stats + +You can check the current statistics of a given partitioned topic. The following is an example. For description of each stats, refer to [get stats](#get-stats). + +Note that in the subscription JSON object, `chuckedMessageRate` is deprecated. Please use `chunkedMessageRate`. Both will be sent in the JSON for now. 
+ +```json + +{ + "msgRateIn" : 999.992947159793, + "msgThroughputIn" : 1070918.4635439808, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesInCounter" : 270318763, + "msgInCounter" : 252489, + "bytesOutCounter" : 0, + "msgOutCounter" : 0, + "averageMsgSize" : 1070.926056966454, + "msgChunkPublished" : false, + "storageSize" : 270316646, + "backlogSize" : 200921133, + "publishers" : [ { + "msgRateIn" : 999.992947159793, + "msgThroughputIn" : 1070918.4635439808, + "averageMsgSize" : 1070.3333333333333, + "chunkedMessageRate" : 0.0, + "producerId" : 0 + } ], + "subscriptions" : { + "test" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesOutCounter" : 0, + "msgOutCounter" : 0, + "msgRateRedeliver" : 0.0, + "chuckedMessageRate" : 0, + "chunkedMessageRate" : 0, + "msgBacklog" : 144318, + "msgBacklogNoDelayed" : 144318, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "msgRateExpired" : 0.0, + "lastExpireTimestamp" : 0, + "lastConsumedFlowTimestamp" : 0, + "lastConsumedTimestamp" : 0, + "lastAckedTimestamp" : 0, + "consumers" : [ ], + "isDurable" : true, + "isReplicated" : false + } + }, + "replication" : { }, + "metadata" : { + "partitions" : 3 + }, + "partitions" : { } +} + +``` + +You can check the current statistics of a given partitioned topic and its connected producers and consumers in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics partitioned-stats \ + persistent://test-tenant/namespace/topic \ + --per-partition + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/partitioned-stats|operation/getPartitionedStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getPartitionedStats(topic, true /* per partition */, false /* is precise backlog */); + +``` + + + + +```` + +### Internal stats + +You can check the detailed statistics of a topic. The following is an example. 
For description of each stats, refer to [get internal stats](#get-internal-stats). + +```json + +{ + "entriesAddedCounter": 20449518, + "numberOfEntries": 3233, + "totalSize": 331482, + "currentLedgerEntries": 3233, + "currentLedgerSize": 331482, + "lastLedgerCreatedTimestamp": "2016-06-29 03:00:23.825", + "lastLedgerCreationFailureTimestamp": null, + "waitingCursorsCount": 1, + "pendingAddEntriesCount": 0, + "lastConfirmedEntry": "324711539:3232", + "state": "LedgerOpened", + "ledgers": [ + { + "ledgerId": 324711539, + "entries": 0, + "size": 0 + } + ], + "cursors": { + "my-subscription": { + "markDeletePosition": "324711539:3133", + "readPosition": "324711539:3233", + "waitingReadOp": true, + "pendingReadOps": 0, + "messagesConsumedCounter": 20449501, + "cursorLedger": 324702104, + "cursorLedgerLastEntry": 21, + "individuallyDeletedMessages": "[(324711539:3134‥324711539:3136], (324711539:3137‥324711539:3140], ]", + "lastLedgerSwitchTimestamp": "2016-06-29 01:30:19.313", + "state": "Open" + } + } +} + +``` + +You can get the internal stats for the partitioned topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/namespace/topic + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/internalStats|operation/getInternalStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getInternalStats(topic); + +``` + + + + +```` + + +## Publish to partitioned topics + +By default, Pulsar topics are served by a single broker, which limits the maximum throughput of a topic. *Partitioned topics* can span multiple brokers and thus allow for higher throughput. + +You can publish to partitioned topics using Pulsar client libraries. When publishing to partitioned topics, you must specify a routing mode. If you do not specify any routing mode when you create a new producer, the round robin routing mode is used. 
+ +### Routing mode + +You can specify the routing mode in the ProducerConfiguration object that you use to configure your producer. The routing mode determines which partition (internal topic) each message is published to. + +The following {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} options are available. + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer publishes messages across all partitions in round-robin policy to achieve the maximum throughput. Round-robin is not done per individual message; instead, it is set to the same boundary as the batching delay to ensure that batching is effective. If a key is specified on the message, the partitioned producer hashes the key and assigns the message to a particular partition. This is the default mode. +`SinglePartition` | If no key is provided, the producer picks a single partition randomly and publishes all messages into that partition. If a key is specified on the message, the partitioned producer hashes the key and assigns the message to a particular partition. +`CustomPartition` | Use a custom message router implementation that is called to determine the partition for a particular message. You can create a custom routing mode by using the Java client and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface. 
+ +The following is an example: + +```java + +String pulsarBrokerRootUrl = "pulsar://localhost:6650"; +String topic = "persistent://my-tenant/my-namespace/my-topic"; + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl(pulsarBrokerRootUrl).build(); +Producer producer = pulsarClient.newProducer() + .topic(topic) + .messageRoutingMode(MessageRoutingMode.SinglePartition) + .create(); +producer.send("Partitioned topic message".getBytes()); + +``` + +### Custom message router + +To use a custom message router, you need to provide an implementation of the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface, which has just one `choosePartition` method: + +```java + +public interface MessageRouter extends Serializable { + int choosePartition(Message msg); +} + +``` + +The following router routes every message to partition 10: + +```java + +public class AlwaysTenRouter implements MessageRouter { + public int choosePartition(Message msg) { + return 10; + } +} + +``` + +With that implementation, you can send messages to a partitioned topic as shown in the following example: + +```java + +String pulsarBrokerRootUrl = "pulsar://localhost:6650"; +String topic = "persistent://my-tenant/my-cluster-my-namespace/my-topic"; + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl(pulsarBrokerRootUrl).build(); +Producer producer = pulsarClient.newProducer() + .topic(topic) + .messageRouter(new AlwaysTenRouter()) + .create(); +producer.send("Partitioned topic message".getBytes()); + +``` + +### How to choose partitions when using a key +If a message has a key, it supersedes the round robin routing policy. The following example illustrates how to choose the partition when using a key. + +```java + +// If the message has a key, it supersedes the round robin routing policy + if (msg.hasKey()) { + return signSafeMod(hash.makeHash(msg.getKey()), topicMetadata.numPartitions()); + } + + if (isBatchingEnabled) { // if batching is enabled, choose partition on `partitionSwitchMs` boundary. 
+ long currentMs = clock.millis(); + return signSafeMod(currentMs / partitionSwitchMs + startPtnIdx, topicMetadata.numPartitions()); + } else { + return signSafeMod(PARTITION_INDEX_UPDATER.getAndIncrement(this), topicMetadata.numPartitions()); + } + +``` + +## Manage subscriptions + +You can use [Pulsar admin API](admin-api-overview.md) to create, check, and delete subscriptions. + +### Create subscription + +You can create a subscription for a topic using one of the following methods. + +````mdx-code-block + + + + +```shell + +pulsar-admin topics create-subscription \ +--subscription my-subscription \ +persistent://test-tenant/ns1/tp1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/persistent/:tenant/:namespace/:topic/subscription/:subscription|operation/createSubscriptions?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subscriptionName = "my-subscription"; +admin.topics().createSubscription(topic, subscriptionName, MessageId.latest); + +``` + + + + +```` + +### Get subscription + +You can check all subscription names for a given topic using one of the following methods. + +````mdx-code-block + + + + +```shell + +pulsar-admin topics subscriptions \ +persistent://test-tenant/ns1/tp1 \ +my-subscription + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscriptions|operation/getSubscriptions?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getSubscriptions(topic); + +``` + + + + +```` + +### Unsubscribe subscription + +When a subscription does not process messages any more, you can unsubscribe it using one of the following methods. 
+ +````mdx-code-block + + + + +```shell + +pulsar-admin topics unsubscribe \ +--subscription my-subscription \ +persistent://test-tenant/ns1/tp1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/:topic/subscription/:subscription|operation/deleteSubscription?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subscriptionName = "my-subscription"; +admin.topics().deleteSubscription(topic, subscriptionName); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/administration-geo.md b/site2/website/versioned_docs/version-2.10.x/administration-geo.md new file mode 100644 index 0000000000000..2d64f0b643f1e --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-geo.md @@ -0,0 +1,302 @@ +--- +id: administration-geo +title: Pulsar geo-replication +sidebar_label: "Geo-replication" +original_id: administration-geo +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +## Enable geo-replication for a namespace + +You must enable geo-replication on a [per-tenant basis](#concepts-multi-tenancy) in Pulsar. For example, you can enable geo-replication between two specific clusters only when a tenant has access to both clusters. + +Geo-replication is managed at the namespace level, which means you only need to create and configure a namespace to replicate messages between two or more provisioned clusters that a tenant can access. + +Complete the following tasks to enable geo-replication for a namespace: + +* [Enable a geo-replication namespace](#enable-geo-replication-at-namespace-level) +* [Configure that namespace to replicate across two or more provisioned clusters](admin-api-namespaces.md/#configure-replication-clusters) + +Any message published on *any* topic in that namespace is replicated to all clusters in the specified set. 
+ +## Local persistence and forwarding + +When messages are produced on a Pulsar topic, messages are first persisted in the local cluster, and then forwarded asynchronously to the remote clusters. + +In normal cases, when there are no connectivity issues, messages are replicated immediately, at the same time as they are dispatched to local consumers. Typically, the network [round-trip time](https://en.wikipedia.org/wiki/Round-trip_delay_time) (RTT) between the remote regions defines end-to-end delivery latency. + +Applications can create producers and consumers in any of the clusters, even when the remote clusters are not reachable (like during a network partition). + +Producers and consumers can publish messages to and consume messages from any cluster in a Pulsar instance. However, subscriptions can not only be local to the cluster where they are created but can also be transferred between clusters after replicated subscription is enabled. Once replicated subscription is enabled, you can keep subscription state in synchronization. Therefore, a topic can be asynchronously replicated across multiple geographical regions. In case of failover, a consumer can restart consuming messages from the failure point in a different cluster. + +![A typical geo-replication example with full-mesh pattern](/assets/geo-replication.png) + +In the aforementioned example, the **T1** topic is replicated among three clusters, **Cluster-A**, **Cluster-B**, and **Cluster-C**. + +All messages produced in any of the three clusters are delivered to all subscriptions in other clusters. In this case, **C1** and **C2** consumers receive all messages that **P1**, **P2**, and **P3** producers publish. Ordering is still guaranteed on a per-producer basis. + +## Configure replication + +This section guides you through the steps to configure geo-replicated clusters. +1. [Connect replication clusters](#connect-replication-clusters) +2. 
[Grant permissions to properties](#grant-permissions-to-properties) +3. [Enable geo-replication](#enable-geo-replication) +4. [Use topics with geo-replication](#use-topics-with-geo-replication) + +### Connect replication clusters + +To replicate data among clusters, you need to configure each cluster to connect to the others. You can use the [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) tool to create a connection. + +**Example** + +Suppose that you have 3 replication clusters: `us-west`, `us-cent`, and `us-east`. + +1. Configure the connection from `us-west` to `us-east`. + + Run the following command on `us-west`. + +```shell + +$ bin/pulsar-admin clusters create \ + --broker-url pulsar://<DNS-OF-US-EAST>:<PORT> \ + --url http://<DNS-OF-US-EAST>:<PORT> \ + us-east + +``` + + :::tip + + - If you want to use a secure connection for a cluster, you can use the flags `--broker-url-secure` and `--url-secure`. For more information, see [pulsar-admin clusters create](https://pulsar.apache.org/tools/pulsar-admin/). + - Different clusters may have different authentications. You can use the authentication flags `--auth-plugin` and `--auth-parameters` together to set cluster authentication, which overrides `brokerClientAuthenticationPlugin` and `brokerClientAuthenticationParameters` if `authenticationEnabled` is set to `true` in `broker.conf` and `standalone.conf`. For more information, see [authentication and authorization](concepts-authentication.md). + + ::: + +2. Configure the connection from `us-west` to `us-cent`. + + Run the following command on `us-west`. + +```shell + +$ bin/pulsar-admin clusters create \ + --broker-url pulsar://<DNS-OF-US-CENT>:<PORT> \ + --url http://<DNS-OF-US-CENT>:<PORT> \ + us-cent + +``` + +3. Run similar commands on `us-east` and `us-cent` to create connections among clusters. + +### Grant permissions to properties + +To replicate to a cluster, the tenant needs permission to use that cluster. You can grant permission to the tenant when you create the tenant or grant it later.
+ +Specify all the intended clusters when you create a tenant: + +```shell + +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east,us-cent + +``` + +To update permissions of an existing tenant, use `update` instead of `create`. + +### Enable geo-replication + +You can enable geo-replication at **namespace** or **topic** level. + +#### Enable geo-replication at namespace level + +You can create a namespace with the following command sample. + +```shell + +$ bin/pulsar-admin namespaces create my-tenant/my-namespace + +``` + +Initially, the namespace is not assigned to any cluster. You can assign the namespace to clusters using the `set-clusters` subcommand: + +```shell + +$ bin/pulsar-admin namespaces set-clusters my-tenant/my-namespace \ + --clusters us-west,us-east,us-cent + +``` + +#### Enable geo-replication at topic level + +You can set geo-replication at topic level using the command `pulsar-admin topics set-replication-clusters`. For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more information, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/). + +```shell + +$ bin/pulsar-admin topics set-replication-clusters --clusters us-west,us-east,us-cent my-tenant/my-namespace/my-topic + +``` + +:::tip + +- You can change the replication clusters for a namespace at any time, without disruption to ongoing traffic. Replication channels are immediately set up or stopped in all clusters as soon as the configuration changes. +- Once you create a geo-replication namespace, any topics that producers or consumers create within that namespace are replicated across clusters. Typically, each application uses the `serviceUrl` for the local cluster. 
+- If you are using Pulsar version `2.10.x`, to enable geo-replication at topic level, you need to change the following configurations in the `conf/broker.conf` or `conf/standalone.conf` file to enable topic policies service. +```shell +systemTopicEnabled=true +topicLevelPoliciesEnabled=true +``` +::: + +### Use topics with geo-replication + +#### Selective replication + +By default, messages are replicated to all clusters configured for the namespace. You can restrict replication selectively by specifying a replication list for a message, and then that message is replicated only to the subset in the replication list. + +The following is an example for the [Java API](client-libraries-java.md). Note the use of the `setReplicationClusters` method when you construct the {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} object: + +```java + +List restrictReplicationTo = Arrays.asList( + "us-west", + "us-east" +); + +Producer producer = client.newProducer() + .topic("some-topic") + .create(); + +producer.newMessage() + .value("my-payload".getBytes()) + .setReplicationClusters(restrictReplicationTo) + .send(); + +``` + +#### Topic stats + +You can check topic-specific statistics for geo-replication topics using one of the following methods. + +````mdx-code-block + + + +Use the [`pulsar-admin topics stats`](https://pulsar.apache.org/tools/pulsar-admin/) command. + +```shell + +$ bin/pulsar-admin topics stats persistent://my-tenant/my-namespace/my-topic + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/stats|operation/getStats?version=@pulsar:version_number@} + + + + +```` + +Each cluster reports its own local stats, including the incoming and outgoing replication rates and backlogs. + +#### Delete a geo-replication topic + +Given that geo-replication topics exist in multiple regions, directly deleting a geo-replication topic is not possible. Instead, you should rely on automatic topic garbage collection. 
+ +In Pulsar, a topic is automatically deleted when the topic meets the following three conditions: +- no producers or consumers are connected to it; +- no subscriptions to it; +- no more messages are kept for retention. +For geo-replication topics, each region uses a fault-tolerant mechanism to decide when deleting the topic locally is safe. + +You can explicitly disable topic garbage collection by setting `brokerDeleteInactiveTopicsEnabled` to `false` in your [broker configuration](reference-configuration.md#broker). + +To delete a geo-replication topic, close all producers and consumers on the topic, and delete all of its local subscriptions in every replication cluster. When Pulsar determines that no valid subscription for the topic remains across the system, it will garbage collect the topic. + +## Replicated subscriptions + +Pulsar supports replicated subscriptions, so you can keep subscription state in sync, within a sub-second timeframe, in the context of a topic that is being asynchronously replicated across multiple geographical regions. + +In case of failover, a consumer can restart consuming from the failure point in a different cluster. + +### Enable replicated subscription + +Replicated subscription is disabled by default. You can enable replicated subscription when creating a consumer. + +```java + +Consumer consumer = client.newConsumer(Schema.STRING) + .topic("my-topic") + .subscriptionName("my-subscription") + .replicateSubscriptionState(true) + .subscribe(); + +``` + +### Advantages + + * It is easy to implement the logic. + * You can choose to enable or disable replicated subscription. + * When you enable it, the overhead is low, and it is easy to configure. + * When you disable it, the overhead is zero. + +### Limitations + +* When you enable replicated subscription, you're creating a consistent distributed snapshot to establish an association between message ids from different clusters. The snapshots are taken periodically. 
The default value is `1 second`. It means that a consumer failing over to a different cluster can potentially receive 1 second of duplicates. You can also configure the frequency of the snapshot in the `broker.conf` file. +* Only the baseline cursor position is synced in replicated subscriptions while the individual acknowledgments are not synced. This means the messages acknowledged out-of-order could end up getting delivered again, in the case of a cluster failover. + +## Migrate data between clusters using geo-replication + +Using geo-replication to migrate data between clusters is a special use case of the [active-active replication pattern](concepts-replication.md/#active-active-replication) when you don't have a large amount of data. + +1. Create your new cluster. +2. Add the new cluster to your old cluster. + +```shell + + bin/pulsar-admin clusters create new-cluster + +``` + +3. Add the new cluster to your tenant. + +```shell + + bin/pulsar-admin tenants update my-tenant --allowed-clusters old-cluster,new-cluster + +``` + +4. Set the clusters on your namespace. + +```shell + + bin/pulsar-admin namespaces set-clusters my-tenant/my-ns --clusters old-cluster,new-cluster + +``` + +5. Update your applications using [replicated subscriptions](#replicated-subscriptions). +6. Validate subscription replication is active. + +```shell + + bin/pulsar-admin topics stats-internal public/default/t1 + +``` + +7. Move your consumers and producers to the new cluster by modifying the values of `serviceURL`. + +:::note + +* The replication starts from step 4, which means existing messages in your old cluster are not replicated. +* If you have some older messages to migrate, you can pre-create the replication subscriptions for each topic and set it at the earliest position by using `pulsar-admin topics create-subscription -s pulsar.repl.new-cluster -m earliest <topic>`.
+ +::: + diff --git a/site2/website/versioned_docs/version-2.10.x/administration-isolation.md b/site2/website/versioned_docs/version-2.10.x/administration-isolation.md new file mode 100644 index 0000000000000..b176d1f14c20d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-isolation.md @@ -0,0 +1,124 @@ +--- +id: administration-isolation +title: Pulsar isolation +sidebar_label: "Pulsar isolation" +original_id: administration-isolation +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +In an organization, a Pulsar instance provides services to multiple teams. When organizing the resources across multiple teams, you want to make a suitable isolation plan to avoid the resource competition between different teams and applications and provide high-quality messaging service. In this case, you need to take resource isolation into consideration and weigh your intended actions against expected and unexpected consequences. + +To enforce resource isolation, you can use the Pulsar isolation policy, which allows you to allocate resources (**broker** and **bookie**) for the namespace. + +## Broker isolation + +In Pulsar, when namespaces (more specifically, namespace bundles) are assigned dynamically to brokers, the namespace isolation policy limits the set of brokers that can be used for assignment. Before topics are assigned to brokers, you can set the namespace isolation policy with a primary or a secondary regex to select desired brokers. + +You can set a namespace isolation policy for a cluster using one of the following methods. + +````mdx-code-block + + + + +``` + +pulsar-admin ns-isolation-policy set options + +``` + +For more information about the command `pulsar-admin ns-isolation-policy set options`, see [here](https://pulsar.apache.org/tools/pulsar-admin/). 
+ +**Example** + +```shell + +bin/pulsar-admin ns-isolation-policy set \ +--auto-failover-policy-type min_available \ +--auto-failover-policy-params min_limit=1,usage_threshold=80 \ +--namespaces my-tenant/my-namespace \ +--primary 10.193.216.* my-cluster policy-name + +``` + + + + +[PUT /admin/v2/namespaces/{tenant}/{namespace}](https://pulsar.apache.org/admin-rest-api/?version=master&apiversion=v2#operation/createNamespace) + + + + +For how to set namespace isolation policy using Java admin API, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/NamespacesImpl.java#L251). + + + + +```` + +## Bookie isolation + +A namespace can be isolated into user-defined groups of bookies, which guarantees all the data that belongs to the namespace is stored in desired bookies. The bookie affinity group uses the BookKeeper [rack-aware placement policy](https://bookkeeper.apache.org/docs/latest/api/javadoc/org/apache/bookkeeper/client/EnsemblePlacementPolicy.html) and it is a way to feed rack information which is stored as JSON format in znode. + +You can set a bookie affinity group using one of the following methods. + +````mdx-code-block + + + + +``` + +pulsar-admin namespaces set-bookie-affinity-group options + +``` + +For more information about the command `pulsar-admin namespaces set-bookie-affinity-group options`, see [here](https://pulsar.apache.org/tools/pulsar-admin/). + +**Example** + +```shell + +bin/pulsar-admin bookies set-bookie-rack \ +--bookie 127.0.0.1:3181 \ +--hostname 127.0.0.1:3181 \ +--group group-bookie1 \ +--rack rack1 + +bin/pulsar-admin namespaces set-bookie-affinity-group public/default \ +--primary-group group-bookie1 + +``` + +:::note + +- Do not set a bookie rack name to slash (`/`) or an empty string (`""`) if you use Pulsar earlier than 2.7.5, 2.8.3, and 2.9.2. 
If you use Pulsar 2.7.5, 2.8.3, 2.9.2 or later versions, it falls back to `/default-rack` or `/default-region/default-rack`. +- When `RackawareEnsemblePlacementPolicy` is enabled, the rack name is not allowed to contain slash (`/`) except for the beginning and end of the rack name string. For example, rack name like `/rack0` is okay, but `/rack/0` is not allowed. +- When `RegionawareEnsemblePlacementPolicy` is enabled, the rack name can only contain one slash (`/`) except for the beginning and end of the rack name string. For example, rack name like `/region0/rack0` is okay, but `/region0rack0` and `/region0/rack/0` are not allowed. +For the bookie rack name restrictions, see [pulsar-admin bookies set-bookie-rack](https://pulsar.apache.org/tools/pulsar-admin/). + +::: + + + + +[POST /admin/v2/namespaces/{tenant}/{namespace}/persistence/bookieAffinity](https://pulsar.apache.org/admin-rest-api/?version=master&apiversion=v2#operation/setBookieAffinityGroup) + + + + +For how to set bookie affinity group for a namespace using Java admin API, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/NamespacesImpl.java#L1164). + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/administration-load-balance.md b/site2/website/versioned_docs/version-2.10.x/administration-load-balance.md new file mode 100644 index 0000000000000..397c88c5dc0f7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-load-balance.md @@ -0,0 +1,280 @@ +--- +id: administration-load-balance +title: Load balance across brokers +sidebar_label: "Load balance" +original_id: administration-load-balance +--- + + +Pulsar is a horizontally scalable messaging system, so the traffic in a logical cluster must be balanced across all the available Pulsar brokers as evenly as possible, which is a core requirement. 
+ +You can use multiple settings and tools to control the traffic distribution which requires a bit of context to understand how the traffic is managed in Pulsar. Though in most cases, the core requirement mentioned above is true out of the box and you should not worry about it. + +The following sections introduce how the load-balanced assignments work across Pulsar brokers and how you can leverage the framework to adjust. + +## Dynamic assignments + +Topics are dynamically assigned to brokers based on the load conditions of all brokers in the cluster. The assignment of topics to brokers is not done at the topic level but at the **bundle** level (a higher level). Instead of individual topic assignments, each broker takes ownership of a subset of the topics for a namespace. This subset is called a bundle and effectively this subset is a sharding mechanism. + +In other words, each namespace is an "administrative" unit and sharded into a list of bundles, with each bundle comprising a portion of the overall hash range of the namespace. Topics are assigned to a particular bundle by taking the hash of the topic name and checking in which bundle the hash falls. Each bundle is independent of the others and thus is independently assigned to different brokers. + +The benefit of the assignment granularity is to amortize the amount of information that you need to keep track of. Based on CPU, memory, traffic load, and other indexes, topics are assigned to a particular broker dynamically. For example: +* When a client starts using new topics that are not assigned to any broker, a process is triggered to choose the best-suited broker to acquire ownership of these topics according to the load conditions. +* If the broker owning a topic becomes overloaded, the topic is reassigned to a less-loaded broker. +* If the broker owning a topic crashes, the topic is reassigned to another active broker. 
+ +:::tip + +For partitioned topics, different partitions are assigned to different brokers. Here "topic" means either a non-partitioned topic or one partition of a topic. + +::: + +## Create namespaces with assigned bundles + +When you create a new namespace, a number of bundles are assigned to the namespace. You can set this number in the `conf/broker.conf` file: + +```conf + +# When a namespace is created without specifying the number of bundles, this +# value will be used as the default +defaultNumberOfNamespaceBundles=4 + +``` + +Alternatively, you can override the value when you create a new namespace using [Pulsar admin](/tools/pulsar-admin/): + +```shell + +bin/pulsar-admin namespaces create my-tenant/my-namespace --clusters us-west --bundles 16 + +``` + +With the above command, you create a namespace with 16 initial bundles. Therefore the topics for this namespace can immediately be spread across up to 16 brokers. + +In general, if you know the expected traffic and number of topics in advance, you had better start with a reasonable number of bundles instead of waiting for the system to auto-correct the distribution. + +On the same note, it is beneficial to start with more bundles than the number of brokers, due to the hashing nature of the distribution of topics into bundles. For example, for a namespace with 1000 topics, using something like 64 bundles achieves a good distribution of traffic across 16 brokers. + + +## Split namespace bundles + +Since the load for the topics in a bundle might change over time and predicting the load might be hard, bundle split is designed to resolve these challenges. The broker splits a bundle into two and the new smaller bundles can be reassigned to different brokers. + +Pulsar supports the following two bundle split algorithms: +* `range_equally_divide`: split the bundle into two parts with the same hash range size. +* `topic_count_equally_divide`: split the bundle into two parts with the same number of topics. 
+ +To enable bundle split, you need to configure the following settings in the `broker.conf` file, and set `defaultNamespaceBundleSplitAlgorithm` based on your needs. + +```conf + +loadBalancerAutoBundleSplitEnabled=true +loadBalancerAutoUnloadSplitBundlesEnabled=true +defaultNamespaceBundleSplitAlgorithm=range_equally_divide + +``` + +You can configure more parameters for splitting thresholds. Any existing bundle that exceeds any of the thresholds is a candidate to be split. By default, the newly split bundles are immediately reassigned to other brokers, to facilitate the traffic distribution. + +```conf + +# maximum topics in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxTopics=1000 + +# maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxSessions=1000 + +# maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxMsgRate=30000 + +# maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxBandwidthMbytes=100 + +# maximum number of bundles in a namespace (for auto-split) +loadBalancerNamespaceMaximumBundles=128 + +``` + +## Shed load automatically + +The support for automatic load shedding is available in the load manager of Pulsar. This means that whenever the system recognizes a particular broker is overloaded, the system forces some traffic to be reassigned to less-loaded brokers. + +When a broker is identified as overloaded, the broker forces to "unload" a subset of the bundles, the ones with higher traffic, that make up for the overload percentage. + +For example, the default threshold is 85% and if a broker is over quota at 95% CPU usage, then the broker unloads the percent difference plus a 5% margin: `(95% - 85%) + 5% = 15%`. 
Given the selection of bundles to unload is based on traffic (as a proxy measure for CPU, network, and memory), the broker unloads bundles for at least 15% of traffic. + +:::tip + +* The automatic load shedding is enabled by default. To disable it, you can set `loadBalancerSheddingEnabled` to `false`. +* Besides the automatic load shedding, you can [manually unload bundles](#unload-topics-and-bundles). + +::: + +Additional settings that apply to shedding: + +```conf + +# Load shedding interval. Broker periodically checks whether some traffic should be offload from +# some over-loaded broker to other under-loaded brokers +loadBalancerSheddingIntervalMinutes=1 + +# Prevent the same topics to be shed and moved to other brokers more than once within this timeframe +loadBalancerSheddingGracePeriodMinutes=30 + +``` + +Pulsar supports the following types of automatic load shedding strategies. +* [ThresholdShedder](#thresholdshedder) +* [OverloadShedder](#overloadshedder) +* [UniformLoadShedder](#uniformloadshedder) + +:::note + +* From Pulsar 2.10, the **default** shedding strategy is `ThresholdShedder`. +* You need to restart brokers if the shedding strategy is [dynamically updated](admin-api-brokers.md/#dynamic-broker-configuration). + +::: + +### ThresholdShedder +This strategy tends to shed the bundles if any broker's usage is above the configured threshold. It does this by first computing the average resource usage per broker for the whole cluster. The resource usage for each broker is calculated using the following method `LocalBrokerData#getMaxResourceUsageWithWeight`. Historical observations are included in the running average based on the broker's setting for `loadBalancerHistoryResourcePercentage`. Once the average resource usage is calculated, a broker's current/historical usage is compared to the average broker usage. 
If a broker's usage is greater than the average usage per broker plus the `loadBalancerBrokerThresholdShedderPercentage`, this load shedder proposes removing enough bundles to bring the unloaded broker 5% below the current average broker usage. Note that recently unloaded bundles are not unloaded again. + +![Shedding strategy - ThresholdShedder](/assets/shedding-strategy-thresholdshedder.svg) + +For example, assume you have three brokers, the average broker usage of broker1 is 40%, the average broker usage of broker2 and broker3 is 10%, then the cluster average usage is 20% ((40% + 10% + 10%) / 3). If you set `loadBalancerBrokerThresholdShedderPercentage` to `10`, then only broker1's certain bundles get unloaded, because the average usage of broker1 is greater than the sum of the cluster average usage (20%) plus `loadBalancerBrokerThresholdShedderPercentage`(10%). + +To use the `ThresholdShedder` strategy, configure brokers with this value. +`loadBalancerLoadSheddingStrategy=org.apache.pulsar.broker.loadbalance.impl.ThresholdShedder` + +You can configure the weights for each resource per broker in the `conf/broker.conf` file. + +```conf + +# The BandWithIn usage weight when calculating new resource usage. The range is between 0 and 1.0. +loadBalancerBandwithInResourceWeight=1.0 + +# The BandWithOut usage weight when calculating new resource usage. The range is between 0 and 1.0. +loadBalancerBandwithOutResourceWeight=1.0 + +# The CPU usage weight when calculating new resource usage. The range is between 0 and 1.0. +loadBalancerCPUResourceWeight=1.0 + +# The heap memory usage weight when calculating new resource usage. The range is between 0 and 1.0. +loadBalancerMemoryResourceWeight=1.0 + +# The direct memory usage weight when calculating new resource usage. The range is between 0 and 1.0. 
+loadBalancerDirectMemoryResourceWeight=1.0 + +``` + +### OverloadShedder +This strategy attempts to shed exactly one bundle on brokers which are overloaded, that is, whose maximum system resource usage exceeds [`loadBalancerBrokerOverloadedThresholdPercentage`](#broker-overload-thresholds). To see which resources are considered when determining the maximum system resource usage, see [Broker overload thresholds](#broker-overload-thresholds). A bundle is recommended for unloading off that broker if and only if the following conditions hold: The broker has at least two bundles assigned and the broker has at least one bundle that has not been unloaded recently according to `loadBalancerSheddingGracePeriodMinutes`. The unloaded bundle will be the most expensive bundle in terms of message rate that has not been recently unloaded. Note that this strategy does not take into account "underloaded" brokers when determining which bundles to unload. If you are looking for a strategy that spreads load evenly across all brokers, see [ThresholdShedder](#thresholdshedder). + +![Shedding strategy - OverloadShedder](/assets/shedding-strategy-overloadshedder.svg) + +To use the `OverloadShedder` strategy, configure brokers with this value. +`loadBalancerLoadSheddingStrategy=org.apache.pulsar.broker.loadbalance.impl.OverloadShedder` + +#### Broker overload thresholds + +The determination of when a broker is overloaded is based on the threshold of CPU, network, and memory usage. Whenever any of those metrics reaches the threshold, the system triggers the shedding (if enabled). + +:::note + +The overload threshold `loadBalancerBrokerOverloadedThresholdPercentage` only applies to the [`OverloadShedder`](#overloadshedder) shedding strategy. By default, it is set to 85%. + +::: + +Pulsar gathers the CPU, network, and memory usage stats from the system metrics. In some cases of network utilization, the network interface speed that Linux reports is not correct and needs to be manually overridden.
This is the case in AWS EC2 instances with 1Gbps NIC speed for which the OS reports 10Gbps speed. + +Because of the incorrect max speed, the load manager might think the broker has not reached the NIC capacity, while in fact the broker already uses all the bandwidth and the traffic is slowed down. + +You can set `loadBalancerOverrideBrokerNicSpeedGbps` in the `conf/broker.conf` file to correct the max NIC speed. When the value is empty, Pulsar uses the value that the OS reports. + +### UniformLoadShedder +This strategy tends to distribute load uniformly across all brokers. This strategy checks the load difference between the broker with the highest load and the broker with the lowest load. If the difference is higher than the configured thresholds `loadBalancerMsgRateDifferenceShedderThreshold` and `loadBalancerMsgThroughputMultiplierDifferenceShedderThreshold`, it finds bundles that can be unloaded to distribute traffic evenly across all brokers. + +![Shedding strategy - UniformLoadShedder](/assets/shedding-strategy-uniformLoadshedder.svg) + +To use the `UniformLoadShedder` strategy, configure brokers with this value. +`loadBalancerLoadSheddingStrategy=org.apache.pulsar.broker.loadbalance.impl.UniformLoadShedder` + +## Unload topics and bundles + +You can "unload" a topic in Pulsar through manual admin operations. Unloading means closing topics, releasing ownership, and reassigning topics to a new broker, based on the current load. + +When unloading happens, the client experiences a small latency blip, typically in the order of tens of milliseconds, while the topic is reassigned. + +Unloading is the mechanism that the load manager uses to perform the load shedding, but you can also trigger the unloading manually, for example, to correct the assignments and redistribute traffic even before having any broker overloaded.
+ +Unloading a topic has no effect on the assignment, but just closes and reopens the particular topic: + +```shell + +pulsar-admin topics unload persistent://tenant/namespace/topic + +``` + +To unload all topics for a namespace and trigger reassignments: + +```shell + +pulsar-admin namespaces unload tenant/namespace + +``` + +## Distribute anti-affinity namespaces across failure domains + +When your application has multiple namespaces and you want one of them available all the time to avoid any downtime, you can group these namespaces and distribute them across different [failure domains](reference-terminology.md#failure-domain) and different brokers. Thus, if one of the failure domains is down (due to release rollout or brokers restart), it only disrupts namespaces owned by that specific failure domain and the rest of the namespaces owned by other domains remain available without any impact. + +Such a group of namespaces has anti-affinity to each other, that is, all the namespaces in this group are [anti-affinity namespaces](reference-terminology.md#anti-affinity-namespaces) and are distributed to different failure domains in a load-balanced manner. + +As illustrated in the following figure, Pulsar has 2 failure domains (Domain1 and Domain2) and each domain has 2 brokers in it. You can create an anti-affinity namespace group that has 4 namespaces in it, and all the 4 namespaces have anti-affinity to each other. The load manager tries to distribute namespaces evenly across all the brokers in the same domain. Since each domain has 2 brokers, every broker owns one namespace from this anti-affinity namespace group, and you can see each domain owns 2 namespaces, and each broker owns 1 namespace. + +![Distribute anti-affinity namespaces across failure domains](/assets/anti-affinity-namespaces-across-failure-domains.svg) + +The load manager follows an even distribution policy across failure domains to assign anti-affinity namespaces. 
The following table outlines the even-distributed assignment sequence illustrated in the above figure. + +| Assignment sequence | Namespace | Failure domain candidates | Broker candidates | Selected broker | +|:---|:------------|:------------------|:------------------------------------|:-----------------| +| 1 | Namespace1 | Domain1, Domain2 | Broker1, Broker2, Broker3, Broker4 | Domain1:Broker1 | +| 2 | Namespace2 | Domain2 | Broker3, Broker4 | Domain2:Broker3 | +| 3 | Namespace3 | Domain1, Domain2 | Broker2, Broker4 | Domain1:Broker2 | +| 4 | Namespace4 | Domain2 | Broker4 | Domain2:Broker4 | + +:::tip + +* Each namespace belongs to only one anti-affinity group. If a namespace with an existing anti-affinity assignment is assigned to another anti-affinity group, the original assignment is dropped. + +* If there are more anti-affinity namespaces than failure domains, the load manager distributes namespaces evenly across all the domains, and also every domain distributes namespaces evenly across all the brokers under that domain. + +::: + +### Create a failure domain and register brokers + +:::note + +One broker can only be registered to a single failure domain. + +::: + +To create a domain under a specific cluster and register brokers, run the following command: + +```bash + +pulsar-admin clusters create-failure-domain --domain-name --broker-list + +``` + +You can also view, update, and delete domains under a specific cluster. For more information, refer to [Pulsar admin doc](/tools/pulsar-admin/). + +### Create an anti-affinity namespace group + +An anti-affinity group is created automatically when the first namespace is assigned to the group. To assign a namespace to an anti-affinity group, run the following command. It sets an anti-affinity group name for a namespace. + +```bash + +pulsar-admin namespaces set-anti-affinity-group --group + +``` + +For more information about `anti-affinity-group` related commands, refer to [Pulsar admin doc](/tools/pulsar-admin/). 
diff --git a/site2/website/versioned_docs/version-2.10.x/administration-proxy.md b/site2/website/versioned_docs/version-2.10.x/administration-proxy.md new file mode 100644 index 0000000000000..f45185dc45bfe --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-proxy.md @@ -0,0 +1,90 @@ +--- +id: administration-proxy +title: Pulsar proxy +sidebar_label: "Pulsar proxy" +original_id: administration-proxy +--- + +Pulsar proxy is an optional gateway. Pulsar proxy is used when direct connections between clients and Pulsar brokers are either infeasible or undesirable. For example, when you run Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform, you can run Pulsar proxy. + +## Configure the proxy + +Before using the proxy, you need to configure it with the brokers addresses in the cluster. You can configure the broker URL in the proxy configuration, or the proxy to connect directly using service discovery. + +> In a production environment service discovery is not recommended. + +### Use broker URLs + +It is more secure to specify a URL to connect to the brokers. + +Proxy authorization requires access to ZooKeeper, so if you use these broker URLs to connect to the brokers, you need to disable authorization at the Proxy level. Brokers still authorize requests after the proxy forwards them. + +You can configure the broker URLs in `conf/proxy.conf` as follows. 
+ +```properties + +brokerServiceURL=pulsar://brokers.example.com:6650 +brokerWebServiceURL=http://brokers.example.com:8080 +functionWorkerWebServiceURL=http://function-workers.example.com:8080 + +``` + +If you use TLS, configure the broker URLs in the following way: + +```properties + +brokerServiceURLTLS=pulsar+ssl://brokers.example.com:6651 +brokerWebServiceURLTLS=https://brokers.example.com:8443 +functionWorkerWebServiceURL=https://function-workers.example.com:8443 + +``` + +The hostname in the URLs provided should be a DNS entry which points to multiple brokers or a virtual IP address, which is backed by multiple broker IP addresses, so that the proxy does not lose connectivity to the Pulsar cluster if a single broker becomes unavailable. + +The ports to connect to the brokers (6650 and 8080, or in the case of TLS, 6651 and 8443) should be open in the network ACLs. + +Note that if you do not use functions, you do not need to configure `functionWorkerWebServiceURL`. + +### Use service discovery + +Pulsar uses [ZooKeeper](https://zookeeper.apache.org) for service discovery. To connect the proxy to ZooKeeper, specify the following in `conf/proxy.conf`. + +```properties + +metadataStoreUrl=my-zk-0:2181,my-zk-1:2181,my-zk-2:2181 +configurationMetadataStoreUrl=my-zk-0:2184,my-zk-remote:2184 + +``` + +> To use service discovery, you need to open the network ACLs, so the proxy can connect to the ZooKeeper nodes through the ZooKeeper client port (port `2181`) and the configuration store client port (port `2184`). + +> However, it is not secure to use service discovery, because if the network ACL is open and someone compromises a proxy, they have full access to ZooKeeper. 
+ +## Start the proxy + +To start the proxy: + +```bash + +$ cd /path/to/pulsar/directory +$ bin/pulsar proxy \ + --metadata-store zk:my-zk-1:2181,my-zk-2:2181,my-zk-3:2181 \ + --configuration-metadata-store zk:my-zk-1:2181,my-zk-2:2181,my-zk-3:2181 + +``` + +> You can run multiple instances of the Pulsar proxy in a cluster. + +## Stop the proxy + +Pulsar proxy runs in the foreground by default. To stop the proxy, simply stop the process in which the proxy is running. + +## Proxy frontends + +You can run Pulsar proxy behind some kind of load-distributing frontend, such as an [HAProxy](https://www.digitalocean.com/community/tutorials/an-introduction-to-haproxy-and-load-balancing-concepts) load balancer. + +## Use Pulsar clients with the proxy + +Once your Pulsar proxy is up and running, preferably behind a load-distributing [frontend](#proxy-frontends), clients can connect to the proxy via whichever address that the frontend uses. If the address is the DNS address `pulsar.cluster.default`, for example, the connection URL for clients is `pulsar://pulsar.cluster.default:6650`. + +For more information on Proxy configuration, refer to [Pulsar proxy](reference-configuration.md#pulsar-proxy). diff --git a/site2/website/versioned_docs/version-2.10.x/administration-pulsar-manager.md b/site2/website/versioned_docs/version-2.10.x/administration-pulsar-manager.md new file mode 100644 index 0000000000000..40c5a33da6da8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-pulsar-manager.md @@ -0,0 +1,216 @@ +--- +id: administration-pulsar-manager +title: Pulsar Manager +sidebar_label: "Pulsar Manager" +original_id: administration-pulsar-manager +--- + +Pulsar Manager is a web-based GUI management and monitoring tool that helps administrators and users manage and monitor tenants, namespaces, topics, subscriptions, brokers, clusters, and so on, and supports dynamic configuration of multiple environments. 
+ +:::note + +If you are monitoring your current stats with [Pulsar dashboard](administration-dashboard.md), we recommend you use Pulsar Manager instead. Pulsar dashboard is deprecated. + +::: + +## Install + +### Quick Install +The easiest way to use the Pulsar Manager is to run it inside a [Docker](https://www.docker.com/products/docker) container. + +```shell + +docker pull apachepulsar/pulsar-manager:v0.2.0 +docker run -it \ + -p 9527:9527 -p 7750:7750 \ + -e SPRING_CONFIGURATION_FILE=/pulsar-manager/pulsar-manager/application.properties \ + apachepulsar/pulsar-manager:v0.2.0 + +``` + +* Pulsar Manager is divided into front-end and back-end, the front-end service port is `9527` and the back-end service port is `7750`. +* `SPRING_CONFIGURATION_FILE`: Default configuration file for spring. +* By default, Pulsar Manager uses the `herddb` database. HerdDB is a SQL distributed database implemented in Java and can be found at [herddb.org](https://herddb.org/) for more information. + +### Configure Database or JWT authentication +#### Configure Database (optional) + +If you have a large amount of data, you can use a custom database. Otherwise, some display errors may occur. For example, the topic information cannot be displayed when the topic exceeds 10000. +The following is an example of PostgreSQL. + +1. Initialize database and table structures using the [file](https://github.com/apache/pulsar-manager/blob/master/src/main/resources/META-INF/sql/postgresql-schema.sql). +2. Download and modify the [configuration file](https://github.com/apache/pulsar-manager/blob/master/src/main/resources/application.properties), then add the PostgreSQL configuration. + +```properties + +spring.datasource.driver-class-name=org.postgresql.Driver +spring.datasource.url=jdbc:postgresql://127.0.0.1:5432/pulsar_manager +spring.datasource.username=postgres +spring.datasource.password=postgres + +``` + +3. Add a configuration mount and start with a docker image. 
+ +```bash + +docker pull apachepulsar/pulsar-manager:v0.2.0 +docker run -it \ + -p 9527:9527 -p 7750:7750 \ + -v /your-path/application.properties:/pulsar-manager/pulsar- +manager/application.properties + -e SPRING_CONFIGURATION_FILE=/pulsar-manager/pulsar-manager/application.properties \ + apachepulsar/pulsar-manager:v0.2.0 + +``` + +#### Enable JWT authentication (optional) + +If you want to turn on JWT authentication, configure the `application.properties` file. + +```properties + +backend.jwt.token=token + +jwt.broker.token.mode=PRIVATE +jwt.broker.public.key=file:///path/broker-public.key +jwt.broker.private.key=file:///path/broker-private.key + +or +jwt.broker.token.mode=SECRET +jwt.broker.secret.key=file:///path/broker-secret.key + +``` + +• `backend.jwt.token`: token for the superuser. You need to configure this parameter during cluster initialization. +• `jwt.broker.token.mode`: multiple modes of generating token, including PUBLIC, PRIVATE, and SECRET. +• `jwt.broker.public.key`: configure this option if you use the PUBLIC mode. +• `jwt.broker.private.key`: configure this option if you use the PRIVATE mode. +• `jwt.broker.secret.key`: configure this option if you use the SECRET mode. +For more information, see [Token Authentication Admin of Pulsar](security-token-admin.md). + +Docker command to add profile and key files mount. 
+ +```bash + +docker pull apachepulsar/pulsar-manager:v0.2.0 +docker run -it \ + -p 9527:9527 -p 7750:7750 \ + -v /your-path/application.properties:/pulsar-manager/pulsar- +manager/application.properties + -v /your-path/private.key:/pulsar-manager/private.key + -e SPRING_CONFIGURATION_FILE=/pulsar-manager/pulsar-manager/application.properties \ + apachepulsar/pulsar-manager:v0.2.0 + +``` + +### Set the administrator account and password + +```bash + +CSRF_TOKEN=$(curl http://localhost:7750/pulsar-manager/csrf-token) +curl \ + -H "X-XSRF-TOKEN: $CSRF_TOKEN" \ + -H "Cookie: XSRF-TOKEN=$CSRF_TOKEN;" \ + -H "Content-Type: application/json" \ + -X PUT http://localhost:7750/pulsar-manager/users/superuser \ + -d '{"name": "admin", "password": "apachepulsar", "description": "test", "email": "username@test.org"}' + +``` + +The request parameter in the curl command: + +```json + +{"name": "admin", "password": "apachepulsar", "description": "test", "email": "username@test.org"} + +``` + +- `name` is the Pulsar Manager login username, currently `admin`. +- `password` is the password of the current user of Pulsar Manager, currently `apachepulsar`. The password must be at least 6 characters long. + + + +### Configure the environment +1. Log in to the system. Visit http://localhost:9527 to log in. The current default account is `admin/apachepulsar`. + +2. Click "New Environment" button to add an environment. + +3. Input the "Environment Name". The environment name is used for identifying an environment. + +4. Input the "Service URL". The Service URL is the admin service url of your Pulsar cluster. + + +## Other Installation +### Bare-metal installation + +When using binary packages for direct deployment, you can follow these steps. + +- Download and unzip the binary package, which is available on the [Pulsar Download](/download) page. 
+ + ```bash + + wget https://dist.apache.org/repos/dist/release/pulsar/pulsar-manager/pulsar-manager-0.2.0/apache-pulsar-manager-0.2.0-bin.tar.gz + tar -zxvf apache-pulsar-manager-0.2.0-bin.tar.gz + + ``` + +- Extract the back-end service binary package and place the front-end resources in the back-end service directory. + + ```bash + + cd pulsar-manager + tar -zxvf pulsar-manager.tar + cd pulsar-manager + cp -r ../dist ui + + ``` + +- Modify `application.properties` configuration on demand. + + > If you don't want to modify the `application.properties` file, you can pass the configuration as startup parameters instead, for example `./bin/pulsar-manager --backend.jwt.token=token`. This is a capability of the spring boot framework. + +- Start Pulsar Manager + + ```bash + + ./bin/pulsar-manager + + ``` + +### Custom docker image installation + +You can find the docker image in the [Docker Hub](https://github.com/apache/pulsar-manager/tree/master/docker) directory and build an image from the source code as well: + + ```bash + + git clone https://github.com/apache/pulsar-manager + cd pulsar-manager/front-end + npm install --save + npm run build:prod + cd .. + ./gradlew build -x test + cd .. + docker build -f docker/Dockerfile --build-arg BUILD_DATE=`date -u +"%Y-%m-%dT%H:%M:%SZ"` --build-arg VCS_REF=latest --build-arg VERSION=latest -t apachepulsar/pulsar-manager . + + ``` + +## Configuration + + + +| application.properties | System env on Docker Image | Desc | Example | +| ----------------------------------- | -------------------------- | ------------------------------------------------------------ | ------------------------------------------------- | +| backend.jwt.token | JWT_TOKEN | token for the superuser. You need to configure this parameter during cluster initialization. | `token` | +| jwt.broker.token.mode | N/A | multiple modes of generating token, including PUBLIC, PRIVATE, and SECRET. 
| `PUBLIC` or `PRIVATE` or `SECRET`. | +| jwt.broker.public.key | PUBLIC_KEY | configure this option if you use the PUBLIC mode. | `file:///path/broker-public.key` | +| jwt.broker.private.key | PRIVATE_KEY | configure this option if you use the PRIVATE mode. | `file:///path/broker-private.key` | +| jwt.broker.secret.key | SECRET_KEY | configure this option if you use the SECRET mode. | `file:///path/broker-secret.key` | +| spring.datasource.driver-class-name | DRIVER_CLASS_NAME | the driver class name of the database. | `org.postgresql.Driver` | +| spring.datasource.url | URL | the JDBC URL of your database. | `jdbc:postgresql://127.0.0.1:5432/pulsar_manager` | +| spring.datasource.username | USERNAME | the username of database. | `postgres` | +| spring.datasource.password | PASSWORD | the password of database. | `postgres` | +| N/A | LOG_LEVEL | the level of log. | DEBUG | +* For more information about backend configurations, see [here](https://github.com/apache/pulsar-manager/blob/master/src/README). +* For more information about frontend configurations, see [here](https://github.com/apache/pulsar-manager/tree/master/front-end). 
+ diff --git a/site2/website/versioned_docs/version-2.10.x/administration-stats.md b/site2/website/versioned_docs/version-2.10.x/administration-stats.md new file mode 100644 index 0000000000000..ac0c03602f36d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-stats.md @@ -0,0 +1,64 @@ +--- +id: administration-stats +title: Pulsar stats +sidebar_label: "Pulsar statistics" +original_id: administration-stats +--- + +## Partitioned topics + +|Stat|Description| +|---|---| +|msgRateIn| The sum of publish rates of all local and replication publishers in messages per second.| +|msgThroughputIn| Same as msgRateIn but in bytes per second instead of messages per second.| +|msgRateOut| The sum of dispatch rates of all local and replication consumers in messages per second.| +|msgThroughputOut| Same as msgRateOut but in bytes per second instead of messages per second.| +|averageMsgSize| Average message size, in bytes, from this publisher within the last interval.| +|storageSize| The sum of storage size of the ledgers for this topic.| +|publishers| The list of all local publishers into the topic. 
Publishers can be anywhere from zero to thousands.| +|producerId| Internal identifier for this producer on this topic.| +|producerName| Internal identifier for this producer, generated by the client library.| +|address| IP address and source port for the connection of this producer.| +|connectedSince| Timestamp this producer is created or last reconnected.| +|subscriptions| The list of all local subscriptions to the topic.| +|my-subscription| The name of this subscription (client defined).| +|msgBacklog| The count of messages in backlog for this subscription.| +|type| This subscription type.| +|msgRateExpired| The rate at which messages are discarded instead of dispatched from this subscription due to TTL.| +|consumers| The list of connected consumers for this subscription.| +|consumerName| Internal identifier for this consumer, generated by the client library.| +|availablePermits| The number of messages this consumer has space for in the listen queue of client library. A value of 0 means the queue of client library is full and receive() is not being called. A nonzero value means this consumer is ready to be dispatched messages.| +|replication| This section gives the stats for cross-colo replication of this topic.| +|replicationBacklog| The outbound replication backlog in messages.| +|connected| Whether the outbound replicator is connected.| +|replicationDelayInSeconds| How long the oldest message has been waiting to be sent through the connection, if connected is true.| +|inboundConnection| The IP and port of the broker in the publisher connection of remote cluster to this broker. | +|inboundConnectedSince| The TCP connection being used to publish messages to the remote cluster. 
If no local publishers are connected, this connection is automatically closed after a minute.| + + +## Topics + +|Stat|Description| +|---|---| +|entriesAddedCounter| Messages published since this broker loaded this topic.| +|numberOfEntries| Total number of messages being tracked.| +|totalSize| Total storage size in bytes of all messages.| +|currentLedgerEntries| Count of messages written to the ledger currently open for writing.| +|currentLedgerSize| Size in bytes of messages written to the ledger currently open for writing.| +|lastLedgerCreatedTimestamp| Time when the last ledger was created.| +|lastLedgerCreationFailureTimestamp| Time when the last ledger creation failed.| +|waitingCursorsCount| How many cursors are caught up and waiting for a new message to be published.| +|pendingAddEntriesCount| How many messages have (asynchronous) write requests that are pending completion.| +|lastConfirmedEntry| The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger is opened or is being currently opened but has no entries written yet.| +|state| The state of the cursor ledger. Open means you have a cursor ledger for saving updates of the markDeletePosition.| +|ledgers| The ordered list of all ledgers for this topic holding its messages.| +|cursors| The list of all cursors on this topic. 
Every subscription you saw in the topic stats has one.| +|markDeletePosition| The ack position: the last message the subscriber acknowledges receiving.| +|readPosition| The latest position of subscriber for reading message.| +|waitingReadOp| This is true when the subscription reads the latest message that is published to the topic and waits on new messages to be published.| +|pendingReadOps| The counter for how many outstanding read requests to the BookKeepers you have in progress.| +|messagesConsumedCounter| Number of messages this cursor acks since this broker loads this topic.| +|cursorLedger| The ledger used to persistently store the current markDeletePosition.| +|cursorLedgerLastEntry| The last entryid used to persistently store the current markDeletePosition.| +|individuallyDeletedMessages| If Acks are done out of order, shows the ranges of messages Acked between the markDeletePosition and the read-position.| +|lastLedgerSwitchTimestamp| The last time the cursor ledger is rolled over.| diff --git a/site2/website/versioned_docs/version-2.10.x/administration-upgrade.md b/site2/website/versioned_docs/version-2.10.x/administration-upgrade.md new file mode 100644 index 0000000000000..72d136b6460f6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-upgrade.md @@ -0,0 +1,168 @@ +--- +id: administration-upgrade +title: Upgrade Guide +sidebar_label: "Upgrade" +original_id: administration-upgrade +--- + +## Upgrade guidelines + +Apache Pulsar is comprised of multiple components, ZooKeeper, bookies, and brokers. These components are either stateful or stateless. You do not have to upgrade ZooKeeper nodes unless you have special requirement. While you upgrade, you need to pay attention to bookies (stateful), brokers and proxies (stateless). + +The following are some guidelines on upgrading a Pulsar cluster. Read the guidelines before upgrading. + +- Backup all your configuration files before upgrading. 
+- Read the guide entirely, make a plan, and then execute the plan. When you make an upgrade plan, you need to take your specific requirements and environment into consideration. +- Pay attention to the upgrading order of components. In general, you do not need to upgrade your ZooKeeper or configuration store cluster (the global ZooKeeper cluster). You need to upgrade bookies first, and then upgrade brokers, proxies, and your clients. +- If `autorecovery` is enabled, you need to disable `autorecovery` in the upgrade process, and re-enable it after completing the process. +- Read the release notes carefully for each release. Release notes contain features and configuration changes that might impact your upgrade. +- Upgrade a small subset of nodes of each type to canary test the new version before upgrading all nodes of that type in the cluster. When you have upgraded the canary nodes, run for a while to ensure that they work correctly. +- Upgrade one data center to verify the new version before upgrading all data centers if your cluster runs in multi-cluster replicated mode. + +> Note: Currently, Apache Pulsar is compatible between versions. + +## Upgrade sequence + +To upgrade an Apache Pulsar cluster, follow the upgrade sequence. + +1. Upgrade ZooKeeper (optional) +- Canary test: test an upgraded version in one or a small set of ZooKeeper nodes. +- Rolling upgrade: roll out the upgraded version to all ZooKeeper servers incrementally, one at a time. Monitor your dashboard during the whole rolling upgrade process. +2. Upgrade bookies +- Canary test: test an upgraded version in one or a small set of bookies. +- Rolling upgrade: + - a. Disable `autorecovery` with the following command. + + ```shell + + bin/bookkeeper shell autorecovery -disable + + ``` + + + - b. Roll out the upgraded version to all bookies in the cluster after you determine that a version is safe after canary. + - c. After you upgrade all bookies, re-enable `autorecovery` with the following command. 
+ + ```shell + + bin/bookkeeper shell autorecovery -enable + + ``` + +3. Upgrade brokers +- Canary test: test an upgraded version in one or a small set of brokers. +- Rolling upgrade: rollout the upgraded version to all brokers in the cluster after you determine that a version is safe after canary. +4. Upgrade proxies +- Canary test: test an upgraded version in one or a small set of proxies. +- Rolling upgrade: rollout the upgraded version to all proxies in the cluster after you determine that a version is safe after canary. + +## Upgrade ZooKeeper (optional) +While you upgrade ZooKeeper servers, you can do canary test first, and then upgrade all ZooKeeper servers in the cluster. + +### Canary test + +You can test an upgraded version in one of ZooKeeper servers before upgrading all ZooKeeper servers in your cluster. + +To upgrade ZooKeeper server to a new version, complete the following steps: + +1. Stop a ZooKeeper server. +2. Upgrade the binary and configuration files. +3. Start the ZooKeeper server with the new binary files. +4. Use `pulsar zookeeper-shell` to connect to the newly upgraded ZooKeeper server and run a few commands to verify if it works as expected. +5. Run the ZooKeeper server for a few days, observe and make sure the ZooKeeper cluster runs well. + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic ZooKeeper node, revert the binary and configuration, and restart the ZooKeeper with the reverted binary. + +### Upgrade all ZooKeeper servers + +After canary test to upgrade one ZooKeeper in your cluster, you can upgrade all ZooKeeper servers in your cluster. + +You can upgrade all ZooKeeper servers one by one by following steps in canary test. + +## Upgrade bookies + +While you upgrade bookies, you can do canary test first, and then upgrade all bookies in the cluster. +For more details, you can read Apache BookKeeper [Upgrade guide](http://bookkeeper.apache.org/docs/latest/admin/upgrade). 
+ +### Canary test + +You can test an upgraded version in one or a small set of bookies before upgrading all bookies in your cluster. + +To upgrade a bookie to a new version, complete the following steps: + +1. Stop a bookie. +2. Upgrade the binary and configuration files. +3. Start the bookie in `ReadOnly` mode to verify if the bookie of this new version runs well for read workload. + + ```shell + + bin/pulsar bookie --readOnly + + ``` + +4. When the bookie runs successfully in `ReadOnly` mode, stop the bookie and restart it in `Write/Read` mode. + + ```shell + + bin/pulsar bookie + + ``` + +5. Observe and make sure the cluster serves both write and read traffic. + +#### Canary rollback + +If issues occur during the canary test, you can shut down the problematic bookie node. Other bookies in the cluster replace this problematic bookie node with autorecovery. + +### Upgrade all bookies + +After canary test to upgrade some bookies in your cluster, you can upgrade all bookies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, upgrade one bookie at a time. In a downtime upgrade scenario, shut down the entire cluster, upgrade each bookie, and then start the cluster. + +While you upgrade in both scenarios, the procedure is the same for each bookie. + +1. Stop the bookie. +2. Upgrade the software (either new binary or new configuration files). +3. Start the bookie. + +> **Advanced operations** +> When you upgrade a large BookKeeper cluster in a rolling upgrade scenario, upgrading one bookie at a time is slow. If you configure rack-aware or region-aware placement policy, you can upgrade bookies rack by rack or region by region, which speeds up the whole upgrade process. + +## Upgrade brokers and proxies + +The upgrade procedure for brokers and proxies is the same. Brokers and proxies are `stateless`, so upgrading the two services is easy. 
+ +### Canary test + +You can test an upgraded version in one or a small set of nodes before upgrading all nodes in your cluster. + +To upgrade to a new version, complete the following steps: + +1. Stop a broker (or proxy). +2. Upgrade the binary and configuration file. +3. Start a broker (or proxy). + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic broker (or proxy) node. Revert to the old version and restart the broker (or proxy). + +### Upgrade all brokers or proxies + +After canary test to upgrade some brokers or proxies in your cluster, you can upgrade all brokers or proxies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, you can upgrade one broker or one proxy at a time if the size of the cluster is small. If your cluster is large, you can upgrade brokers or proxies in batches. When you upgrade a batch of brokers or proxies, make sure the remaining brokers and proxies in the cluster have enough capacity to handle the traffic during upgrade. + +In a downtime upgrade scenario, shut down the entire cluster, upgrade each broker or proxy, and then start the cluster. + +While you upgrade in both scenarios, the procedure is the same for each broker or proxy. + +1. Stop the broker or proxy. +2. Upgrade the software (either new binary or new configuration files). +3. Start the broker or proxy. 
diff --git a/site2/website/versioned_docs/version-2.10.x/administration-zk-bk.md b/site2/website/versioned_docs/version-2.10.x/administration-zk-bk.md new file mode 100644 index 0000000000000..0530b258dca2c --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/administration-zk-bk.md @@ -0,0 +1,378 @@ +--- +id: administration-zk-bk +title: ZooKeeper and BookKeeper administration +sidebar_label: "ZooKeeper and BookKeeper" +original_id: administration-zk-bk +--- + +Pulsar relies on two external systems for essential tasks: + +* [ZooKeeper](https://zookeeper.apache.org/) is responsible for a wide variety of configuration-related and coordination-related tasks. +* [BookKeeper](http://bookkeeper.apache.org/) is responsible for [persistent storage](concepts-architecture-overview.md#persistent-storage) of message data. + +ZooKeeper and BookKeeper are both open-source [Apache](https://www.apache.org/) projects. + +> Skip to the [How Pulsar uses ZooKeeper and BookKeeper](#how-pulsar-uses-zookeeper-and-bookkeeper) section below for a more schematic explanation of the role of these two systems in Pulsar. + + +## ZooKeeper + +Each Pulsar instance relies on two separate ZooKeeper quorums. + +* [Local ZooKeeper](#deploy-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster. +* [Configuration Store](#deploy-configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +To deploy a Pulsar instance, you need to stand up one local ZooKeeper cluster *per Pulsar cluster*. 
+ +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +On each host, you need to specify the node ID in `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you can set the `myid` value like this: + +```shell + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com` the command is `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start zookeeper + +``` + +### Deploy configuration store + +The ZooKeeper cluster configured and started up in the section above is a *local* ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. 
+ +If you deploy a [single-cluster](#single-cluster-pulsar-instance) instance, you do not need a separate cluster for the configuration store. If, however, you deploy a [multi-cluster](#multi-cluster-pulsar-instance) instance, you need to stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but run on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorum uses to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method for [local ZooKeeper](#local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper). The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 + +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions and that other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can share the same hosts used for the local ZooKeeper quorum. + +For example, you can assume a Pulsar instance with the following clusters `us-west`, `us-east`, `us-central`, `eu-central`, `ap-south`. 
Also, you can assume that each cluster has its own local ZK servers, named as follows: + +``` + +zk[1-3].${CLUSTER}.example.com + +``` + +In this scenario you want to pick the quorum participants from a few clusters and let all the others be ZK observers. For example, to form a 7-server quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This guarantees that writes to the configuration store are possible even if one of these regions is unreachable. + +The ZK configuration in all the servers looks like: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer + +``` + +Additionally, ZK observers need to have: + +```properties + +peerType=observer + +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) + +```shell + +$ bin/pulsar-daemon start configuration-store + +``` + +### ZooKeeper configuration + +In Pulsar, ZooKeeper configuration is handled by two separate configuration files in the `conf` directory of your Pulsar installation: +* The `conf/zookeeper.conf` file handles the configuration for local ZooKeeper.
+* The `conf/global_zookeeper.conf` file handles the configuration for configuration store. +See [parameters](reference-configuration.md#zookeeper) for more details. + +#### Configure batching operations +Using the batching operations reduces the remote procedure call (RPC) traffic between ZooKeeper client and servers. It also reduces the number of write transactions, because each batching operation corresponds to a single ZooKeeper transaction, containing multiple read and write operations. + +The following figure demonstrates a basic benchmark of batching read/write operations that can be requested to ZooKeeper in one second: + +![Zookeeper batching benchmark](/assets/zookeeper-batching.png) + +To enable batching operations, set the [`metadataStoreBatchingEnabled`](reference-configuration.md#broker) parameter to `true` on the broker side. + + +## BookKeeper + +BookKeeper stores all durable messages in Pulsar. BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that guarantees read consistency of independent message logs called ledgers. Individual BookKeeper servers are also called *bookies*. + +> To manage message persistence, retention, and expiry in Pulsar, refer to [cookbook](cookbooks-retention-expiry.md). + +### Hardware requirements + +Bookie hosts store message data on disk. To provide optimal performance, ensure that the bookies have a suitable hardware configuration. The following are two key dimensions of bookie hardware capacity: + +- Disk I/O capacity read/write +- Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker by default. To ensure low write latency, BookKeeper is designed to use multiple devices: + +- A **journal** to ensure durability. For sequential writes, it is critical to have fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts.
Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms. +- A **ledger storage device** stores data. Writes happen in the background, so write I/O is not a big concern. Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + +### Configure BookKeeper + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. When you configure each bookie, ensure that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for local ZooKeeper of the Pulsar cluster. + +The minimum configuration changes required in `conf/bookkeeper.conf` are as follows: + +:::note + +Set `journalDirectory` and `ledgerDirectories` carefully. It is difficult to change them later. + +::: + +```properties + +# Change to point to journal disk mount point +journalDirectory=data/bookkeeper/journal + +# Point to ledger storage disk mount point +ledgerDirectories=data/bookkeeper/ledgers + +# Point to local ZK quorum +zkServers=zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181 + +#It is recommended to set this parameter. Otherwise, BookKeeper can't start normally in certain environments (for example, Huawei Cloud). +advertisedAddress= + +``` + +To change the ZooKeeper root path that BookKeeper uses, use `zkLedgersRootPath=/MY-PREFIX/ledgers` instead of `zkServers=localhost:2181/MY-PREFIX`. + +> For more information about BookKeeper, refer to the official [BookKeeper docs](http://bookkeeper.apache.org).
+ +### Deploy BookKeeper + +BookKeeper provides [persistent message storage](concepts-architecture-overview.md#persistent-storage) for Pulsar. Each Pulsar broker has its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster. + +### Start bookies manually + +You can start a bookie in the foreground or as a background daemon. + +To start a bookie in the foreground, use the [`bookkeeper`](reference-cli-tools.md#bookkeeper) CLI tool: + +```bash + +$ bin/bookkeeper bookie + +``` + +To start a bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +You can verify whether the bookie works properly with the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell): + +```shell + +$ bin/bookkeeper shell bookiesanity + +``` + +When you use this command, you create a new ledger on the local bookie, write a few entries, read them back and finally delete the ledger. + +### Decommission bookies cleanly + +Before you decommission a bookie, you need to check your environment and meet the following requirements. + +1. Ensure the state of your cluster supports decommissioning the target bookie. Check if `EnsembleSize >= Write Quorum >= Ack Quorum` is `true` with one less bookie. + +2. Ensure the target bookie is listed after using the `listbookies` command. + +3. Ensure that no other process is ongoing (upgrade, etc.). + +And then you can decommission bookies safely. To decommission bookies, complete the following steps. + +1. Log in to the bookie node and check if there are underreplicated ledgers. The decommission command forces the replication of the underreplicated ledgers. +`$ bin/bookkeeper shell listunderreplicated` + +2. Stop the bookie by killing the bookie process. Make sure that no liveness/readiness probes are set up for the bookies to spin them back up if you deploy it in a Kubernetes environment. + +3.
Run the decommission command. + - If you have logged in to the node to be decommissioned, you do not need to provide `-bookieid`. + - If you are running the decommission command for the target bookie node from another bookie node, you should mention the target bookie ID in the arguments for `-bookieid` + `$ bin/bookkeeper shell decommissionbookie` + or + `$ bin/bookkeeper shell decommissionbookie -bookieid ` + +4. Validate that no ledgers are on the decommissioned bookie. +`$ bin/bookkeeper shell listledgers -bookieid ` + +You can run the following command to check if the bookie you have decommissioned is listed in the bookies list: + +```bash + +./bookkeeper shell listbookies -rw -h +./bookkeeper shell listbookies -ro -h + +``` + +## BookKeeper persistence policies + +In Pulsar, you can set *persistence policies* at the namespace level, which determines how BookKeeper handles persistent storage of messages. Policies determine four things: + +* The number of acks (guaranteed copies) to wait for each ledger entry. +* The number of bookies to use for a topic. +* The number of writes to make for each ledger entry. +* The throttling rate for mark-delete operations. + +### Set persistence policies + +You can set persistence policies for BookKeeper at the [namespace](reference-terminology.md#namespace) level. + +#### Pulsar-admin + +Use the [`set-persistence`](reference-pulsar-admin.md#namespaces-set-persistence) subcommand and specify a namespace as well as any policies that you want to apply. 
The available flags are: + +Flag | Description | Default +:----|:------------|:------- +`-a`, `--bookkeeper-ack-quorum` | The number of acks (guaranteed copies) to wait on for each entry | 0 +`-e`, `--bookkeeper-ensemble` | The number of [bookies](reference-terminology.md#bookie) to use for topics in the namespace | 0 +`-w`, `--bookkeeper-write-quorum` | The number of writes to make for each entry | 0 +`-r`, `--ml-mark-delete-max-rate` | Throttling rate for mark-delete operations (0 means no throttle) | 0 + +The following is an example: + +```shell + +$ pulsar-admin namespaces set-persistence my-tenant/my-ns \ + --bookkeeper-ack-quorum 3 \ + --bookkeeper-ensemble 2 + +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/setPersistence?version=@pulsar:version_number@} + +#### Java + +```java + +int bkEnsemble = 2; +int bkQuorum = 3; +int bkAckQuorum = 2; +double markDeleteRate = 0.7; +PersistencePolicies policies = + new PersistencePolicies(bkEnsemble, bkQuorum, bkAckQuorum, markDeleteRate); +admin.namespaces().setPersistence(namespace, policies); + +``` + +### List persistence policies + +You can see which persistence policy currently applies to a namespace. + +#### Pulsar-admin + +Use the [`get-persistence`](reference-pulsar-admin.md#namespaces-get-persistence) subcommand and specify the namespace.
+ +The following is an example: + +```shell + +$ pulsar-admin namespaces get-persistence my-tenant/my-ns +{ + "bookkeeperEnsemble": 1, + "bookkeeperWriteQuorum": 1, + "bookkeeperAckQuorum": 1, + "managedLedgerMaxMarkDeleteRate": 0 +} + +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/getPersistence?version=@pulsar:version_number@} + +#### Java + +```java + +PersistencePolicies policies = admin.namespaces().getPersistence(namespace); + +``` + +## How Pulsar uses ZooKeeper and BookKeeper + +This diagram illustrates the role of ZooKeeper and BookKeeper in a Pulsar cluster: + +![ZooKeeper and BookKeeper](/assets/pulsar-system-architecture.png) + +Each Pulsar cluster consists of one or more message brokers. Each broker relies on an ensemble of bookies. diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-cgo.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-cgo.md new file mode 100644 index 0000000000000..feee2cac3bafb --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-cgo.md @@ -0,0 +1,581 @@ +--- +id: client-libraries-cgo +title: Pulsar CGo client +sidebar_label: "CGo(deprecated)" +original_id: client-libraries-cgo +--- + +> The CGo client has been deprecated since version 2.7.0. If possible, use the [Go client](client-libraries-go.md) instead. + +You can use Pulsar Go client to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang). + +All the methods in [producers](#producers), [consumers](#consumers), and [readers](#readers) of a Go client are thread-safe. + +Currently, the following Go clients are maintained in two repositories.
+ +| Language | Project | Maintainer | License | Description | +|----------|---------|------------|---------|-------------| +| CGo | [pulsar-client-go](https://github.com/apache/pulsar/tree/master/pulsar-client-go) | [Apache Pulsar](https://github.com/apache/pulsar) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | CGo client that depends on C++ client library | +| Go | [pulsar-client-go](https://github.com/apache/pulsar-client-go) | [Apache Pulsar](https://github.com/apache/pulsar) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | A native golang client | + +> **API docs available as well** +> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/pulsar/pulsar-client-go/pulsar). + +## Installation + +### Requirements + +Pulsar Go client library is based on the C++ client library. Follow +the instructions for [C++ library](client-libraries-cpp.md) for installing the binaries through [RPM](client-libraries-cpp.md#rpm), [Deb](client-libraries-cpp.md#deb) or [Homebrew packages](client-libraries-cpp.md#macos). + +### Install go package + +> **Compatibility Warning** +> The version number of the Go client **must match** the version number of the Pulsar C++ client library. + +You can install the `pulsar` library locally using `go get`. Note that `go get` doesn't support fetching a specific tag - it will always pull in master's version of the Go client. You'll need a C++ client library that matches master. + +```bash + +$ go get -u github.com/apache/pulsar/pulsar-client-go/pulsar + +``` + +Or you can use [dep](https://github.com/golang/dep) for managing the dependencies. 
+ +```bash + +$ dep ensure -add github.com/apache/pulsar/pulsar-client-go/pulsar@v@pulsar:version@ + +``` + +Once installed locally, you can import it into your project: + +```go + +import "github.com/apache/pulsar/pulsar-client-go/pulsar" + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look something like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). Here's an example: + +```go + +import ( + "log" + "runtime" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + OperationTimeoutSeconds: 5, + MessageListenerThreads: runtime.NumCPU(), + }) + + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } +} + +``` + +The following configurable parameters are available for Pulsar clients: + +Parameter | Description | Default +:---------|:------------|:------- +`URL` | The connection URL for the Pulsar cluster.
See [above](#connection-urls) for more info | +`IOThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker) | 1 +`OperationTimeoutSeconds` | The timeout for some Go client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries will occur until this threshold is reached, at which point the operation will fail. | 30 +`MessageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)) | 1 +`ConcurrentLookupRequests` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 5000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 5000 +`Logger` | A custom logger implementation for the client (as a function that takes a log level, file path, line number, and message). All info, warn, and error messages will be routed to this function. | `nil` +`TLSTrustCertsFilePath` | The file path for the trusted TLS certificate | +`TLSAllowInsecureConnection` | Whether the client accepts untrusted TLS certificates from the broker | `false` +`Authentication` | Configure the authentication provider. (default: no authentication). Example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | `nil` +`StatsIntervalInSeconds` | The interval (in seconds) at which client stats are published | 60 + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object.
Here's an example: + +```go + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", +}) + +if err != nil { + log.Fatalf("Could not instantiate Pulsar producer: %v", err) +} + +defer producer.Close() + +msg := pulsar.ProducerMessage{ + Payload: []byte("Hello, Pulsar"), +} + +if err := producer.Send(context.Background(), msg); err != nil { + log.Fatalf("Producer could not send message: %v", err) +} + +``` + +> **Blocking operation** +> When you create a new Pulsar producer, the operation will block (waiting on a go channel) until either a producer is successfully created or an error is thrown. + + +### Producer operations + +Pulsar Go producers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Fetches the producer's [topic](reference-terminology.md#topic)| `string` +`Name()` | Fetches the producer's name | `string` +`Send(context.Context, ProducerMessage)` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be thrown if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | `error` +`SendAndGetMsgID(context.Context, ProducerMessage)`| Sends a message. This call blocks until the message is successfully acknowledged by the Pulsar broker. | (MessageID, error) +`SendAsync(context.Context, ProducerMessage, func(ProducerMessage, error))` | Publishes a [message](#messages) to the producer's topic asynchronously. The third argument is a callback function that specifies what happens either when the message is acknowledged or an error is thrown. | +`SendAndGetMsgIDAsync(context.Context, ProducerMessage, func(MessageID, error))`| Send a message in asynchronous mode.
The callback will report back the message being published and the eventual error in publishing | +`LastSequenceID()` | Get the last sequence id that was published by this producer. This represents either the automatically assigned or custom sequence id (set on the ProducerMessage) that was published and acknowledged by the broker. | int64 +`Flush()`| Flush all the messages buffered in the client and wait until all messages have been successfully persisted. | error +`Close()` | Closes the producer and releases all resources allocated to it. If `Close()` is called then no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. | `error` +`Schema()` | | Schema + +Here's a more involved example usage of a producer: + +```go + +import ( + "context" + "fmt" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client to instantiate a producer + producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", + }) + + if err != nil { log.Fatal(err) } + + ctx := context.Background() + + // Send 10 messages synchronously and 10 messages asynchronously + for i := 0; i < 10; i++ { + // Create a message + msg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("message-%d", i)), + } + + // Attempt to send the message + if err := producer.Send(ctx, msg); err != nil { + log.Fatal(err) + } + + // Create a different message to send asynchronously + asyncMsg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("async-message-%d", i)), + } + + // Attempt to send the message asynchronously and handle the response + producer.SendAsync(ctx, asyncMsg, func(msg pulsar.ProducerMessage, err error) { + if err != nil {
log.Fatal(err) } + + fmt.Printf("the %s successfully published", string(msg.Payload)) + }) + } +} + +``` + +### Producer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer will publish messages | +`Name` | A name for the producer. If you don't explicitly assign a name, Pulsar will automatically generate a globally unique name that you can access later using the `Name()` method. If you choose to explicitly assign a name, it will need to be unique across *all* Pulsar clusters, otherwise the creation operation will throw an error. | +`Properties`| Attach a set of application defined properties to the producer. These properties will be visible in the topic stats | +`SendTimeout` | When publishing a message to a topic, the producer will wait for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). If a message is not acknowledged within the threshold set by this parameter, an error will be thrown. If you set `SendTimeout` to -1, the timeout will be set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30 seconds +`MaxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `Send` and `SendAsync` methods will fail *unless* `BlockIfQueueFull` is set to `true`. | +`MaxPendingMessagesAcrossPartitions` | Set the number of max pending messages across all the partitions.
This setting will be used to lower the max pending messages for each partition `MaxPendingMessages(int)`, if the total exceeds the configured value.| +`BlockIfQueueFull` | If set to `true`, the producer's `Send` and `SendAsync` methods will block when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `MaxPendingMessages` parameter); if set to `false` (the default), `Send` and `SendAsync` operations will fail and throw a `ProducerQueueIsFullError` when the queue is full. | `false` +`MessageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics)). This logic is applied only when no key is set on messages. The available options are: round robin (`pulsar.RoundRobinDistribution`, the default), publishing all messages to a single partition (`pulsar.UseSinglePartition`), or a custom partitioning scheme (`pulsar.CustomPartition`). | `pulsar.RoundRobinDistribution` +`HashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `pulsar.JavaStringHash` (the equivalent of `String.hashCode()` in Java), `pulsar.Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `pulsar.BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library) | `pulsar.JavaStringHash` +`CompressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), [`ZLIB`](https://zlib.net/), [`ZSTD`](https://facebook.github.io/zstd/) and [`SNAPPY`](https://google.github.io/snappy/). | No compression +`MessageRouter` | By default, Pulsar uses a round-robin routing scheme for [partitioned topics](cookbooks-partitioned.md). 
The `MessageRouter` parameter enables you to specify custom routing logic via a function that takes the Pulsar message and topic metadata as arguments and returns an integer (the index of the partition to publish to), i.e. a function signature of `func(Message, TopicMetadata) int`. | +`Batching` | Control whether automatic batching of messages is enabled for the producer. | false +`BatchingMaxPublishDelay` | Set the time period within which the messages sent will be batched (default: 1ms) if batch messages are enabled. If set to a non zero value, messages will be queued until this time interval elapses or until the `BatchingMaxMessages` threshold is reached | 1ms +`BatchingMaxMessages` | Set the maximum number of messages permitted in a batch. (default: 1000) If set to a value greater than 1, messages will be queued until this threshold is reached or batch interval has elapsed | 1000 + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. Here's a basic example that uses channels: + +```go + +msgChannel := make(chan pulsar.ConsumerMessage) + +consumerOpts := pulsar.ConsumerOptions{ + Topic: "my-topic", + SubscriptionName: "my-subscription-1", + Type: pulsar.Exclusive, + MessageChannel: msgChannel, +} + +consumer, err := client.Subscribe(consumerOpts) + +if err != nil { + log.Fatalf("Could not establish subscription: %v", err) +} + +defer consumer.Close() + +for cm := range msgChannel { + msg := cm.Message + + fmt.Printf("Message ID: %s", msg.ID()) + fmt.Printf("Message value: %s", string(msg.Payload())) + + consumer.Ack(msg) +} + +``` + +> **Blocking operation** +> When you create a new Pulsar consumer, the operation will block (on a go channel) until either a consumer is successfully created or an error is thrown.
+ + +### Consumer operations + +Pulsar Go consumers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the consumer's [topic](reference-terminology.md#topic) | `string` +`Subscription()` | Returns the consumer's subscription name | `string` +`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Throws an error if the unsubscribe operation is somehow unsuccessful. | `error` +`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)` +`Ack(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) | `error` +`AckID(MessageID)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID | `error` +`AckCumulative(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `AckCumulative` method will block until the ack has been sent to the broker. After that, the messages will *not* be redelivered to the consumer. Cumulative acking cannot be used with a [shared](concepts-messaging.md#shared) subscription type. | `error` +`AckCumulativeID(MessageID)` |Ack the reception of all the messages in the stream up to (and including) the provided message. This method will block until the acknowledgment has been sent to the broker. After that, the messages will not be re-delivered to this consumer. | error +`Nack(Message)` | Acknowledge the failure to process a single message. | `error` +`NackID(MessageID)` | Acknowledge the failure to process a single message. | `error` +`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | `error` +`RedeliverUnackedMessages()` | Redelivers *all* unacknowledged messages on the topic.
In [failover](concepts-messaging.md#failover) mode, this request is ignored if the consumer isn't active on the specified topic; in [shared](concepts-messaging.md#shared) mode, redelivered messages are distributed across all consumers connected to the topic. **Note**: this is a *non-blocking* operation that doesn't throw an error. | +`Seek(msgID MessageID)` | Reset the subscription associated with this consumer to a specific message id. The message id can either be a specific message or represent the first or last messages in the topic. | error + +#### Receive example + +Here's an example usage of a Go consumer that uses the `Receive()` method to process incoming messages: + +```go + +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client object to instantiate a consumer + consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "my-golang-topic", + SubscriptionName: "sub-1", + Type: pulsar.Exclusive, + }) + + if err != nil { log.Fatal(err) } + + defer consumer.Close() + + ctx := context.Background() + + // Listen indefinitely on the topic + for { + msg, err := consumer.Receive(ctx) + if err != nil { log.Fatal(err) } + + // Do something with the message + err = processMessage(msg) + + if err == nil { + // Message processed successfully + consumer.Ack(msg) + } else { + // Failed to process messages + consumer.Nack(msg) + } + } +} + +``` + +### Consumer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the consumer will establish a subscription and listen for messages | +`Topics` | Specify a list of topics this consumer will subscribe on. 
Either a topic, a list of topics or a topics pattern is required when subscribing | +`TopicsPattern` | Specify a regular expression to subscribe to multiple topics under the same namespace. Either a topic, a list of topics or a topics pattern is required when subscribing | +`SubscriptionName` | The subscription name for this consumer | +`Properties` | Attach a set of application defined properties to the consumer. These properties will be visible in the topic stats| +`Name` | The name of the consumer | +`AckTimeout` | Set the timeout for unacked messages | 0 +`NackRedeliveryDelay` | The delay after which to redeliver the messages that failed to be processed. Default is 1min. (See `Consumer.Nack()`) | 1 minute +`Type` | Available options are `Exclusive`, `Shared`, and `Failover` | `Exclusive` +`SubscriptionInitPos` | InitialPosition at which the cursor will be set when subscribing | Latest +`MessageChannel` | The Go channel used by the consumer. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `Receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000 +`MaxTotalReceiverQueueSizeAcrossPartitions` |Set the max total receiver queue size across partitions. This setting will be used to reduce the receiver queue size for individual partitions if the total exceeds this value | 50000 +`ReadCompacted` | If enabled, the consumer will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the consumer will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal.
| + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example: + +```go + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.LatestMessage, +}) + +``` + +> **Blocking operation** +> When you create a new Pulsar reader, the operation will block (on a Go channel) until either a reader is successfully created or an error is returned. + + +### Reader operations + +Pulsar Go readers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the reader's [topic](reference-terminology.md#topic) | `string` +`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available.
| `(Message, error)` +`HasNext()` | Check if there is any message available to read from the current position| (bool, error) +`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error` + +#### "Next" example + +Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages: + +```go + +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatalf("Could not create client: %v", err) } + + // Use the client to instantiate a reader + reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.EarliestMessage, + }) + + if err != nil { log.Fatalf("Could not create reader: %v", err) } + + defer reader.Close() + + ctx := context.Background() + + // Listen on the topic for incoming messages + for { + msg, err := reader.Next(ctx) + if err != nil { log.Fatalf("Error reading from topic: %v", err) } + + // Process the message + } +} + +``` + +In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. 
Here's an example: + +```go + +lastSavedId := // Read last saved message id from external store as byte[] + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: DeserializeMessageID(lastSavedId), +}) + +``` + +### Reader configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader will establish a subscription and listen for messages | +`Name` | The name of the reader | +`StartMessageID` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `pulsar.EarliestMessage` (the earliest available message on the topic), `pulsar.LatestMessage` (the latest available message on the topic), or a `MessageID` object for a position that isn't earliest or latest. | +`MessageChannel` | The Go channel used by the reader. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `Next`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 +`SubscriptionRolePrefix` | The subscription role prefix. | `reader` +`ReadCompacted` | If enabled, the reader will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the reader will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal.| + +## Messages + +The Pulsar Go client provides a `ProducerMessage` interface that you can use to construct messages to produce on Pulsar topics.
Here's an example message: + +```go + +msg := pulsar.ProducerMessage{ + Payload: []byte("Here is some message data"), + Key: "message-key", + Properties: map[string]string{ + "foo": "bar", + }, + EventTime: time.Now(), + ReplicationClusters: []string{"cluster1", "cluster3"}, +} + +if err := producer.Send(context.Background(), msg); err != nil { + log.Fatalf("Could not publish message due to: %v", err) +} + +``` + +The following parameters are available for `ProducerMessage` objects: + +Parameter | Description +:---------|:----------- +`Payload` | The actual data payload of the message +`Value` | `Value` and `Payload` are mutually exclusive; use `Value interface{}` for schema messages. +`Key` | The optional key associated with the message (particularly useful for things like topic compaction) +`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message +`EventTime` | The timestamp associated with the message +`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. +`SequenceID` | Set the sequence id to assign to the current message + +## TLS encryption and authentication + +In order to use [TLS encryption](security-tls-transport.md), you'll need to configure your client to do so: + + * Use `pulsar+ssl` URL type + * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker + * Configure `Authentication` option + +Here's an example: + +```go + +opts := pulsar.ClientOptions{ + URL: "pulsar+ssl://my-cluster.com:6651", + TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr", + Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"), +} + +``` + +## Schema + +This example shows how to create a producer and consumer with schema.
+ +```go + +var exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +jsonSchema := NewJsonSchema(exampleSchemaDef, nil) +// create producer +producer, err := client.CreateProducerWithSchema(ProducerOptions{ + Topic: "jsonTopic", +}, jsonSchema) +err = producer.Send(context.Background(), ProducerMessage{ + Value: &testJson{ + ID: 100, + Name: "pulsar", + }, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() +//create consumer +var s testJson +consumerJS := NewJsonSchema(exampleSchemaDef, nil) +consumer, err := client.SubscribeWithSchema(ConsumerOptions{ + Topic: "jsonTopic", + SubscriptionName: "sub-2", +}, consumerJS) +if err != nil { + log.Fatal(err) +} +msg, err := consumer.Receive(context.Background()) +if err != nil { + log.Fatal(err) +} +err = msg.GetValue(&s) +if err != nil { + log.Fatal(err) +} +fmt.Println(s.ID) // output: 100 +fmt.Println(s.Name) // output: pulsar +defer consumer.Close() + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-cpp.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-cpp.md new file mode 100644 index 0000000000000..f5b8ae3678de2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-cpp.md @@ -0,0 +1,765 @@ +--- +id: client-libraries-cpp +title: Pulsar C++ client +sidebar_label: "C++" +original_id: client-libraries-cpp +--- + +You can use Pulsar C++ client to create Pulsar producers and consumers in C++. + +All the methods in producer, consumer, and reader of a C++ client are thread-safe. + +## Supported platforms + +Pulsar C++ client is supported on **Linux** ,**MacOS** and **Windows** platforms. + +[Doxygen](http://www.doxygen.nl/)-generated API docs for the C++ client are available [here](/api/cpp). 
+ + +## Linux + +:::note + +You can choose one of the following installation methods based on your needs: Compilation, Install RPM or Install Debian. + +::: + +### Compilation + +#### System requirements + +You need to install the following components before using the C++ client: + +* [CMake](https://cmake.org/) +* [Boost](http://www.boost.org/) +* [Protocol Buffers](https://developers.google.com/protocol-buffers/) >= 3 +* [libcurl](https://curl.se/libcurl/) +* [Google Test](https://github.com/google/googletest) + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies. + +```shell + +$ apt-get install cmake libssl-dev libcurl4-openssl-dev liblog4cxx-dev \ + libprotobuf-dev protobuf-compiler libboost-all-dev google-mock libgtest-dev libjsoncpp-dev + +``` + +3. Compile and install [Google Test](https://github.com/google/googletest). + +```shell + +# libgtest-dev version is 1.18.0 or above +$ cd /usr/src/googletest +$ sudo cmake . +$ sudo make +$ sudo cp ./googlemock/libgmock.a ./googlemock/gtest/libgtest.a /usr/lib/ + +# less than 1.18.0 +$ cd /usr/src/gtest +$ sudo cmake . +$ sudo make +$ sudo cp libgtest.a /usr/lib + +$ cd /usr/src/gmock +$ sudo cmake . +$ sudo make +$ sudo cp libgmock.a /usr/lib + +``` + +4. Compile the Pulsar client library for C++ inside the Pulsar repository. + +```shell + +$ cd pulsar-client-cpp +$ cmake . +$ make + +``` + +After you install the components successfully, the files `libpulsar.so` and `libpulsar.a` are in the `lib` folder of the repository. The tools `perfProducer` and `perfConsumer` are in the `perf` directory. + +### Install Dependencies + +> Since 2.1.0 release, Pulsar ships pre-built RPM and Debian packages. You can download and install those packages directly. + +After you download and install RPM or DEB, the `libpulsar.so`, `libpulsarnossl.so`, `libpulsar.a`, and `libpulsarwithdeps.a` libraries are in your `/usr/lib` directory. 
+ +By default, they are built in code path `${PULSAR_HOME}/pulsar-client-cpp`. You can build with the command below. + + `cmake . -DBUILD_TESTS=OFF -DLINK_STATIC=ON && make pulsarShared pulsarSharedNossl pulsarStatic pulsarStaticWithDeps -j 3`. + +These libraries rely on some other libraries. If you want to get detailed version of dependencies, see [RPM](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/pkg/rpm/Dockerfile) or [DEB](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/pkg/deb/Dockerfile) files. + +1. `libpulsar.so` is a shared library, containing statically linked `boost` and `openssl`. It also dynamically links all other necessary libraries. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsar.so -I/usr/local/ssl/include + +``` + +2. `libpulsarnossl.so` is a shared library, similar to `libpulsar.so` except that the libraries `openssl` and `crypto` are dynamically linked. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsarnossl.so -lssl -lcrypto -I/usr/local/ssl/include -L/usr/local/ssl/lib + +``` + +3. `libpulsar.a` is a static library. You need to load dependencies before using this library. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsar.a -lssl -lcrypto -ldl -lpthread -I/usr/local/ssl/include -L/usr/local/ssl/lib -lboost_system -lboost_regex -lcurl -lprotobuf -lzstd -lz + +``` + +4. `libpulsarwithdeps.a` is a static library, based on `libpulsar.a`. It is archived in the dependencies of `libboost_regex`, `libboost_system`, `libcurl`, `libprotobuf`, `libzstd` and `libz`. You can use this Pulsar library with the command below. 
+ +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsarwithdeps.a -lssl -lcrypto -ldl -lpthread -I/usr/local/ssl/include -L/usr/local/ssl/lib + +``` + +The `libpulsarwithdeps.a` does not include library openssl related libraries `libssl` and `libcrypto`, because these two libraries are related to security. It is more reasonable and easier to use the versions provided by the local system to handle security issues and upgrade libraries. + +### Install RPM + +1. Download a RPM package from the links in the table. + +| Link | Crypto files | +|------|--------------| +| [client](@pulsar:dist_rpm:client@) | [asc](@pulsar:dist_rpm:client@.asc), [sha512](@pulsar:dist_rpm:client@.sha512) | +| [client-debuginfo](@pulsar:dist_rpm:client-debuginfo@) | [asc](@pulsar:dist_rpm:client-debuginfo@.asc), [sha512](@pulsar:dist_rpm:client-debuginfo@.sha512) | +| [client-devel](@pulsar:dist_rpm:client-devel@) | [asc](@pulsar:dist_rpm:client-devel@.asc), [sha512](@pulsar:dist_rpm:client-devel@.sha512) | + +2. Install the package using the following command. + +```bash + +$ rpm -ivh apache-pulsar-client*.rpm + +``` + +After you install RPM successfully, Pulsar libraries are in the `/usr/lib` directory, for example: + +```bash + +lrwxrwxrwx 1 root root 18 Dec 30 22:21 libpulsar.so -> libpulsar.so.2.9.1 +lrwxrwxrwx 1 root root 23 Dec 30 22:21 libpulsarnossl.so -> libpulsarnossl.so.2.9.1 + +``` + +:::note + +If you get the error that `libpulsar.so: cannot open shared object file: No such file or directory` when starting Pulsar client, you may need to run `ldconfig` first. + +::: + +2. Install the GCC and g++ using the following command, otherwise errors would occur in installing Node.js. + +```bash + +$ sudo yum -y install gcc automake autoconf libtool make +$ sudo yum -y install gcc-c++ + +``` + +### Install Debian + +1. Download a Debian package from the links in the table. 
+ +| Link | Crypto files | +|------|--------------| +| [client](@pulsar:deb:client@) | [asc](@pulsar:dist_deb:client@.asc), [sha512](@pulsar:dist_deb:client@.sha512) | +| [client-devel](@pulsar:deb:client-devel@) | [asc](@pulsar:dist_deb:client-devel@.asc), [sha512](@pulsar:dist_deb:client-devel@.sha512) | + +2. Install the package using the following command. + +```bash + +$ apt install ./apache-pulsar-client*.deb + +``` + +After you install DEB successfully, Pulsar libraries are in the `/usr/lib` directory. + +### Build + +> If you want to build RPM and Debian packages from the latest master, follow the instructions below. You should run all the instructions at the root directory of your cloned Pulsar repository. + +There are recipes that build RPM and Debian packages containing a +statically linked `libpulsar.so` / `libpulsarnossl.so` / `libpulsar.a` / `libpulsarwithdeps.a` with all required dependencies. + +To build the C++ library packages, you need to build the Java packages first. + +```shell + +mvn install -DskipTests + +``` + +#### RPM + +To build the RPM inside a Docker container, use the command below. The RPMs are in the `pulsar-client-cpp/pkg/rpm/RPMS/x86_64/` path. + +```shell + +pulsar-client-cpp/pkg/rpm/docker-build-rpm.sh + +``` + +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` and `libpulsarnossl.so` | +| pulsar-client-devel | Static library `libpulsar.a`, `libpulsarwithdeps.a`and C++ and C headers | +| pulsar-client-debuginfo | Debug symbols for `libpulsar.so` | + +#### Debian + +To build Debian packages, enter the following command. + +```shell + +pulsar-client-cpp/pkg/deb/docker-build-deb.sh + +``` + +Debian packages are created in the `pulsar-client-cpp/pkg/deb/BUILD/DEB/` path. 
+ +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` and `libpulsarnossl.so` | +| pulsar-client-dev | Static library `libpulsar.a`, `libpulsarwithdeps.a` and C++ and C headers | + +## MacOS + +### Compilation + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies. + +```shell + +# OpenSSL installation +$ brew install openssl +$ export OPENSSL_INCLUDE_DIR=/usr/local/opt/openssl/include/ +$ export OPENSSL_ROOT_DIR=/usr/local/opt/openssl/ + +# Protocol Buffers installation +$ brew install protobuf boost boost-python log4cxx +# If you are using python3, you need to install boost-python3 + +# Google Test installation +$ git clone https://github.com/google/googletest.git +$ cd googletest +$ git checkout release-1.12.1 +$ cmake . +$ make install + +``` + +3. Compile the Pulsar client library in the repository that you cloned. + +```shell + +$ cd pulsar-client-cpp +$ cmake . +$ make + +``` + +### Install `libpulsar` + +Pulsar releases are available in the [Homebrew](https://brew.sh/) core repository. You can install the C++ client library with the following command. The package is installed with the library and headers. + +```shell + +brew install libpulsar + +``` + +## Windows (64-bit) + +### Compilation + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies. + +```shell + +cd ${PULSAR_HOME}/pulsar-client-cpp +vcpkg install --feature-flags=manifests --triplet x64-windows + +``` + +3. Build C++ libraries. + +```shell + +cmake -B ./build -A x64 -DBUILD_PYTHON_WRAPPER=OFF -DBUILD_TESTS=OFF -DVCPKG_TRIPLET=x64-windows -DCMAKE_BUILD_TYPE=Release -S . +cmake --build ./build --config Release + +``` + +> **NOTE** +> +> 1. For Windows 32-bit, you need to use `-A Win32` and `-DVCPKG_TRIPLET=x86-windows`. +> 2. 
For MSVC Debug mode, you need to replace `Release` with `Debug` for both `CMAKE_BUILD_TYPE` variable and `--config` option. + +4. Client libraries are available in the following places. + +``` + +${PULSAR_HOME}/pulsar-client-cpp/build/lib/Release/pulsar.lib +${PULSAR_HOME}/pulsar-client-cpp/build/lib/Release/pulsar.dll + +``` + +## Connection URLs + +To connect Pulsar using client libraries, you need to specify a Pulsar protocol URL. + +Pulsar protocol URLs are assigned to specific clusters, you can use the Pulsar URI scheme. The default port is `6650`. The following is an example for localhost. + +```http + +pulsar://localhost:6650 + +``` + +In a Pulsar cluster in production, the URL looks as follows. + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you use TLS authentication, you need to add `ssl`, and the default port is `6651`. The following is an example. + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a producer + +To use Pulsar as a producer, you need to create a producer on the C++ client. There are two main ways of using a producer: +- [Blocking style](#simple-blocking-example) : each call to `send` waits for an ack from the broker. +- [Non-blocking asynchronous style](#non-blocking-example) : `sendAsync` is called instead of `send` and a callback is supplied for when the ack is received from the broker. + +### Simple blocking example + +This example sends 100 messages using the blocking style. While simple, it does not produce high throughput as it waits for each ack to come back before sending the next message. 
+ +```c++ + +#include <pulsar/Client.h> +#include <thread> + +using namespace pulsar; + +int main() { + Client client("pulsar://localhost:6650"); + + Producer producer; + Result result = client.createProducer("persistent://public/default/my-topic", producer); + if (result != ResultOk) { + std::cout << "Error creating producer: " << result << std::endl; + return -1; + } + + // Send 100 messages synchronously + int ctr = 0; + while (ctr < 100) { + std::string content = "msg" + std::to_string(ctr); + Message msg = MessageBuilder().setContent(content).setProperty("x", "1").build(); + Result result = producer.send(msg); + if (result != ResultOk) { + std::cout << "The message " << content << " could not be sent, received code: " << result << std::endl; + } else { + std::cout << "The message " << content << " sent successfully" << std::endl; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ctr++; + } + + std::cout << "Finished producing synchronously!" << std::endl; + + client.close(); + return 0; +} + +``` + +### Non-blocking example + +This example sends 100 messages using the non-blocking style calling `sendAsync` instead of `send`. This allows the producer to have multiple messages in flight at a time, which increases throughput. + +The producer configuration `blockIfQueueFull` is useful here to avoid `ResultProducerQueueIsFull` errors when the internal queue for outgoing send requests becomes full. Once the internal queue is full, `sendAsync` becomes blocking which can make your code simpler. + +Without this configuration, the result code `ResultProducerQueueIsFull` is passed to the callback. You must decide how to deal with that (retry, discard, etc.).
+ +```c++ + +#include +#include + +using namespace pulsar; + +std::atomic acksReceived; + +void callback(Result code, const MessageId& msgId, std::string msgContent) { + // message processing logic here + std::cout << "Received ack for msg: " << msgContent << " with code: " + << code << " -- MsgID: " << msgId << std::endl; + acksReceived++; +} + +int main() { + Client client("pulsar://localhost:6650"); + + ProducerConfiguration producerConf; + producerConf.setBlockIfQueueFull(true); + Producer producer; + Result result = client.createProducer("persistent://public/default/my-topic", + producerConf, producer); + if (result != ResultOk) { + std::cout << "Error creating producer: " << result << std::endl; + return -1; + } + + // Send 100 messages asynchronously + int ctr = 0; + while (ctr < 100) { + std::string content = "msg" + std::to_string(ctr); + Message msg = MessageBuilder().setContent(content).setProperty("x", "1").build(); + producer.sendAsync(msg, std::bind(callback, + std::placeholders::_1, std::placeholders::_2, content)); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ctr++; + } + + // wait for 100 messages to be acked + while (acksReceived < 100) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + std::cout << "Finished producing asynchronously!" << std::endl; + + client.close(); + return 0; +} + +``` + +### Partitioned topics and lazy producers + +When scaling out a Pulsar topic, you may configure a topic to have hundreds of partitions. Likewise, you may have also scaled out your producers so there are hundreds or even thousands of producers. This can put some strain on the Pulsar brokers as when you create a producer on a partitioned topic, internally it creates one internal producer per partition which involves communications to the brokers for each one. 
So for a topic with 1000 partitions and 1000 producers, it ends up creating 1,000,000 internal producers across the producer applications, each of which has to communicate with a broker to find out which broker it should connect to and then perform the connection handshake. + +You can reduce the load caused by this combination of a large number of partitions and many producers by doing the following: +- use SinglePartition partition routing mode (this ensures that all messages are only sent to a single, randomly selected partition) +- use non-keyed messages (when messages are keyed, routing is based on the hash of the key and so messages will end up being sent to multiple partitions) +- use lazy producers (this ensures that an internal producer is only created on demand when a message needs to be routed to a partition) + +With our example above, that reduces the number of internal producers spread out over the 1000 producer apps from 1,000,000 to just 1000. + +Note that there can be extra latency for the first message sent. If you set a low send timeout, this timeout could be reached if the initial connection handshake is slow to complete. + +```c++ + +ProducerConfiguration producerConf; +producerConf.setPartitionsRoutingMode(ProducerConfiguration::UseSinglePartition); +producerConf.setLazyStartPartitionedProducers(true); + +``` + +### Enable chunking + +Message [chunking](concepts-messaging.md#chunking) enables Pulsar to process large payload messages by splitting the message into chunks at the producer side and aggregating chunked messages at the consumer side. + +The message chunking feature is OFF by default. The following is an example about how to enable message chunking when creating a producer. 
+ +```c++ + +ProducerConfiguration conf; +conf.setBatchingEnabled(false); +conf.setChunkingEnabled(true); +Producer producer; +client.createProducer("my-topic", conf, producer); + +``` + +> **Note:** To enable chunking, you need to disable batching (`setBatchingEnabled`=`false`) concurrently. + +## Create a consumer + +To use Pulsar as a consumer, you need to create a consumer on the C++ client. There are two main ways of using the consumer: +- [Blocking style](#blocking-example): synchronously calling `receive(msg)`. +- [Non-blocking](#consumer-with-a-message-listener) (event based) style: using a message listener. + +### Blocking example + +The benefit of this approach is that it is the simplest code. Simply keeps calling `receive(msg)` which blocks until a message is received. + +This example starts a subscription at the earliest offset and consumes 100 messages. + +```c++ + +#include + +using namespace pulsar; + +int main() { + Client client("pulsar://localhost:6650"); + + Consumer consumer; + ConsumerConfiguration config; + config.setSubscriptionInitialPosition(InitialPositionEarliest); + Result result = client.subscribe("persistent://public/default/my-topic", "consumer-1", config, consumer); + if (result != ResultOk) { + std::cout << "Failed to subscribe: " << result << std::endl; + return -1; + } + + Message msg; + int ctr = 0; + // consume 100 messages + while (ctr < 100) { + consumer.receive(msg); + std::cout << "Received: " << msg + << " with payload '" << msg.getDataAsString() << "'" << std::endl; + + consumer.acknowledge(msg); + ctr++; + } + + std::cout << "Finished consuming synchronously!" << std::endl; + + client.close(); + return 0; +} + +``` + +### Consumer with a message listener + +You can avoid running a loop with blocking calls with an event based style by using a message listener which is invoked for each message that is received. + +This example starts a subscription at the earliest offset and consumes 100 messages. 
+ +```c++ + +#include +#include +#include + +using namespace pulsar; + +std::atomic messagesReceived; + +void handleAckComplete(Result res) { + std::cout << "Ack res: " << res << std::endl; +} + +void listener(Consumer consumer, const Message& msg) { + std::cout << "Got message " << msg << " with content '" << msg.getDataAsString() << "'" << std::endl; + messagesReceived++; + consumer.acknowledgeAsync(msg.getMessageId(), handleAckComplete); +} + +int main() { + Client client("pulsar://localhost:6650"); + + Consumer consumer; + ConsumerConfiguration config; + config.setMessageListener(listener); + config.setSubscriptionInitialPosition(InitialPositionEarliest); + Result result = client.subscribe("persistent://public/default/my-topic", "consumer-1", config, consumer); + if (result != ResultOk) { + std::cout << "Failed to subscribe: " << result << std::endl; + return -1; + } + + // wait for 100 messages to be consumed + while (messagesReceived < 100) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + std::cout << "Finished consuming asynchronously!" << std::endl; + + client.close(); + return 0; +} + +``` + +### Configure chunking + +You can limit the maximum number of chunked messages a consumer maintains concurrently by configuring the `setMaxPendingChunkedMessage` and `setAutoAckOldestChunkedMessageOnQueueFull` parameters. When the threshold is reached, the consumer drops pending messages by silently acknowledging them or asking the broker to redeliver them later. + +The following is an example of how to configure message chunking. + +```c++ + +ConsumerConfiguration conf; +conf.setAutoAckOldestChunkedMessageOnQueueFull(true); +conf.setMaxPendingChunkedMessage(100); +Consumer consumer; +client.subscribe("my-topic", "my-sub", conf, consumer); + +``` + +## Enable authentication in connection URLs +If you use TLS authentication when connecting to Pulsar, you need to add `ssl` in the connection URLs, and the default port is `6651`. 
The following is an example. + +```cpp + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/cacert.pem"); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create( + "/path/to/client-cert.pem", "/path/to/client-key.pem")); + +Client client("pulsar+ssl://my-broker.com:6651", config); + +``` + +For complete examples, refer to [C++ client examples](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/examples). + +## Schema + +This section provides some schema examples. For more information about +schema, see [Pulsar schema](schema-get-started.md). + +### Avro schema + +- The following example shows how to create a producer with an Avro schema. + + ```cpp + + static const std::string exampleSchema = + "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + "\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"}]}"; + Producer producer; + ProducerConfiguration producerConf; + producerConf.setSchema(SchemaInfo(AVRO, "Avro", exampleSchema)); + client.createProducer("topic-avro", producerConf, producer); + + ``` + +- The following example shows how to create a consumer with an Avro schema. + + ```cpp + + static const std::string exampleSchema = + "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + "\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"}]}"; + ConsumerConfiguration consumerConf; + Consumer consumer; + consumerConf.setSchema(SchemaInfo(AVRO, "Avro", exampleSchema)); + client.subscribe("topic-avro", "sub-2", consumerConf, consumer); + + ``` + +### ProtobufNative schema + +The following example shows how to create a producer and a consumer with a ProtobufNative schema. +​ +1. Generate the `User` class using Protobuf3. + + :::note + + You need to use Protobuf3 or later versions.
+ + ::: + +​ + + ```protobuf + + syntax = "proto3"; + + message User { + string name = 1; + int32 age = 2; + } + + ``` + +​ +2. Include the `ProtobufNativeSchema.h` in your source code. Ensure the Protobuf dependency has been added to your project. +​ + + ```c++ + + #include + + ``` + +​ +3. Create a producer to send a `User` instance. +​ + + ```c++ + + ProducerConfiguration producerConf; + producerConf.setSchema(createProtobufNativeSchema(User::GetDescriptor())); + Producer producer; + client.createProducer("topic-protobuf", producerConf, producer); + User user; + user.set_name("my-name"); + user.set_age(10); + std::string content; + user.SerializeToString(&content); + producer.send(MessageBuilder().setContent(content).build()); + + ``` + +​ +4. Create a consumer to receive a `User` instance. +​ + + ```c++ + + ConsumerConfiguration consumerConf; + consumerConf.setSchema(createProtobufNativeSchema(User::GetDescriptor())); + consumerConf.setSubscriptionInitialPosition(InitialPositionEarliest); + Consumer consumer; + client.subscribe("topic-protobuf", "my-sub", consumerConf, consumer); + Message msg; + consumer.receive(msg); + User user2; + user2.ParseFromArray(msg.getData(), msg.getLength()); + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-dotnet.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-dotnet.md new file mode 100644 index 0000000000000..52b6200c478af --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-dotnet.md @@ -0,0 +1,456 @@ +--- +id: client-libraries-dotnet +title: Pulsar C# client +sidebar_label: "C#" +original_id: client-libraries-dotnet +--- + +You can use the Pulsar C# client (DotPulsar) to create Pulsar producers and consumers in C#. All the methods in the producer, consumer, and reader of a C# client are thread-safe. The official documentation for DotPulsar is available [here](https://github.com/apache/pulsar-dotpulsar/wiki). 
+ +## Installation + +You can install the Pulsar C# client library either through the dotnet CLI or through Visual Studio. This section describes how to install the Pulsar C# client library through the dotnet CLI. For information about how to install the Pulsar C# client library through Visual Studio, see [here](https://docs.microsoft.com/en-us/visualstudio/mac/nuget-walkthrough?view=vsmac-2019). + +### Prerequisites + +Install the [.NET Core SDK](https://dotnet.microsoft.com/download/), which provides the dotnet command-line tool. Starting in Visual Studio 2017, the dotnet CLI is automatically installed with any .NET Core related workloads. + +### Procedures + +To install the Pulsar C# client library, follow these steps: + +1. Create a project. + + 1. Create a folder for the project. + + 2. Open a terminal window and switch to the new folder. + + 3. Create the project using the following command. + + ``` + + dotnet new console + + ``` + + 4. Use `dotnet run` to test that the app has been created properly. + +2. Add the DotPulsar NuGet package. + + 1. Use the following command to install the `DotPulsar` package. + + ``` + + dotnet add package DotPulsar + + ``` + + 2. After the command completes, open the `.csproj` file to see the added reference. + + ```xml + + + + + + ``` + +## Client + +This section describes some configuration examples for the Pulsar C# client. + +### Create client + +This example shows how to create a Pulsar C# client connected to localhost. + +```c# + +using DotPulsar; + +var client = PulsarClient.Builder().Build(); + +``` + +To create a Pulsar C# client by using the builder, you can specify the following options. + +| Option | Description | Default | +| ---- | ---- | ---- | +| ServiceUrl | Set the service URL for the Pulsar cluster. | pulsar://localhost:6650 | +| RetryInterval | Set the time to wait before retrying an operation or a reconnection. | 3s | + +### Create producer + +This section describes how to create a producer. 
+ +- Create a producer by using the builder. + + ```c# + + using DotPulsar; + using DotPulsar.Extensions; + + var producer = client.NewProducer() + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a producer without using the builder. + + ```c# + + using DotPulsar; + + var options = new ProducerOptions("persistent://public/default/mytopic", Schema.ByteArray); + var producer = client.CreateProducer(options); + + ``` + +### Create consumer + +This section describes how to create a consumer. + +- Create a consumer by using the builder. + + ```c# + + using DotPulsar; + using DotPulsar.Extensions; + + var consumer = client.NewConsumer() + .SubscriptionName("MySubscription") + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a consumer without using the builder. + + ```c# + + using DotPulsar; + + var options = new ConsumerOptions("MySubscription", "persistent://public/default/mytopic", Schema.ByteArray); + var consumer = client.CreateConsumer(options); + + ``` + +### Create reader + +This section describes how to create a reader. + +- Create a reader by using the builder. + + ```c# + + using DotPulsar; + using DotPulsar.Extensions; + + var reader = client.NewReader() + .StartMessageId(MessageId.Earliest) + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a reader without using the builder. + + ```c# + + using DotPulsar; + + var options = new ReaderOptions(MessageId.Earliest, "persistent://public/default/mytopic", Schema.ByteArray); + var reader = client.CreateReader(options); + + ``` + +### Configure encryption policies + +The Pulsar C# client supports four kinds of encryption policies: + +- `EnforceUnencrypted`: always use unencrypted connections. +- `EnforceEncrypted`: always use encrypted connections. +- `PreferUnencrypted`: use unencrypted connections, if possible. +- `PreferEncrypted`: use encrypted connections, if possible. 
+ +This example shows how to set the `EnforceEncrypted` encryption policy. + +```c# + +using DotPulsar; + +var client = PulsarClient.Builder() + .ConnectionSecurity(EncryptionPolicy.EnforceEncrypted) + .Build(); + +``` + +### Configure authentication + +Currently, the Pulsar C# client supports the TLS (Transport Layer Security) and JWT (JSON Web Token) authentication. + +If you have followed [Authentication using TLS](security-tls-authentication.md), you get a certificate and a key. To use them from the Pulsar C# client, follow these steps: + +1. Create an unencrypted and password-less pfx file. + + ```bash + + openssl pkcs12 -export -keypbe NONE -certpbe NONE -out admin.pfx -inkey admin.key.pem -in admin.cert.pem -passout pass: + + ``` + +2. Use the admin.pfx file to create an X509Certificate2 and pass it to the Pulsar C# client. + + ```c# + + using System.Security.Cryptography.X509Certificates; + using DotPulsar; + + var clientCertificate = new X509Certificate2("admin.pfx"); + var client = PulsarClient.Builder() + .AuthenticateUsingClientCertificate(clientCertificate) + .Build(); + + ``` + +## Producer + +A producer is a process that attaches to a topic and publishes messages to a Pulsar broker for processing. This section describes some configuration examples about the producer. + +### Send data + +This example shows how to send data. + +```c# + +var data = Encoding.UTF8.GetBytes("Hello World"); +await producer.Send(data); + +``` + +### Send messages with customized metadata + +- Send messages with customized metadata by using the builder. + + ```c# + + var messageId = await producer.NewMessage() + .Property("SomeKey", "SomeValue") + .Send(data); + + ``` + +- Send messages with customized metadata without using the builder. 
+ + ```c# + + var data = Encoding.UTF8.GetBytes("Hello World"); + var metadata = new MessageMetadata(); + metadata["SomeKey"] = "SomeValue"; + var messageId = await producer.Send(metadata, data); + + ``` + +## Consumer + +A consumer is a process that attaches to a topic through a subscription and then receives messages. This section describes some configuration examples about the consumer. + +### Receive messages + +This example shows how a consumer receives messages from a topic. + +```c# + +await foreach (var message in consumer.Messages()) +{ + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); +} + +``` + +### Acknowledge messages + +Messages can be acknowledged individually or cumulatively. For details about message acknowledgement, see [acknowledgement](concepts-messaging.md#acknowledgement). + +- Acknowledge messages individually. + + ```c# + + await consumer.Acknowledge(message); + + ``` + +- Acknowledge messages cumulatively. + + ```c# + + await consumer.AcknowledgeCumulative(message); + + ``` + +### Unsubscribe from topics + +This example shows how a consumer unsubscribes from a topic. + +```c# + +await consumer.Unsubscribe(); + +``` + +#### Note + +> A consumer cannot be used and is disposed once the consumer unsubscribes from a topic. + +## Reader + +A reader is actually just a consumer without a cursor. This means that Pulsar does not keep track of your progress and there is no need to acknowledge messages. + +This example shows how a reader receives messages. + +```c# + +await foreach (var message in reader.Messages()) +{ + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); +} + +``` + +## Monitoring + +This section describes how to monitor the producer, consumer, and reader state. + +### Monitor producer + +The following table lists states available for the producer. + +| State | Description | +| ---- | ----| +| Closed | The producer or the Pulsar client has been disposed. 
| +| Connected | All is well. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | +| PartiallyConnected | Some of the sub-producers are disconnected. | + +This example shows how to monitor the producer state. + +```c# + +private static async ValueTask Monitor(IProducer producer, CancellationToken cancellationToken) +{ + var state = ProducerState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = (await producer.StateChangedFrom(state, cancellationToken)).ProducerState; + + var stateMessage = state switch + { + ProducerState.Connected => $"The producer is connected", + ProducerState.Disconnected => $"The producer is disconnected", + ProducerState.Closed => $"The producer has closed", + ProducerState.Faulted => $"The producer has faulted", + ProducerState.PartiallyConnected => $"The producer is partially connected.", + _ => $"The producer has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (producer.IsFinalState(state)) + return; + } +} + +``` + +### Monitor consumer state + +The following table lists states available for the consumer. + +| State | Description | +| ---- | ----| +| Active | All is well. | +| Inactive | All is well. The subscription type is `Failover` and you are not the active consumer. | +| Closed | The consumer or the Pulsar client has been disposed. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | +| ReachedEndOfTopic | No more messages are delivered. | +| Unsubscribed | The consumer has unsubscribed. | + +This example shows how to monitor the consumer state. 
+ +```c# + +private static async ValueTask Monitor(IConsumer consumer, CancellationToken cancellationToken) +{ + var state = ConsumerState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = (await consumer.StateChangedFrom(state, cancellationToken)).ConsumerState; + + var stateMessage = state switch + { + ConsumerState.Active => "The consumer is active", + ConsumerState.Inactive => "The consumer is inactive", + ConsumerState.Disconnected => "The consumer is disconnected", + ConsumerState.Closed => "The consumer has closed", + ConsumerState.ReachedEndOfTopic => "The consumer has reached end of topic", + ConsumerState.Faulted => "The consumer has faulted", + ConsumerState.Unsubscribed => "The consumer is unsubscribed.", + _ => $"The consumer has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (consumer.IsFinalState(state)) + return; + } +} + +``` + +### Monitor reader state + +The following table lists states available for the reader. + +| State | Description | +| ---- | ----| +| Closed | The reader or the Pulsar client has been disposed. | +| Connected | All is well. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | +| ReachedEndOfTopic | No more messages are delivered. | + +This example shows how to monitor the reader state. 
+ +```c# + +private static async ValueTask Monitor(IReader reader, CancellationToken cancellationToken) +{ + var state = ReaderState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = (await reader.StateChangedFrom(state, cancellationToken)).ReaderState; + + var stateMessage = state switch + { + ReaderState.Connected => "The reader is connected", + ReaderState.Disconnected => "The reader is disconnected", + ReaderState.Closed => "The reader has closed", + ReaderState.ReachedEndOfTopic => "The reader has reached end of topic", + ReaderState.Faulted => "The reader has faulted", + _ => $"The reader has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (reader.IsFinalState(state)) + return; + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-go.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-go.md new file mode 100644 index 0000000000000..d2f5dd5a13d0d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-go.md @@ -0,0 +1,1064 @@ +--- +id: client-libraries-go +title: Pulsar Go client +sidebar_label: "Go" +original_id: client-libraries-go +--- + +> Tips: The CGo client has been deprecated since version 2.7.0. + +You can use Pulsar [Go client](https://github.com/apache/pulsar-client-go) to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang). + +> **API docs available as well** +> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/pulsar-client-go/pulsar). + + +## Installation + +### Install go package + +You can get the `pulsar` library by using `go get` or use it with `go module`. 
+ +Download the library of Go client to local environment: + +```bash + +$ go get -u "github.com/apache/pulsar-client-go/pulsar" + +``` + +Once installed locally, you can import it into your project: + +```go + +import "github.com/apache/pulsar-client-go/pulsar" + +``` + +Use with go module: + +```bash + +$ mkdir test_dir && cd test_dir + +``` + +Write a sample script in the `test_dir` directory (such as `test_example.go`) and write `package main` at the beginning of the file. + +```bash + +$ go mod init test_dir +$ go mod tidy && go mod download +$ go build test_example.go +$ ./test_example + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +If you have multiple brokers, you can set the URL as below. + +``` + +pulsar://localhost:6650,localhost:6651,localhost:6652 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look something like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). 
Here's an example: + +```go + +import ( + "log" + "time" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + OperationTimeout: 30 * time.Second, + ConnectionTimeout: 30 * time.Second, + }) + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } + + defer client.Close() +} + +``` + +If you have multiple brokers, you can initiate a client object as below. + +```go + +import ( + "log" + "time" + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650,localhost:6651,localhost:6652", + OperationTimeout: 30 * time.Second, + ConnectionTimeout: 30 * time.Second, + }) + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } + + defer client.Close() +} + +``` + +The following configurable parameters are available for Pulsar clients: + + Name | Description | Default +| :-------- | :---------- |:---------- | +| URL | Configure the service URL for the Pulsar service.

If you have multiple brokers, you can set multiple Pulsar cluster addresses for a client.

This parameter is **required**. |None | +| ConnectionTimeout | Timeout for the establishment of a TCP connection | 30s | +| OperationTimeout| Set the operation timeout. Producer-create, subscribe and unsubscribe operations will be retried until this interval, after which the operation will be marked as failed| 30s| +| Authentication | Configure the authentication provider. Example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | no authentication | +| TLSTrustCertsFilePath | Set the path to the trusted TLS certificate file | | +| TLSAllowInsecureConnection | Configure whether the Pulsar client accept untrusted TLS certificate from broker | false | +| TLSValidateHostname | Configure whether the Pulsar client verify the validity of the host name from broker | false | +| ListenerName | Configure the net model for VPC users to connect to the Pulsar broker | | +| MaxConnectionsPerBroker | Max number of connections to a single broker that is kept in the pool | 1 | +| CustomMetricsLabels | Add custom labels to all the metrics reported by this client instance | | +| Logger | Configure the logger used by the client | logrus.StandardLogger | + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object. 
Here's an example: + +```go + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", +}) + +if err != nil { + log.Fatal(err) +} + +_, err = producer.Send(context.Background(), &pulsar.ProducerMessage{ + Payload: []byte("hello"), +}) + +defer producer.Close() + +if err != nil { + fmt.Println("Failed to publish message", err) +} +fmt.Println("Published message") + +``` + +### Producer operations + +Pulsar Go producers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Fetches the producer's [topic](reference-terminology.md#topic)| `string` +`Name()` | Fetches the producer's name | `string` +`Send(context.Context, *ProducerMessage)` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be thrown if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | (MessageID, error) +`SendAsync(context.Context, *ProducerMessage, func(MessageID, *ProducerMessage, error))`| Send a message, this call will be blocking until is successfully acknowledged by the Pulsar broker. | +`LastSequenceID()` | Get the last sequence id that was published by this producer. This represents either the automatically assigned or custom sequence id (set on the ProducerMessage) that was published and acknowledged by the broker. | int64 +`Flush()`| Flush all the messages buffered in the client and wait until all messages have been successfully persisted. | error +`Close()` | Closes the producer and releases all resources allocated to it. If `Close()` is called then no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. 
| + +### Producer Example + +#### How to use message router in producer + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: serviceURL, +}) + +if err != nil { + log.Fatal(err) +} +defer client.Close() + +// Only subscribe on the specific partition +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "my-partitioned-topic-partition-2", + SubscriptionName: "my-sub", +}) + +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-partitioned-topic", + MessageRouter: func(msg *ProducerMessage, tm TopicMetadata) int { + fmt.Println("Routing message ", msg, " -- Partitions: ", tm.NumPartitions()) + return 2 + }, +}) + +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +``` + +#### How to use schema interface in producer + +```go + +type testJSON struct { + ID int `json:"id"` + Name string `json:"name"` +} + +``` + +```go + +var ( + exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +) + +``` + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +properties := make(map[string]string) +properties["pulsar"] = "hello" +jsonSchemaWithProperties := NewJSONSchema(exampleSchemaDef, properties) +producer, err := client.CreateProducer(ProducerOptions{ + Topic: "jsonTopic", + Schema: jsonSchemaWithProperties, +}) +assert.Nil(t, err) + +_, err = producer.Send(context.Background(), &ProducerMessage{ + Value: &testJSON{ + ID: 100, + Name: "pulsar", + }, +}) +if err != nil { + log.Fatal(err) +} +producer.Close() + +``` + +#### How to use delay relative in producer + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topicName := 
newTopicName() +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topicName, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: topicName, + SubscriptionName: "subName", + Type: Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +ID, err := producer.Send(context.Background(), &pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("test")), + DeliverAfter: 3 * time.Second, +}) +if err != nil { + log.Fatal(err) +} +fmt.Println(ID) + +ctx, canc := context.WithTimeout(context.Background(), 1*time.Second) +msg, err := consumer.Receive(ctx) +if err != nil { + log.Fatal(err) +} +fmt.Println(msg.Payload()) +canc() + +ctx, canc = context.WithTimeout(context.Background(), 5*time.Second) +msg, err = consumer.Receive(ctx) +if err != nil { + log.Fatal(err) +} +fmt.Println(msg.Payload()) +canc() + +``` + +#### How to use Prometheus metrics in producer + +Pulsar Go client registers client metrics using Prometheus. This section demonstrates how to create a simple Pulsar producer application that exposes Prometheus metrics via HTTP. + +1. Write a simple producer application. 
+ +```go + +// Create a Pulsar client +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} + +defer client.Close() + +// Start a separate goroutine for Prometheus metrics +// In this case, Prometheus metrics can be accessed via http://localhost:2112/metrics +go func() { + prometheusPort := 2112 + log.Printf("Starting Prometheus metrics at http://localhost:%v/metrics\n", prometheusPort) + http.Handle("/metrics", promhttp.Handler()) + err = http.ListenAndServe(":"+strconv.Itoa(prometheusPort), nil) + if err != nil { + log.Fatal(err) + } +}() + +// Create a producer +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "topic-1", +}) +if err != nil { + log.Fatal(err) +} + +defer producer.Close() + +ctx := context.Background() + +// Write your business logic here +// In this case, you build a simple Web server. You can produce messages by requesting http://localhost:8082/produce +webPort := 8082 +http.HandleFunc("/produce", func(w http.ResponseWriter, r *http.Request) { + msgId, err := producer.Send(ctx, &pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("hello world")), + }) + if err != nil { + log.Fatal(err) + } else { + log.Printf("Published message: %v", msgId) + fmt.Fprintf(w, "Published message: %v", msgId) + } +}) + +err = http.ListenAndServe(":"+strconv.Itoa(webPort), nil) +if err != nil { + log.Fatal(err) +} + +``` + +2. To scrape metrics from applications, configure a local running Prometheus instance using a configuration file (`prometheus.yml`). + +```yaml + +scrape_configs: +- job_name: pulsar-client-go-metrics + scrape_interval: 10s + static_configs: + - targets: + - localhost:2112 + +``` + +Now you can query Pulsar client metrics on Prometheus. + +### Producer configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specify the topic this consumer will subscribe to. 
This argument is required when constructing the reader. | | +| Name | Name specify a name for the producer. If not assigned, the system will generate a globally unique name which can be access with Producer.ProducerName(). | | +| Properties | Properties attach a set of application defined properties to the producer This properties will be visible in the topic stats | | +| SendTimeout | SendTimeout set the timeout for a message that is not acknowledged by the server | 30s | +| DisableBlockIfQueueFull | DisableBlockIfQueueFull control whether Send and SendAsync block if producer's message queue is full | false | +| MaxPendingMessages| MaxPendingMessages set the max size of the queue holding the messages pending to receive an acknowledgment from the broker. | | +| HashingScheme | HashingScheme change the `HashingScheme` used to chose the partition on where to publish a particular message. | JavaStringHash | +| CompressionType | CompressionType set the compression type for the producer. | not compressed | +| CompressionLevel | Define the desired compression level. Options: Default, Faster and Better | Default | +| MessageRouter | MessageRouter set a custom message routing policy by passing an implementation of MessageRouter | | +| DisableBatching | DisableBatching control whether automatic batching of messages is enabled for the producer. | false | +| BatchingMaxPublishDelay | BatchingMaxPublishDelay set the time period within which the messages sent will be batched | 1ms | +| BatchingMaxMessages | BatchingMaxMessages set the maximum number of messages permitted in a batch. | 1000 | +| BatchingMaxSize | BatchingMaxSize sets the maximum number of bytes permitted in a batch. | 128KB | +| Schema | Schema set a custom schema type by passing an implementation of `Schema` | bytes[] | +| Interceptors | A chain of interceptors. These interceptors are called at some points defined in the `ProducerInterceptor` interface. 
| None | +| MaxReconnectToBroker | MaxReconnectToBroker set the maximum retry number of reconnectToBroker | ultimate | +| BatcherBuilderType | BatcherBuilderType sets the batch builder type. This is used to create a batch container when batching is enabled. Options: DefaultBatchBuilder and KeyBasedBatchBuilder | DefaultBatchBuilder | + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. Here's a basic example that uses channels: + +```go + +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "topic-1", + SubscriptionName: "my-sub", + Type: pulsar.Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +for i := 0; i < 10; i++ { + msg, err := consumer.Receive(context.Background()) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Received message msgId: %#v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + + consumer.Ack(msg) +} + +if err := consumer.Unsubscribe(); err != nil { + log.Fatal(err) +} + +``` + +### Consumer operations + +Pulsar Go consumers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Subscription()` | Returns the consumer's subscription name | `string` +`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Throws an error if the unsubscribe operation is somehow unsuccessful. | `error` +`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)` +`Chan()` | Chan returns a channel from which to consume messages. 
| `<-chan ConsumerMessage` +`Ack(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) | +`AckID(MessageID)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID | +`ReconsumeLater(msg Message, delay time.Duration)` | ReconsumeLater mark a message for redelivery after custom delay | +`Nack(Message)` | Acknowledge the failure to process a single message. | +`NackID(MessageID)` | Acknowledge the failure to process a single message. | +`Seek(msgID MessageID)` | Reset the subscription associated with this consumer to a specific message id. The message id can either be a specific message or represent the first or last messages in the topic. | `error` +`SeekByTime(time time.Time)` | Reset the subscription associated with this consumer to a specific message publish time. | `error` +`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | +`Name()` | Name returns the name of consumer | `string` + +### Receive example + +#### How to use regex consumer + +```go + +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) + +defer client.Close() + +p, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topicInRegex, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer p.Close() + +topicsPattern := fmt.Sprintf("persistent://%s/foo.*", namespace) +opts := pulsar.ConsumerOptions{ + TopicsPattern: topicsPattern, + SubscriptionName: "regex-sub", +} +consumer, err := client.Subscribe(opts) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +``` + +#### How to use multi topics Consumer + +```go + +func newTopicName() string { + return fmt.Sprintf("my-topic-%v", time.Now().Nanosecond()) +} + + +topic1 := "topic-1" +topic2 := "topic-2" + +client, err := NewClient(pulsar.ClientOptions{ + URL: 
"pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +topics := []string{topic1, topic2} +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topics: topics, + SubscriptionName: "multi-topic-sub", +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +``` + +#### How to use consumer listener + +```go + +import ( + "fmt" + "log" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{URL: "pulsar://localhost:6650"}) + if err != nil { + log.Fatal(err) + } + + defer client.Close() + + channel := make(chan pulsar.ConsumerMessage, 100) + + options := pulsar.ConsumerOptions{ + Topic: "topic-1", + SubscriptionName: "my-subscription", + Type: pulsar.Shared, + } + + options.MessageChannel = channel + + consumer, err := client.Subscribe(options) + if err != nil { + log.Fatal(err) + } + + defer consumer.Close() + + // Receive messages from channel. The channel returns a struct which contains message and the consumer from where + // the message was received. 
It's not necessary here since we have 1 single consumer, but the channel could be + // shared across multiple consumers as well + for cm := range channel { + msg := cm.Message + fmt.Printf("Received message msgId: %v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + + consumer.Ack(msg) + } +} + +``` + +#### How to use consumer receive timeout + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topic := "test-topic-with-no-messages" +ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) +defer cancel() + +// create consumer +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: topic, + SubscriptionName: "my-sub1", + Type: Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +msg, err := consumer.Receive(ctx) +fmt.Println(msg.Payload()) +if err != nil { + log.Fatal(err) +} + +``` + +#### How to use schema in consumer + +```go + +type testJSON struct { + ID int `json:"id"` + Name string `json:"name"` +} + +``` + +```go + +var ( + exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +) + +``` + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +var s testJSON + +consumerJS := NewJSONSchema(exampleSchemaDef, nil) +consumer, err := client.Subscribe(ConsumerOptions{ + Topic: "jsonTopic", + SubscriptionName: "sub-1", + Schema: consumerJS, + SubscriptionInitialPosition: SubscriptionPositionEarliest, +}) +assert.Nil(t, err) +msg, err := consumer.Receive(context.Background()) +assert.Nil(t, err) +err = msg.GetSchemaValue(&s) +if err != nil { + log.Fatal(err) +} + +defer consumer.Close() + +``` + +#### How to use Prometheus metrics in consumer + +In this guide, This section 
demonstrates how to create a simple Pulsar consumer application that exposes Prometheus metrics via HTTP. +1. Write a simple consumer application. + +```go + +// Create a Pulsar client +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} + +defer client.Close() + +// Start a separate goroutine for Prometheus metrics +// In this case, Prometheus metrics can be accessed via http://localhost:2112/metrics +go func() { + prometheusPort := 2112 + log.Printf("Starting Prometheus metrics at http://localhost:%v/metrics\n", prometheusPort) + http.Handle("/metrics", promhttp.Handler()) + err = http.ListenAndServe(":"+strconv.Itoa(prometheusPort), nil) + if err != nil { + log.Fatal(err) + } +}() + +// Create a consumer +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "topic-1", + SubscriptionName: "sub-1", + Type: pulsar.Shared, +}) +if err != nil { + log.Fatal(err) +} + +defer consumer.Close() + +ctx := context.Background() + +// Write your business logic here +// In this case, you build a simple Web server. You can consume messages by requesting http://localhost:8083/consume +webPort := 8083 +http.HandleFunc("/consume", func(w http.ResponseWriter, r *http.Request) { + msg, err := consumer.Receive(ctx) + if err != nil { + log.Fatal(err) + } else { + log.Printf("Received message msgId: %v -- content: '%s'\n", msg.ID(), string(msg.Payload())) + fmt.Fprintf(w, "Received message msgId: %v -- content: '%s'\n", msg.ID(), string(msg.Payload())) + consumer.Ack(msg) + } +}) + +err = http.ListenAndServe(":"+strconv.Itoa(webPort), nil) +if err != nil { + log.Fatal(err) +} + +``` + +2. To scrape metrics from applications, configure a local running Prometheus instance using a configuration file (`prometheus.yml`). 
+ +```yaml + +scrape_configs: +- job_name: pulsar-client-go-metrics + scrape_interval: 10s + static_configs: + - targets: + - localhost:2112 + +``` + +Now you can query Pulsar client metrics on Prometheus. + +### Consumer configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specify the topic this consumer will subscribe to. This argument is required when constructing the reader. | | +| Topics | Specify a list of topics this consumer will subscribe on. Either a topic, a list of topics or a topics pattern are required when subscribing| | +| TopicsPattern | Specify a regular expression to subscribe to multiple topics under the same namespace. Either a topic, a list of topics or a topics pattern are required when subscribing | | +| AutoDiscoveryPeriod | Specify the interval in which to poll for new partitions or new topics if using a TopicsPattern. | | +| SubscriptionName | Specify the subscription name for this consumer. This argument is required when subscribing | | +| Name | Set the consumer name | | +| Properties | Attach a set of application defined properties to the consumer. These properties will be visible in the topic stats | | +| Type | Select the subscription type to be used when subscribing to the topic. | Exclusive | +| SubscriptionInitialPosition | InitialPosition at which the cursor will be set when subscribing | Latest | +| DLQ | Configuration for Dead Letter Queue consumer policy. | no DLQ | +| MessageChannel | Sets a `MessageChannel` for the consumer. When a message is received, it will be pushed to the channel for consumption | | +| ReceiverQueueSize | Sets the size of the consumer receive queue. 
| 1000| +| NackRedeliveryDelay | The delay after which to redeliver the messages that failed to be processed | 1min | +| ReadCompacted | If enabled, the consumer will read messages from the compacted topic rather than reading the full message backlog of the topic | false | +| ReplicateSubscriptionState | Mark the subscription as replicated to keep it in sync across clusters | false | +| KeySharedPolicy | Configuration for Key Shared consumer policy. | | +| RetryEnable | Auto retry send messages to default filled DLQPolicy topics | false | +| Interceptors | A chain of interceptors. These interceptors are called at some points defined in the `ConsumerInterceptor` interface. | | +| MaxReconnectToBroker | MaxReconnectToBroker set the maximum retry number of reconnectToBroker. | ultimate | +| Schema | Schema set a custom schema type by passing an implementation of `Schema` | bytes[] | + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example: + +```go + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "topic-1", + StartMessageID: pulsar.EarliestMessageID(), +}) +if err != nil { + log.Fatal(err) +} +defer reader.Close() + +``` + +### Reader operations + +Pulsar Go readers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the reader's [topic](reference-terminology.md#topic) | `string` +`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available. 
| `(Message, error)` +`HasNext()` | Check if there is any message available to read from the current position| (bool, error) +`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error` +`Seek(MessageID)` | Reset the subscription associated with this reader to a specific message ID | `error` +`SeekByTime(time time.Time)` | Reset the subscription associated with this reader to a specific message publish time | `error` + +### Reader example + +#### How to use reader to read 'next' message + +Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages: + +```go + +import ( + "context" + "fmt" + "log" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{URL: "pulsar://localhost:6650"}) + if err != nil { + log.Fatal(err) + } + + defer client.Close() + + reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "topic-1", + StartMessageID: pulsar.EarliestMessageID(), + }) + if err != nil { + log.Fatal(err) + } + defer reader.Close() + + for reader.HasNext() { + msg, err := reader.Next(context.Background()) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Received message msgId: %#v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + } +} + +``` + +In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. 
Here's an example: + +```go + +lastSavedId := // Read last saved message id from external store as byte[] + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.DeserializeMessageID(lastSavedId), +}) + +``` + +#### How to use reader to read specific message + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: lookupURL, +}) + +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topic := "topic-1" +ctx := context.Background() + +// create producer +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topic, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +// send 10 messages +msgIDs := [10]MessageID{} +for i := 0; i < 10; i++ { + msgID, err := producer.Send(ctx, &pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("hello-%d", i)), + }) + assert.NoError(t, err) + assert.NotNil(t, msgID) + msgIDs[i] = msgID +} + +// create reader on 5th message (not included) +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: topic, + StartMessageID: msgIDs[4], +}) + +if err != nil { + log.Fatal(err) +} +defer reader.Close() + +// receive the remaining 5 messages +for i := 5; i < 10; i++ { + msg, err := reader.Next(context.Background()) + if err != nil { + log.Fatal(err) +} + +// create reader on 5th message (included) +readerInclusive, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: topic, + StartMessageID: msgIDs[4], + StartMessageIDInclusive: true, +}) + +if err != nil { + log.Fatal(err) +} +defer readerInclusive.Close() + +``` + +### Reader configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specify the topic this consumer will subscribe to. This argument is required when constructing the reader. | | +| Name | Name set the reader name. | | +| Properties | Attach a set of application defined properties to the reader. 
These properties will be visible in the topic stats | | +| StartMessageID | StartMessageID initial reader positioning is done by specifying a message id. | | +| StartMessageIDInclusive | If true, the reader will start at the `StartMessageID`, included. Default is `false` and the reader will start from the "next" message | false | +| MessageChannel | MessageChannel sets a `MessageChannel` for the consumer. When a message is received, it will be pushed to the channel for consumption| | +| ReceiverQueueSize | ReceiverQueueSize sets the size of the consumer receive queue. | 1000 | +| SubscriptionRolePrefix| SubscriptionRolePrefix sets the subscription role prefix. | “reader” | +| ReadCompacted | If enabled, the reader will read messages from the compacted topic rather than reading the full message backlog of the topic. ReadCompacted can only be enabled when reading from a persistent topic. | false| + +## Messages + +The Pulsar Go client provides a `ProducerMessage` interface that you can use to construct messages to produce on Pulsar topics. Here's an example message: + +```go + +msg := pulsar.ProducerMessage{ + Payload: []byte("Here is some message data"), + Key: "message-key", + Properties: map[string]string{ + "foo": "bar", + }, + EventTime: time.Now(), + ReplicationClusters: []string{"cluster1", "cluster3"}, +} + +if _, err := producer.send(msg); err != nil { + log.Fatalf("Could not publish message due to: %v", err) +} + +``` + +The following parameters are available for `ProducerMessage` objects: + +Parameter | Description +:---------|:----------- +`Payload` | The actual data payload of the message +`Value` | `Value` and `Payload` are mutually exclusive, `Value interface{}` for schema message. +`Key` | The optional key associated with the message (particularly useful for things like topic compaction) +`OrderingKey` | OrderingKey sets the ordering key of the message. 
+`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message +`EventTime` | The timestamp associated with the message +`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. +`SequenceID` | Set the sequence id to assign to the current message +`DeliverAfter` | Request to deliver the message only after the specified relative delay +`DeliverAt` | Deliver the message only at or after the specified absolute timestamp + +## TLS encryption and authentication + +In order to use [TLS encryption](security-tls-transport.md), you'll need to configure your client to do so: + + * Use `pulsar+ssl` URL type + * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker + * Configure `Authentication` option + +Here's an example: + +```go + +opts := pulsar.ClientOptions{ + URL: "pulsar+ssl://my-cluster.com:6651", + TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr", + Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"), +} + +``` + +## OAuth2 authentication + +To use [OAuth2 authentication](security-oauth2.md), you'll need to configure your client to perform the following operations. +This example shows how to configure OAuth2 authentication. 
+ +```go + +oauth := pulsar.NewAuthenticationOAuth2(map[string]string{ + "type": "client_credentials", + "issuerUrl": "https://dev-kt-aa9ne.us.auth0.com", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "privateKey": "/path/to/privateKey", + "clientId": "0Xx...Yyxeny", + }) +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://my-cluster:6650", + Authentication: oauth, +}) + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-java.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-java.md new file mode 100644 index 0000000000000..0b402f1cc456d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-java.md @@ -0,0 +1,1542 @@ +--- +id: client-libraries-java +title: Pulsar Java client +sidebar_label: "Java" +original_id: client-libraries-java +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +You can use a Pulsar Java client to create the Java [producer](#producer), [consumer](#consumer), [readers](#reader) and [TableView](#tableview) of messages and to perform [administrative tasks](admin-api-overview.md). The current Java client version is **@pulsar:version@**. + +All the methods in [producer](#producer), [consumer](#consumer), [readers](#reader) and [TableView](#tableview) of a Java client are thread-safe. + +Javadoc for the Pulsar client is divided into two domains by package as follows. 
+ +Package | Description | Maven Artifact +:-------|:------------|:-------------- +[`org.apache.pulsar.client.api`](/api/client) | [The producer and consumer API](/api/client/) | [org.apache.pulsar:pulsar-client:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C@pulsar:version@%7Cjar) +[`org.apache.pulsar.client.admin`](/api/admin) | The Java [admin API](admin-api-overview.md) | [org.apache.pulsar:pulsar-client-admin:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-admin%7C@pulsar:version@%7Cjar) +`org.apache.pulsar.client.all` |Include both `pulsar-client` and `pulsar-client-admin`
Both `pulsar-client` and `pulsar-client-admin` are shaded packages and they shade dependencies independently. Consequently, the applications using both `pulsar-client` and `pulsar-client-admin` have redundant shaded classes. It would be troublesome if you introduce new dependencies but forget to update shading rules.
In this case, you can use `pulsar-client-all`, which shades dependencies only one time and reduces the size of dependencies. |[org.apache.pulsar:pulsar-client-all:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-all%7C@pulsar:version@%7Cjar) + +This document focuses only on the client API for producing and consuming messages on Pulsar topics. For how to use the Java admin client, see [Pulsar admin interface](admin-api-overview.md). + +## Installation + +The latest version of the Pulsar Java client library is available via [Maven Central](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C@pulsar:version@%7Cjar). To use the latest version, add the `pulsar-client` library to your build configuration. + +:::tip + +- [`pulsar-client`](https://search.maven.org/artifact/org.apache.pulsar/pulsar-client) and [`pulsar-client-admin`](https://search.maven.org/artifact/org.apache.pulsar/pulsar-client-admin) shade dependencies via [maven-shade-plugin](https://maven.apache.org/plugins/maven-shade-plugin/) to avoid conflicts of the underlying dependency packages (such as Netty). If you do not want to manage dependency conflicts manually, you can use them. +- [`pulsar-client-original`](https://search.maven.org/artifact/org.apache.pulsar/pulsar-client-original) and [`pulsar-client-admin-original`](https://search.maven.org/artifact/org.apache.pulsar/pulsar-client-admin-original) **does not** shade dependencies. If you want to manage dependencies manually, you can use them. + +::: + +### Maven + +If you use Maven, add the following information to the `pom.xml` file. + +```xml + + +@pulsar:version@ + + + + org.apache.pulsar + pulsar-client + ${pulsar.version} + + +``` + +### Gradle + +If you use Gradle, add the following information to the `build.gradle` file. 
+ +```groovy + +def pulsarVersion = '@pulsar:version@' + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-client', version: pulsarVersion +} + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +You can assign Pulsar protocol URLs to specific clusters and use the `pulsar` scheme. The default port is `6650`. The following is an example of `localhost`. + +```http + +pulsar://localhost:6650 + +``` + +If you have multiple brokers, the URL is as follows. + +```http + +pulsar://localhost:6650,localhost:6651,localhost:6652 + +``` + +A URL for a production Pulsar cluster is as follows. + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you use [TLS](security-tls-authentication.md) authentication, the URL is as follows. + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Client + +You can instantiate a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object using just a URL for the target Pulsar [cluster](reference-terminology.md#cluster) like this: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +``` + +If you have multiple brokers, you can instantiate a PulsarClient like this: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650,localhost:6651,localhost:6652") + .build(); + +``` + +> ### Default broker URLs for standalone clusters +> If you run a cluster in [standalone mode](getting-started-standalone.md), the broker is available at the `pulsar://localhost:6650` URL by default. + +If you create a client, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +| Name | Type | 
Description
| Default +|---|---|---|--- +`serviceUrl` | String | Service URL provider for Pulsar service | None +`authPluginClassName` | String | Name of the authentication plugin | None + `authParams` | String | Parameters for the authentication plugin

**Example**
key1:val1,key2:val2|None +`operationTimeoutMs`|long|Operation timeout |30000 +`statsIntervalSeconds`|long|Interval between each stats information

Stats is activated with positive `statsInterval`

Set `statsIntervalSeconds` to 1 second at least. |60 +`numIoThreads`| int| The number of threads used for handling connections to brokers | 1 +`numListenerThreads`|int|The number of threads used for handling message listeners. The listener thread pool is shared across all the consumers and readers using the "listener" model to get messages. For a given consumer, the listener is always invoked from the same thread to ensure ordering. If you want multiple threads to process a single topic, you need to create a [`shared`](concepts-messaging.md#shared) subscription and multiple consumers for this subscription. This does not ensure ordering.| 1 +`useTcpNoDelay`| boolean| Whether to use TCP no-delay flag on the connection to disable Nagle algorithm |true +`enableTls` |boolean | Whether to use TLS encryption on the connection. Note that this parameter is **deprecated**. If you want to enable TLS, use `pulsar+ssl://` in `serviceUrl` instead. | false + `tlsTrustCertsFilePath` |string |Path to the trusted TLS certificate file|None +`tlsAllowInsecureConnection`|boolean|Whether the Pulsar client accepts untrusted TLS certificate from broker | false +`tlsHostnameVerificationEnable` |boolean | Whether to enable TLS hostname verification|false +`concurrentLookupRequest`|int|The number of concurrent lookup requests allowed to send on each broker connection to prevent overload on broker|5000 +`maxLookupRequest`|int|The maximum number of lookup requests allowed on each broker connection to prevent overload on broker | 50000 +`maxNumberOfRejectedRequestPerConnection`|int|The maximum number of rejected requests of a broker in a certain time frame (30 seconds) after the current connection is closed and the client creates a new connection to connect to a different broker|50 +`keepAliveIntervalSeconds`|int|Seconds of keeping alive interval for each client broker connection|30 +`connectionTimeoutMs`|int|Duration of waiting for a connection to a broker to be established

If the duration passes without a response from a broker, the connection attempt is dropped|10000 +`requestTimeoutMs`|int|Maximum duration for completing a request |60000 +`defaultBackoffIntervalNanos`|int| Default duration for a backoff interval | TimeUnit.MILLISECONDS.toNanos(100) +`maxBackoffIntervalNanos`|long|Maximum duration for a backoff interval|TimeUnit.SECONDS.toNanos(30) +`socks5ProxyAddress`|SocketAddress|SOCKS5 proxy address | None +`socks5ProxyUsername`|string|SOCKS5 proxy username | None +`socks5ProxyPassword`|string|SOCKS5 proxy password | None + +Check out the Javadoc for the {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} class for a full list of configurable parameters. + +> In addition to client-level configuration, you can also apply [producer](#configure-producer) and [consumer](#configure-consumer) specific configuration as described in sections below. + +### Client memory allocator configuration +You can set the client memory allocator configurations through Java properties.
+ +| Property | Type |
Description
| Default | Available values +|---|---|---|---|--- +`pulsar.allocator.pooled` | String | If set to `true`, the client uses a direct memory pool.
If set to `false`, the client uses a heap memory without pool | true |
  • true
  • false
  • +`pulsar.allocator.exit_on_oom` | String | Whether to exit the JVM when OOM happens | false |
  • true
  • false
  • +`pulsar.allocator.leak_detection` | String | The leak detection policy for Pulsar bytebuf allocator.
  • **Disabled**: No leak detection and no overhead.
  • **Simple**: Instruments 1% of the allocated buffer to track for leaks.
  • **Advanced**: Instruments 1% of the allocated buffer to track for leaks, reporting stack traces of places where the buffer is used.
  • **Paranoid**: Instruments 100% of the allocated buffer to track for leaks, reporting stack traces of places where the buffer is used and introduces a significant overhead.
  • | Disabled |
  • Disabled
  • Simple
  • Advanced
  • Paranoid
  • +`pulsar.allocator.out_of_memory_policy` | String | When an OOM occurs, the client throws an exception or falls back to heap | FallbackToHeap |
  • ThrowException
  • FallbackToHeap
  • + +**Example**: + +``` + +-Dpulsar.allocator.pooled=true +-Dpulsar.allocator.exit_on_oom=false +-Dpulsar.allocator.leak_detection=Disabled +-Dpulsar.allocator.out_of_memory_policy=ThrowException + +``` + +### Cluster-level failover + +This chapter describes the concept, benefits, use cases, constraints, usage, working principles, and more information about the cluster-level failover. It contains the following sections: + +- [What is cluster-level failover?](#what-is-cluster-level-failover) + + * [Concept of cluster-level failover](#concept-of-cluster-level-failover) + + * [Why use cluster-level failover?](#why-use-cluster-level-failover) + + * [When to use cluster-level failover?](#when-to-use-cluster-level-failover) + + * [When cluster-level failover is triggered?](#when-cluster-level-failover-is-triggered) + + * [Why does cluster-level failover fail?](#why-does-cluster-level-failover-fail) + + * [What are the limitations of cluster-level failover?](#what-are-the-limitations-of-cluster-level-failover) + + * [What are the relationships between cluster-level failover and geo-replication?](#what-are-the-relationships-between-cluster-level-failover-and-geo-replication) + +- [How to use cluster-level failover?](#how-to-use-cluster-level-failover) + +- [How does cluster-level failover work?](#how-does-cluster-level-failover-work) + +> #### What is cluster-level failover + +This chapter helps you better understand the concept of cluster-level failover. +> ##### Concept of cluster-level failover + +````mdx-code-block + + + +Automatic cluster-level failover supports Pulsar clients switching from a primary cluster to one or several backup clusters automatically and seamlessly when it detects a failover event based on the configured detecting policy set by **users**. 
+ +![Automatic cluster-level failover](/assets/cluster-level-failover-1.png) + + + + +Controlled cluster-level failover supports Pulsar clients switching from a primary cluster to one or several backup clusters. The switchover is manually set by **administrators**. + +![Controlled cluster-level failover](/assets/cluster-level-failover-2.png) + + + + +```` + +Once the primary cluster functions again, Pulsar clients can switch back to the primary cluster. Most of the time users won’t even notice a thing. Users can keep using applications and services without interruptions or timeouts. + +> ##### Why use cluster-level failover? + +The cluster-level failover provides fault tolerance, continuous availability, and high availability together. It brings a number of benefits, including but not limited to: + +* Reduced cost: services can be switched and recovered automatically with no data loss. + +* Simplified management: businesses can operate on an “always-on” basis since no immediate user intervention is required. + +* Improved stability and robustness: it ensures continuous performance and minimizes service downtime. + +> ##### When to use cluster-level failover? + +The cluster-level failover protects your environment in a number of ways, including but not limited to: + +* Disaster recovery: cluster-level failover can automatically and seamlessly transfer the production workload on a primary cluster to one or several backup clusters, which ensures minimum data loss and reduced recovery time. + +* Planned migration: if you want to migrate production workloads from an old cluster to a new cluster, you can improve the migration efficiency with cluster-level failover. For example, you can test whether the data migration goes smoothly in case of a failover event, identify possible issues and risks before the migration. + +> ##### When cluster-level failover is triggered? 
+ +````mdx-code-block + + + +Automatic cluster-level failover is triggered when Pulsar clients cannot connect to the primary cluster for a prolonged period of time. This can be caused by any number of reasons including, but not limited to: + +* Network failure: internet connection is lost. + +* Power failure: shutdown time of a primary cluster exceeds time limits. + +* Service error: errors occur on a primary cluster (for example, the primary cluster does not function because of time limits). + +* Crashed storage space: the primary cluster does not have enough storage space, but the corresponding storage space on the backup server functions normally. + + + + +Controlled cluster-level failover is triggered when administrators set the switchover manually. + + + + +```` + +> ##### Why does cluster-level failover fail? + +Obviously, the cluster-level failover does not succeed if the backup cluster is unreachable by active Pulsar clients. This can happen for many reasons, including but not limited to: + +* Power failure: the backup cluster is shut down or does not function normally. + +* Crashed storage space: primary and backup clusters do not have enough storage space. + +* If the failover is initiated, but no cluster can assume the role of an available cluster due to errors, and the primary cluster is not able to provide service normally. + +* If you manually initiate a switchover, but services cannot be switched to the backup cluster server, then the system will attempt to switch services back to the primary cluster. + +* Fail to authenticate or authorize between 1) primary and backup clusters, or 2) between two backup clusters. + +> ##### What are the limitations of cluster-level failover? + +Currently, cluster-level failover can perform probes to prevent data loss, but it can not check the status of backup clusters. If backup clusters are not healthy, you cannot produce or consume data. 
+ +> #### What are the relationships between cluster-level failover and geo-replication? + +The cluster-level failover is an extension of [geo-replication](concepts-replication.md) to improve stability and robustness. The cluster-level failover depends on geo-replication, and they have some **differences** as below. + +Influence |Cluster-level failover|Geo-replication +|---|---|--- +Do administrators have heavy workloads?|No or maybe.

    - For the **automatic** cluster-level failover, the cluster switchover is triggered automatically based on the policies set by **users**.

    - For the **controlled** cluster-level failover, the switchover is triggered manually by **administrators**.|Yes.

    If a cluster fails, immediate administration intervention is required.| +Result in data loss?|No.

    For both **automatic** and **controlled** cluster-level failover, if the failed primary cluster doesn't replicate messages immediately to the backup cluster, the Pulsar client can't consume the non-replicated messages. After the primary cluster is restored and the Pulsar client switches back, the non-replicated data can still be consumed by the Pulsar client. Consequently, the data is not lost.

    - For the **automatic** cluster-level failover, services can be switched and recovered automatically with no data loss.

    - For the **controlled** cluster-level failover, services can be switched and recovered manually and data loss may happen.|Yes.

    Pulsar clients and DNS systems have caches. When administrators switch the DNS from a primary cluster to a backup cluster, it takes some time for cache trigger timeout, which delays client recovery time and fails to produce or consume messages. +Result in Pulsar client failure? |No or maybe.

    - For **automatic** cluster-level failover, services can be switched and recovered automatically and the Pulsar client does not fail.

    - For **controlled** cluster-level failover, services can be switched and recovered manually, but the Pulsar client fails before administrators can take action. |Same as above. + +> #### How to use cluster-level failover + +This section guides you through every step on how to configure cluster-level failover. + +**Tip** + +- You should configure cluster-level failover only when the cluster contains sufficient resources to handle all possible consequences. Workload intensity on the backup cluster may increase significantly. + +- Connect clusters to an uninterruptible power supply (UPS) unit to reduce the risk of unexpected power loss. + +**Requirements** + +* Pulsar client 2.10 or later versions. + +* For backup clusters: + + * The number of BookKeeper nodes should be equal to or greater than the ensemble quorum. + + * The number of ZooKeeper nodes should be equal to or greater than 3. + +* **Turn on geo-replication** between the primary cluster and any dependent cluster (primary to backup or backup to backup) to prevent data loss. + +* Set `replicateSubscriptionState` to `true` when creating consumers. + +````mdx-code-block + + + +This is an example of how to construct a Java Pulsar client to use automatic cluster-level failover. The switchover is triggered automatically. 
+ +``` + +  private PulsarClient getAutoFailoverClient() throws PulsarClientException { + +        ServiceUrlProvider failover = AutoClusterFailover.builder() +                .primary("pulsar://localhost:6650") +                .secondary(Collections.singletonList("pulsar://other1:6650","pulsar://other2:6650")) +                .failoverDelay(30, TimeUnit.SECONDS) +                .switchBackDelay(60, TimeUnit.SECONDS) +                .checkInterval(1000, TimeUnit.MILLISECONDS) +         .secondaryTlsTrustCertsFilePath("/path/to/ca.cert.pem") +    .secondaryAuthentication("org.apache.pulsar.client.impl.auth.AuthenticationTls", +"tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem") + +                .build(); + +        PulsarClient pulsarClient = PulsarClient.builder() +                .build(); + +        failover.initialize(pulsarClient); +        return pulsarClient; +    } + +``` + +Configure the following parameters: + +Parameter|Default value|Required?|Description +|---|---|---|--- +`primary`|N/A|Yes|Service URL of the primary cluster. +`secondary`|N/A|Yes|Service URL(s) of one or several backup clusters.

    You can specify several backup clusters using a comma-separated list.

    Note that:
    - The backup cluster is chosen in the sequence shown in the list.
    - If all backup clusters are available, the Pulsar client chooses the first backup cluster. +`failoverDelay`|N/A|Yes|The delay before the Pulsar client switches from the primary cluster to the backup cluster.

    Automatic failover is controlled by a probe task:
    1) The probe task first checks the health status of the primary cluster.
    2) If the probe task finds that the continuous failure time of the primary cluster exceeds `failoverDelay`, it switches the Pulsar client to the backup cluster. +`switchBackDelay`|N/A|Yes|The delay before the Pulsar client switches from the backup cluster to the primary cluster.

    Automatic switchback is controlled by a probe task:
    1) After the Pulsar client switches from the primary cluster to the backup cluster, the probe task continues to check the status of the primary cluster.
    2) If the primary cluster functions well and continuously remains active longer than `switchBackDelay`, the Pulsar client switches back to the primary cluster. +`checkInterval`|30s|No|Frequency of performing a probe task (in seconds). +`secondaryTlsTrustCertsFilePath`|N/A|No|Path to the trusted TLS certificate file of the backup cluster. +`secondaryAuthentication`|N/A|No|Authentication of the backup cluster. + +
    + + +This is an example of how to construct a Java Pulsar client to use controlled cluster-level failover. The switchover is triggered by administrators manually. + +**Note**: you can have one or several backup clusters but can only specify one. + +``` + + public PulsarClient getControlledFailoverClient() throws IOException { +Map header = new HashMap(); + header.put("service_user_id", "my-user"); + header.put("service_password", "tiger"); + header.put("clusterA", "tokenA"); + header.put("clusterB", "tokenB"); + + ServiceUrlProvider provider = + ControlledClusterFailover.builder() + .defaultServiceUrl("pulsar://localhost:6650") + .checkInterval(1, TimeUnit.MINUTES) + .urlProvider("http://localhost:8080/test") + .urlProviderHeader(header) + .build(); + + PulsarClient pulsarClient = + PulsarClient.builder() + .build(); + + provider.initialize(pulsarClient); + return pulsarClient; +} + +``` + +Parameter|Default value|Required?|Description +|---|---|---|--- +`defaultServiceUrl`|N/A|Yes|Pulsar service URL. +`checkInterval`|30s|No|Frequency of performing a probe task (in seconds). +`urlProvider`|N/A|Yes|URL provider service. +`urlProviderHeader`|N/A|No|`urlProviderHeader` is a map containing tokens and credentials.

    If you enable authentication or authorization between Pulsar clients and primary and backup clusters, you need to provide `urlProviderHeader`. + +Here is an example of how `urlProviderHeader` works. + +![How urlProviderHeader works](/assets/cluster-level-failover-3.png) + +Assume that you want to connect Pulsar client 1 to cluster A. + +1. Pulsar client 1 sends the token *t1* to the URL provider service. + +2. The URL provider service returns the credential *c1* and the cluster A URL to the Pulsar client. + + The URL provider service manages all tokens and credentials. It returns different credentials based on different tokens and different target cluster URLs to different Pulsar clients. + + **Note**: **the credential must be in a JSON file and contain parameters as shown**. + + ``` + + { + "serviceUrl": "pulsar+ssl://target:6651", + "tlsTrustCertsFilePath": "/security/ca.cert.pem", + "authPluginClassName":"org.apache.pulsar.client.impl.auth.AuthenticationTls", + "authParamsString": " \"tlsCertFile\": \"/security/client.cert.pem\" + \"tlsKeyFile\": \"/security/client-pk8.pem\" " + } + + ``` + +3. Pulsar client 1 connects to cluster A using credential *c1*. + +
    + +
    +```` + +>#### How does cluster-level failover work? + +This chapter explains the working process of cluster-level failover. For more implementation details, see [PIP-121](https://github.com/apache/pulsar/issues/13315). + +````mdx-code-block + + + +In automatic failover cluster, the primary cluster and backup cluster are aware of each other's availability. The automatic failover cluster performs the following actions without administrator intervention: + +1. The Pulsar client runs a probe task at intervals defined in `checkInterval`. + +2. If the probe task finds the failure time of the primary cluster exceeds the time set in the `failoverDelay` parameter, it searches backup clusters for an available healthy cluster. + + 2a) If there are healthy backup clusters, the Pulsar client switches to a backup cluster in the order defined in `secondary`. + + 2b) If there is no healthy backup cluster, the Pulsar client does not perform the switchover, and the probe task continues to look for an available backup cluster. + +3. The probe task checks whether the primary cluster functions well or not. + + 3a) If the primary cluster comes back and the continuous healthy time exceeds the time set in `switchBackDelay`, the Pulsar client switches back to the primary cluster. + + 3b) If the primary cluster does not come back, the Pulsar client does not perform the switchover. + +![Workflow of automatic failover cluster](/assets/cluster-level-failover-4.png) + + + + +1. The Pulsar client runs a probe task at intervals defined in `checkInterval`. + +2. The probe task fetches the service URL configuration from the URL provider service, which is configured by `urlProvider`. + + 2a) If the service URL configuration is changed, the probe task switches to the target cluster without checking the health status of the target cluster. + + 2b) If the service URL configuration is not changed, the Pulsar client does not perform the switchover. + +3. 
If the Pulsar client switches to the target cluster, the probe task continues to fetch service URL configuration from the URL provider service at intervals defined in `checkInterval`. + + 3a) If the service URL configuration is changed, the probe task switches to the target cluster without checking the health status of the target cluster. + + 3b) If the service URL configuration is not changed, it does not perform the switchover. + +![Workflow of controlled failover cluster](/assets/cluster-level-failover-5.png) + + + + +```` + +## Producer + +In Pulsar, producers write messages to topics. Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object (as in the section [above](#client-configuration)), you can create a {@inject: javadoc:Producer:/client/org/apache/pulsar/client/api/Producer} for a specific Pulsar [topic](reference-terminology.md#topic). + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .create(); + +// You can then send messages to the broker and topic you specified: +producer.send("My message".getBytes()); + +``` + +By default, producers produce messages that consist of byte arrays. You can produce different types by specifying a message [schema](#schema). + +```java + +Producer stringProducer = client.newProducer(Schema.STRING) + .topic("my-topic") + .create(); +stringProducer.send("My message"); + +``` + +> Make sure that you close your producers, consumers, and clients when you do not need them. 
+ +> ```java +> +> producer.close(); +> consumer.close(); +> client.close(); +> +> +> ``` + +> +> Close operations can also be asynchronous: + +> ```java +> +> producer.closeAsync() +> .thenRun(() -> System.out.println("Producer closed")) +> .exceptionally((ex) -> { +> System.err.println("Failed to close producer: " + ex); +> return null; +> }); +> +> +> ``` + + +### Configure producer + +If you instantiate a `Producer` object by specifying only a topic name as the example above, the default configuration of producer is used. + +If you create a producer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +Name| Type |
    Description
    | Default +|---|---|---|--- +`topicName`| string| Topic name| null| +`producerName`| string|Producer name| null +`sendTimeoutMs`| long|Message send timeout in ms.
    If a message is not acknowledged by a server before the `sendTimeout` expires, an error occurs.|30000 +`blockIfQueueFull`|boolean|If it is set to `true`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer block, rather than failing and throwing errors.
    If it is set to `false`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer fail and `ProducerQueueIsFullError` exceptions occur.

    The `MaxPendingMessages` parameter determines the size of the outgoing message queue.|false +`maxPendingMessages`| int|The maximum size of a queue holding pending messages.

    For example, a message waiting to receive an acknowledgment from a [broker](reference-terminology.md#broker).

    By default, when the queue is full, all calls to the `Send` and `SendAsync` methods fail **unless** you set `BlockIfQueueFull` to `true`.|1000 +`maxPendingMessagesAcrossPartitions`|int|The maximum number of pending messages across partitions.

    Use the setting to lower the max pending messages for each partition ({@link #setMaxPendingMessages(int)}) if the total number exceeds the configured value.|50000 +`messageRoutingMode`| MessageRoutingMode|Message routing logic for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics).
    This logic is applied only when no key is set on messages.
    Available options are as follows:
  • `pulsar.RoundRobinDistribution`: round robin
  • `pulsar.UseSinglePartition`: publish all messages to a single partition
  • `pulsar.CustomPartition`: a custom partitioning scheme
  • |
  • `pulsar.RoundRobinDistribution`
  • +`hashingScheme`| HashingScheme|Hashing function determining the partition where you publish a particular message (**partitioned topics only**).
    Available options are as follows:
  • `pulsar.JavaStringHash`: the equivalent of `String.hashCode()` in Java
  • `pulsar.Murmur3_32Hash`: applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function
  • `pulsar.BoostHash`: applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library
  • |`HashingScheme.JavaStringHash` +`cryptoFailureAction`| ProducerCryptoFailureAction|The action the producer takes when encryption fails.
  • **FAIL**: if encryption fails, unencrypted messages fail to send.
  • **SEND**: if encryption fails, unencrypted messages are sent.
  • |`ProducerCryptoFailureAction.FAIL` +`batchingMaxPublishDelayMicros`| long|Batching time period of sending messages.|TimeUnit.MILLISECONDS.toMicros(1) +`batchingMaxMessages` |int|The maximum number of messages permitted in a batch.|1000 +`batchingEnabled`| boolean|Enable batching of messages. |true +`chunkingEnabled` | boolean | Enable chunking of messages. |false +`compressionType`|CompressionType|Message data compression type used by a producer.
    Available options:
  • [`LZ4`](https://github.com/lz4/lz4)
  • [`ZLIB`](https://zlib.net/)
  • [`ZSTD`](https://facebook.github.io/zstd/)
  • [`SNAPPY`](https://google.github.io/snappy/)
  • | No compression +`initialSubscriptionName`|string|Use this configuration to automatically create an initial subscription when creating a topic. If this field is not set, the initial subscription is not created.|null + +You can configure parameters if you do not want to use the default configuration. + +For a full list, see the Javadoc for the {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder} class. The following is an example. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .batchingMaxPublishDelay(10, TimeUnit.MILLISECONDS) + .sendTimeout(10, TimeUnit.SECONDS) + .blockIfQueueFull(true) + .create(); + +``` + +### Message routing + +When using partitioned topics, you can specify the routing mode whenever you publish messages using a producer. For more information on specifying a routing mode using the Java client, see the [Partitioned Topics cookbook](cookbooks-partitioned.md). + +### Async send + +You can publish messages [asynchronously](concepts-messaging.md#send-modes) using the Java client. With async send, the producer puts the message in a blocking queue and returns it immediately. Then the client library sends the message to the broker in the background. If the queue is full (max size configurable), the producer is blocked or fails immediately when calling the API, depending on arguments passed to the producer. + +The following is an example. + +```java + +producer.sendAsync("my-async-message".getBytes()).thenAccept(msgId -> { + System.out.println("Message with ID " + msgId + " successfully sent"); +}); + +``` + +As you can see from the example above, async send operations return a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId} wrapped in a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). 
+ +### Configure messages + +In addition to a value, you can set additional items on a given message: + +```java + +producer.newMessage() + .key("my-message-key") + .value("my-async-message".getBytes()) + .property("my-key", "my-value") + .property("my-other-key", "my-other-value") + .send(); + +``` + +You can terminate the builder chain with `sendAsync()` and get a future return. + +### Enable chunking + +Message [chunking](concepts-messaging.md#chunking) enables Pulsar to process large payload messages by splitting the message into chunks at the producer side and aggregating chunked messages at the consumer side. + +The message chunking feature is OFF by default. The following is an example about how to enable message chunking when creating a producer. + +```java + +Producer producer = client.newProducer() + .topic(topic) + .enableChunking(true) + .enableBatching(false) + .create(); + +``` + +By default, producer chunks the large message based on max message size (`maxMessageSize`) configured at broker (eg: 5MB). However, client can also configure max chunked size using producer configuration `chunkMaxMessageSize`. +> **Note:** To enable chunking, you need to disable batching (`enableBatching`=`false`) concurrently. + +## Consumer + +In Pulsar, consumers subscribe to topics and handle messages that producers publish to those topics. You can instantiate a new [consumer](reference-terminology.md#consumer) by first instantiating a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object and passing it a URL for a Pulsar broker (as [above](#client-configuration)). + +Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object, you can create a {@inject: javadoc:Consumer:/client/org/apache/pulsar/client/api/Consumer} by specifying a [topic](reference-terminology.md#topic) and a [subscription](concepts-messaging.md#subscription-types). 
+ +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscribe(); + +``` + +The `subscribe` method will auto subscribe the consumer to the specified topic and subscription. One way to make the consumer listen on the topic is to set up a `while` loop. In this example loop, the consumer listens for messages, prints the contents of any received message, and then [acknowledges](reference-terminology.md#acknowledgment-ack) that the message has been processed. If the processing logic fails, you can use [negative acknowledgement](reference-terminology.md#acknowledgment-ack) to redeliver the message later. + +```java + +while (true) { + // Wait for a message + Message msg = consumer.receive(); + + try { + // Do something with the message + System.out.println("Message received: " + new String(msg.getData())); + + // Acknowledge the message so that it can be deleted by the message broker + consumer.acknowledge(msg); + } catch (Exception e) { + // Message failed to process, redeliver later + consumer.negativeAcknowledge(msg); + } +} + +``` + +If you don't want to block your main thread and rather listen constantly for new messages, consider using a `MessageListener`. + +```java + +MessageListener myMessageListener = (consumer, msg) -> { + try { + System.out.println("Message received: " + new String(msg.getData())); + consumer.acknowledge(msg); + } catch (Exception e) { + consumer.negativeAcknowledge(msg); + } +} + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .messageListener(myMessageListener) + .subscribe(); + +``` + +### Configure consumer + +If you instantiate a `Consumer` object by specifying only a topic and subscription name as in the example above, the consumer uses the default configuration. + +When you create a consumer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + + Name|Type |
    Description
    | Default +|---|---|---|--- +`topicNames`| Set<String>| Topic name| Sets.newTreeSet() + `topicsPattern`|Pattern| Topic pattern |None +`subscriptionName`|String| Subscription name| None +`subscriptionType`|SubscriptionType| Subscription type
    Four subscription types are available:
  • Exclusive
  • Failover
  • Shared
  • Key_Shared
  • |SubscriptionType.Exclusive +`receiverQueueSize` |int | Size of a consumer's receiver queue.

    For example, the number of messages accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.| 1000 +`acknowledgementsGroupTimeMicros`|long|Group a consumer acknowledgment for a specified time.

    By default, a consumer uses 100ms grouping time to send out acknowledgments to a broker.

    Setting a group time of 0 sends out acknowledgments immediately.

    A longer ack group time is more efficient at the expense of a slight increase in message re-deliveries after a failure.|TimeUnit.MILLISECONDS.toMicros(100) +`negativeAckRedeliveryDelayMicros`|long|Delay to wait before redelivering messages that failed to be processed.

    When an application uses {@link Consumer#negativeAcknowledge(Message)}, failed messages are redelivered after a fixed timeout. |TimeUnit.MINUTES.toMicros(1) +`maxTotalReceiverQueueSizeAcrossPartitions`|int |The max total receiver queue size across partitions.

    This setting reduces the receiver queue size for individual partitions if the total receiver queue size exceeds this value.|50000 +`consumerName`|String|Consumer name|null +`ackTimeoutMillis`|long|Timeout of unacked messages|0 +`tickDurationMillis`|long|Granularity of the ack-timeout redelivery.

    Using a higher `tickDurationMillis` reduces the memory overhead to track messages when setting ack-timeout to a bigger value (for example, 1 hour).|1000 +`priorityLevel`|int|Priority level for a consumer to which a broker gives more priority while dispatching messages in Shared subscription type.

    The broker follows descending priorities. For example, 0=max-priority, 1, 2,...

    In Shared subscription type, the broker **first dispatches messages to the max priority level consumers if they have permits**. Otherwise, the broker considers next priority level consumers.

    **Example 1**
    If a subscription has consumerA with `priorityLevel` 0 and consumerB with `priorityLevel` 1, then the broker **only dispatches messages to consumerA until it runs out of permits** and then starts dispatching messages to consumerB.

    **Example 2**
    Consumer Priority, Level, Permits
    C1, 0, 2
    C2, 0, 1
    C3, 0, 1
    C4, 1, 2
    C5, 1, 1

    Order in which a broker dispatches messages to consumers is: C1, C2, C3, C1, C4, C5, C4.|0 +`cryptoFailureAction`|ConsumerCryptoFailureAction|Consumer should take action when it receives a message that can not be decrypted.
  • **FAIL**: this is the default option to fail messages until crypto succeeds.
  • **DISCARD**: silently acknowledge the message and do not deliver it to the application.
  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

  • The decompression of message fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in the batch.

    The delivered encrypted message contains an {@link EncryptionContext}, which holds the encryption and compression information that the application can use to decrypt the consumed message payload.|
  • ConsumerCryptoFailureAction.FAIL
  • +`properties`|SortedMap|A name or value property of this consumer.

    `properties` is application defined metadata attached to a consumer.

    When getting a topic stats, associate this metadata with the consumer stats for easier identification.|new TreeMap() +`readCompacted`|boolean|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up until the point in the topic backlog that has been compacted. Beyond that point, messages are delivered as normal.

    Enable `readCompacted` only on subscriptions to persistent topics that have a single active consumer (such as failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +`subscriptionInitialPosition`|SubscriptionInitialPosition|Initial position at which to set cursor when subscribing to a topic at first time.|SubscriptionInitialPosition.Latest +`patternAutoDiscoveryPeriod`|int|Topic auto discovery period when using a pattern for topic's consumer.

    The default and minimum value is 1 minute.|1 +`regexSubscriptionMode`|RegexSubscriptionMode|When subscribing to a topic using a regular expression, you can pick a certain type of topics.

  • **PersistentOnly**: only subscribe to persistent topics.
  • **NonPersistentOnly**: only subscribe to non-persistent topics.
  • **AllTopics**: subscribe to both persistent and non-persistent topics.
  • |RegexSubscriptionMode.PersistentOnly +`deadLetterPolicy`|DeadLetterPolicy|Dead letter policy for consumers.

    By default, some messages may be redelivered many times, possibly without ever stopping.

    By using the dead letter mechanism, messages have the max redelivery count. **When exceeding the maximum number of redeliveries, messages are sent to the Dead Letter Topic and acknowledged automatically**.

    You can enable the dead letter mechanism by setting `deadLetterPolicy`.

    **Example**

    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10).build())
    .subscribe();


    Default dead letter topic name is `{TopicName}-{Subscription}-DLQ`.

    To set a custom dead letter topic name:
    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10)
    .deadLetterTopic("your-topic-name").build())
    .subscribe();


    When specifying the dead letter policy while not specifying `ackTimeoutMillis`, you can set the ack timeout to 30000 milliseconds.|None +`autoUpdatePartitions`|boolean|If `autoUpdatePartitions` is enabled, a consumer subscribes to newly added partitions automatically.

    **Note**: this is only for partitioned consumers.|true +`replicateSubscriptionState`|boolean|If `replicateSubscriptionState` is enabled, a subscription state is replicated to geo-replicated clusters.|false +`negativeAckRedeliveryBackoff`|RedeliveryBackoff|Interface for custom message is negativeAcked policy. You can specify `RedeliveryBackoff` for a consumer.| `MultiplierRedeliveryBackoff` +`ackTimeoutRedeliveryBackoff`|RedeliveryBackoff|Interface for custom message is ackTimeout policy. You can specify `RedeliveryBackoff` for a consumer.| `MultiplierRedeliveryBackoff` +`autoAckOldestChunkedMessageOnQueueFull`|boolean|Whether to automatically acknowledge pending chunked messages when the threashold of `maxPendingChunkedMessage` is reached. If set to `false`, these messages will be redelivered by their broker. |true +`maxPendingChunkedMessage`|int| The maximum size of a queue holding pending chunked messages. When the threshold is reached, the consumer drops pending messages to optimize memory utilization.|10 +`expireTimeOfIncompleteChunkedMessageMillis`|long|The time interval to expire incomplete chunks if a consumer fails to receive all the chunks in the specified time period. The default value is 1 minute. | 60000 + +You can configure parameters if you do not want to use the default configuration. For a full list, see the Javadoc for the {@inject: javadoc:ConsumerBuilder:/client/org/apache/pulsar/client/api/ConsumerBuilder} class. + +The following is an example. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .ackTimeout(10, TimeUnit.SECONDS) + .subscriptionType(SubscriptionType.Exclusive) + .subscribe(); + +``` + +### Async receive + +The `receive` method receives messages synchronously (the consumer process is blocked until a message is available). 
You can also use [async receive](concepts-messaging.md#receive-modes), which returns a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) object immediately once a new message is available. + +The following is an example. + +```java + +CompletableFuture asyncMessage = consumer.receiveAsync(); + +``` + +Async receive operations return a {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} wrapped inside of a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). + +### Batch receive + +Use `batchReceive` to receive multiple messages for each call. + +The following is an example. + +```java + +Messages messages = consumer.batchReceive(); +for (Object message : messages) { + // do something +} +consumer.acknowledge(messages) + +``` + +:::note + +Batch receive policy limits the number and bytes of messages in a single batch. You can specify a timeout to wait for enough messages. +The batch receive is completed if any of the following condition is met: enough number of messages, bytes of messages, wait timeout. + +```java + +Consumer consumer = client.newConsumer() +.topic("my-topic") +.subscriptionName("my-subscription") +.batchReceivePolicy(BatchReceivePolicy.builder() +.maxNumMessages(100) +.maxNumBytes(1024 * 1024) +.timeout(200, TimeUnit.MILLISECONDS) +.build()) +.subscribe(); + +``` + +The default batch receive policy is: + +```java + +BatchReceivePolicy.builder() +.maxNumMessage(-1) +.maxNumBytes(10 * 1024 * 1024) +.timeout(100, TimeUnit.MILLISECONDS) +.build(); + +``` + +::: + +### Configure chunking + +You can limit the maximum number of chunked messages a consumer maintains concurrently by configuring the `maxPendingChunkedMessage` and `autoAckOldestChunkedMessageOnQueueFull` parameters. When the threshold is reached, the consumer drops pending messages by silently acknowledging them or asking the broker to redeliver them later. 
The `expireTimeOfIncompleteChunkedMessage` parameter decides the time interval to expire incomplete chunks if the consumer fails to receive all chunks of a message within the specified time period. + +The following is an example of how to configure message chunking. + +```java + +Consumer consumer = client.newConsumer() + .topic(topic) + .subscriptionName("test") + .autoAckOldestChunkedMessageOnQueueFull(true) + .maxPendingChunkedMessage(100) + .expireTimeOfIncompleteChunkedMessage(10, TimeUnit.MINUTES) + .subscribe(); + +``` + +### Negative acknowledgment redelivery backoff + +The `RedeliveryBackoff` introduces a redelivery backoff mechanism. You can achieve redelivery with different delays by setting `redeliveryCount ` of messages. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .negativeAckRedeliveryBackoff(MultiplierRedeliveryBackoff.builder() + .minDelayMs(1000) + .maxDelayMs(60 * 1000) + .build()) + .subscribe(); + +``` + +### Acknowledgement timeout redelivery backoff + +The `RedeliveryBackoff` introduces a redelivery backoff mechanism. You can redeliver messages with different delays by setting the number +of times the messages is retried. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .ackTimeout(10, TimeUnit.SECOND) + .ackTimeoutRedeliveryBackoff(MultiplierRedeliveryBackoff.builder() + .minDelayMs(1000) + .maxDelayMs(60000) + .multiplier(2) + .build()) + .subscribe(); + +``` + +The message redelivery behavior should be as follows. 
+ +Redelivery count | Redelivery delay +:--------------------|:----------- +1 | 10 + 1 seconds +2 | 10 + 2 seconds +3 | 10 + 4 seconds +4 | 10 + 8 seconds +5 | 10 + 16 seconds +6 | 10 + 32 seconds +7 | 10 + 60 seconds +8 | 10 + 60 seconds + +:::note + +- The `negativeAckRedeliveryBackoff` does not work with `consumer.negativeAcknowledge(MessageId messageId)` because you are not able to get the redelivery count from the message ID. +- If a consumer crashes, it triggers the redelivery of unacked messages. In this case, `RedeliveryBackoff` does not take effect and the messages might get redelivered earlier than the delay time from the backoff. + +::: + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously using [multi-topic subscriptions](concepts-messaging.md#multi-topic-subscriptions). To use multi-topic subscriptions you can supply either a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. + +The followings are some examples. 
+ +```java + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +ConsumerBuilder consumerBuilder = pulsarClient.newConsumer() + .subscriptionName(subscription); + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("public/default/.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(allTopicsInNamespace) + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("public/default/foo.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(someTopicsInNamespace) + .subscribe(); + +``` + +In the above example, the consumer subscribes to the `persistent` topics that can match the topic name pattern. If you want the consumer subscribes to all `persistent` and `non-persistent` topics that can match the topic name pattern, set `subscriptionTopicsMode` to `RegexSubscriptionMode.AllTopics`. + +```java + +Pattern pattern = Pattern.compile("public/default/.*"); +pulsarClient.newConsumer() + .subscriptionName("my-sub") + .topicsPattern(pattern) + .subscriptionTopicsMode(RegexSubscriptionMode.AllTopics) + .subscribe(); + +``` + +:::note + +By default, the `subscriptionTopicsMode` of the consumer is `PersistentOnly`. Available options of `subscriptionTopicsMode` are `PersistentOnly`, `NonPersistentOnly`, and `AllTopics`. 
+ +::: + +You can also subscribe to an explicit list of topics (across namespaces if you wish): + +```java + +List topics = Arrays.asList( + "topic-1", + "topic-2", + "topic-3" +); + +Consumer multiTopicConsumer = consumerBuilder + .topics(topics) + .subscribe(); + +// Alternatively: +Consumer multiTopicConsumer = consumerBuilder + .topic( + "topic-1", + "topic-2", + "topic-3" + ) + .subscribe(); + +``` + +You can also subscribe to multiple topics asynchronously using the `subscribeAsync` method rather than the synchronous `subscribe` method. The following is an example. + +```java + +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default.*"); +consumerBuilder + .topics(topics) + .subscribeAsync() + .thenAccept(this::receiveMessageFromConsumer); + +private void receiveMessageFromConsumer(Object consumer) { + ((Consumer)consumer).receiveAsync().thenAccept(message -> { + // Do something with the received message + receiveMessageFromConsumer(consumer); + }); +} + +``` + +### Subscription types + +Pulsar has various [subscription types](concepts-messaging#subscription-types) to match different scenarios. A topic can have multiple subscriptions with different subscription types. However, a subscription can only have one subscription type at a time. + +A subscription is identical with the subscription name; a subscription name can specify only one subscription type at a time. To change the subscription type, you should first stop all consumers of this subscription. + +Different subscription types have different message distribution types. This section describes the differences of subscription types and how to use them. + +In order to better describe their differences, assuming you have a topic named "my-topic", and the producer has published 10 messages. 
+ +```java + +Producer producer = client.newProducer(Schema.STRING) + .topic("my-topic") + .enableBatching(false) + .create(); +// 3 messages with "key-1", 3 messages with "key-2", 2 messages with "key-3" and 2 messages with "key-4" +producer.newMessage().key("key-1").value("message-1-1").send(); +producer.newMessage().key("key-1").value("message-1-2").send(); +producer.newMessage().key("key-1").value("message-1-3").send(); +producer.newMessage().key("key-2").value("message-2-1").send(); +producer.newMessage().key("key-2").value("message-2-2").send(); +producer.newMessage().key("key-2").value("message-2-3").send(); +producer.newMessage().key("key-3").value("message-3-1").send(); +producer.newMessage().key("key-3").value("message-3-2").send(); +producer.newMessage().key("key-4").value("message-4-1").send(); +producer.newMessage().key("key-4").value("message-4-2").send(); + +``` + +#### Exclusive + +Create a new consumer and subscribe with the `Exclusive` subscription type. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Exclusive) + .subscribe() + +``` + +Only the first consumer is allowed to attach to the subscription; other consumers receive an error. The first consumer receives all 10 messages, and the consuming order is the same as the producing order. + +:::note + +If a topic is a partitioned topic, the first consumer subscribes to all partitioned topics, other consumers are not assigned with partitions and receive an error. + +::: + +#### Failover + +Create new consumers and subscribe with the `Failover` subscription type. 
+ +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +//consumer1 is the active consumer, consumer2 is the standby consumer. +//consumer1 receives 5 messages and then crashes, consumer2 takes over as an active consumer. + +``` + +Multiple consumers can attach to the same subscription, yet only the first consumer is active, and others are standby. When the active consumer is disconnected, messages will be dispatched to one of the standby consumers, and the standby consumer then becomes the active consumer. + +If the first active consumer is disconnected after receiving 5 messages, the standby consumer becomes the active consumer. Consumer1 will receive: + +``` + +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-2", "message-2-1") +("key-2", "message-2-2") + +``` + +consumer2 will receive: + +``` + +("key-2", "message-2-3") +("key-3", "message-3-1") +("key-3", "message-3-2") +("key-4", "message-4-1") +("key-4", "message-4-2") + +``` + +:::note + +If a topic is a partitioned topic, each partition has only one active consumer, messages of one partition are distributed to only one consumer, and messages of multiple partitions are distributed to multiple consumers. + +::: + +#### Shared + +Create new consumers and subscribe with `Shared` subscription type. + +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers. 
+ +``` + +In Shared subscription type, multiple consumers can attach to the same subscription and messages are delivered in a round robin distribution across consumers. + +If a broker dispatches only one message at a time, consumer1 receives the following information. + +``` + +("key-1", "message-1-1") +("key-1", "message-1-3") +("key-2", "message-2-2") +("key-3", "message-3-1") +("key-4", "message-4-1") + +``` + +consumer2 receives the following information. + +``` + +("key-1", "message-1-2") +("key-2", "message-2-1") +("key-2", "message-2-3") +("key-3", "message-3-2") +("key-4", "message-4-2") + +``` + +`Shared` subscription is different from `Exclusive` and `Failover` subscription types. `Shared` subscription has better flexibility, but cannot provide order guarantee. + +#### Key_shared + +This is a new subscription type since 2.4.0 release. Create new consumers and subscribe with `Key_Shared` subscription type. + +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers. + +``` + +Just like in `Shared` subscription, all consumers in `Key_Shared` subscription type can attach to the same subscription. But `Key_Shared` subscription type is different from the `Shared` subscription. In `Key_Shared` subscription type, messages with the same key are delivered to only one consumer in order. The possible distribution of messages between different consumers (by default we do not know in advance which keys will be assigned to a consumer, but a key will only be assigned to a consumer at the same time). + +consumer1 receives the following information. 
+ +``` + +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-3", "message-3-1") +("key-3", "message-3-2") + +``` + +consumer2 receives the following information. + +``` + +("key-2", "message-2-1") +("key-2", "message-2-2") +("key-2", "message-2-3") +("key-4", "message-4-1") +("key-4", "message-4-2") + +``` + +If batching is enabled at the producer side, messages with different keys are added to a batch by default. The broker will dispatch the batch to the consumer, so the default batch mechanism may break the Key_Shared subscription guaranteed message distribution semantics. The producer needs to use the `KeyBasedBatcher`. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .batcherBuilder(BatcherBuilder.KEY_BASED) + .create(); + +``` + +Or the producer can disable batching. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .enableBatching(false) + .create(); + +``` + +:::note + +If the message key is not specified, messages without key are dispatched to one consumer in order by default. + +::: + +## Reader + +With the [reader interface](concepts-clients.md#reader-interface), Pulsar clients can "manually position" themselves within a topic and reading all messages from a specified message onward. The Pulsar API for Java enables you to create {@inject: javadoc:Reader:/client/org/apache/pulsar/client/api/Reader} objects by specifying a topic and a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId}. + +The following is an example. 
+ +```java + +byte[] msgIdBytes = // Some message ID byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +while (true) { + Message message = reader.readNext(); + // Process message +} + +``` + +In the example above, a `Reader` object is instantiated for a specific topic and message (by ID); the reader iterates over each message in the topic after the message identified by `msgIdBytes` (how that value is obtained depends on the application). + +The code sample above shows pointing the `Reader` object to a specific message (by ID), but you can also use `MessageId.earliest` to point to the earliest available message on the topic or `MessageId.latest` to point to the most recent available message. + +### Configure reader +When you create a reader, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +| Name | Type|
    Description
    | Default +|---|---|---|--- +`topicName`|String|Topic name. |None +`receiverQueueSize`|int|Size of a consumer's receiver queue.

    For example, the number of messages that can be accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.|1000 +`readerListener`|ReaderListener<T>|A listener that is called for message received.|None +`readerName`|String|Reader name.|null +`subscriptionName`|String| Subscription name|When there is a single topic, the default subscription name is `"reader-" + 10-digit UUID`.
    When there are multiple topics, the default subscription name is `"multiTopicsReader-" + 10-digit UUID`. +`subscriptionRolePrefix`|String|Prefix of subscription role. |null +`cryptoKeyReader`|CryptoKeyReader|Interface that abstracts the access to a key store.|null +`cryptoFailureAction`|ConsumerCryptoFailureAction|Consumer should take action when it receives a message that can not be decrypted.
  • **FAIL**: this is the default option to fail messages until crypto succeeds.
  • **DISCARD**: silently acknowledge and not deliver message to an application.
  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

  • The message decompression fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in batch.

    Delivered encrypted message contains {@link EncryptionContext} which contains encryption and compression information in it using which application can decrypt consumed message payload.|
  • ConsumerCryptoFailureAction.FAIL
  • +`readCompacted`|boolean|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up until reaching the point in the topic message when compacting backlog. Beyond that point, send messages as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (for example, failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +`resetIncludeHead`|boolean|If set to true, the first message to be returned is the one specified by `messageId`.

    If set to false, the first message to be returned is the one next to the message specified by `messageId`.|false + +### Sticky key range reader + +In sticky key range reader, broker will only dispatch messages which hash of the message key contains by the specified key hash range. Multiple key hash ranges can be specified on a reader. + +The following is an example to create a sticky key range reader. + +```java + +pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.earliest) + .keyHashRange(Range.of(0, 10000), Range.of(20001, 30000)) + .create(); + +``` + +Total hash range size is 65536, so the max end of the range should be less than or equal to 65535. + + +## TableView + +The TableView interface serves an encapsulated access pattern, providing a continuously updated key-value map view of the compacted topic data. Messages without keys will be ignored. + +With TableView, Pulsar clients can fetch all the message updates from a topic and construct a map with the latest values of each key. These values can then be used to build a local cache of data. In addition, you can register consumers with the TableView by specifying a listener to perform a scan of the map and then receive notifications when new messages are received. Consequently, event handling can be triggered to serve use cases, such as event-driven applications and message monitoring. + +> **Note:** Each TableView uses one Reader instance per partition, and reads the topic starting from the compacted view by default. It is highly recommended to enable automatic compaction by [configuring the topic compaction policies](cookbooks-compaction.md#configuring-compaction-to-run-automatically) for the given topic or namespace. More frequent compaction results in shorter startup times because less data is replayed to reconstruct the TableView of the topic. + +The following figure illustrates the dynamic construction of a TableView updated with newer values of each key. 
+![TableView](/assets/tableview.png) + +### Configure TableView + +The following is an example of how to configure a TableView. + +```java + +TableView tv = client.newTableViewBuilder(Schema.STRING) + .topic("my-tableview") + .create() + +``` + +You can use the available parameters in the `loadConf` configuration or related [API](/api/client/2.10.0-SNAPSHOT/org/apache/pulsar/client/api/TableViewBuilder.html) to customize your TableView. + +| Name | Type| Required? |
    Description
    | Default +|---|---|---|---|--- +| `topic` | string | yes | The topic name of the TableView. | N/A +| `autoUpdatePartitionInterval` | int | no | The interval to check for newly added partitions. | 60 (seconds) + +### Register listeners + +You can register listeners for both existing messages on a topic and new messages coming into the topic by using `forEachAndListen`, and specify to perform operations for all existing messages by using `forEach`. + +The following is an example of how to register listeners with TableView. + +```java + +// Register listeners for all existing and incoming messages +tv.forEachAndListen((key, value) -> /*operations on all existing and incoming messages*/) + +// Register action for all existing messages +tv.forEach((key, value) -> /*operations on all existing messages*/) + +``` + +## Schema + +In Pulsar, all message data consists of byte arrays "under the hood." [Message schemas](schema-get-started.md) enable you to use other types of data when constructing and handling messages (from simple types like strings to more complex, application-specific types). If you construct, say, a [producer](#producer) without specifying a schema, then the producer can only produce messages of type `byte[]`. The following is an example. + +```java + +Producer producer = client.newProducer() + .topic(topic) + .create(); + +``` + +The producer above is equivalent to a `Producer` (in fact, you should *always* explicitly specify the type). If you'd like to use a producer for a different type of data, you'll need to specify a **schema** that informs Pulsar which data type will be transmitted over the [topic](reference-terminology.md#topic). 
+ +### AvroBaseStructSchema example + +Let's say that you have a `SensorReading` class that you'd like to transmit over a Pulsar topic: + +```java + +public class SensorReading { + public float temperature; + + public SensorReading(float temperature) { + this.temperature = temperature; + } + + // A no-arg constructor is required + public SensorReading() { + } + + public float getTemperature() { + return temperature; + } + + public void setTemperature(float temperature) { + this.temperature = temperature; + } +} + +``` + +You could then create a `Producer` (or `Consumer`) like this: + +```java + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-readings") + .create(); + +``` + +The following schema formats are currently available for Java: + +* No schema or the byte array schema (which can be applied using `Schema.BYTES`): + + ```java + + Producer bytesProducer = client.newProducer(Schema.BYTES) + .topic("some-raw-bytes-topic") + .create(); + + ``` + + Or, equivalently: + + ```java + + Producer bytesProducer = client.newProducer() + .topic("some-raw-bytes-topic") + .create(); + + ``` + +* `String` for normal UTF-8-encoded string data. Apply the schema using `Schema.STRING`: + + ```java + + Producer stringProducer = client.newProducer(Schema.STRING) + .topic("some-string-topic") + .create(); + + ``` + +* Create JSON schemas for POJOs using `Schema.JSON`. The following is an example. + + ```java + + Producer pojoProducer = client.newProducer(Schema.JSON(MyPojo.class)) + .topic("some-pojo-topic") + .create(); + + ``` + +* Generate Protobuf schemas using `Schema.PROTOBUF`. The following example shows how to create the Protobuf schema and use it to instantiate a new producer: + + ```java + + Producer protobufProducer = client.newProducer(Schema.PROTOBUF(MyProtobuf.class)) + .topic("some-protobuf-topic") + .create(); + + ``` + +* Define Avro schemas with `Schema.AVRO`. 
The following code snippet demonstrates how to create and use Avro schema. + + ```java + + Producer avroProducer = client.newProducer(Schema.AVRO(MyAvro.class)) + .topic("some-avro-topic") + .create(); + + ``` + +### ProtobufNativeSchema example + +For example of ProtobufNativeSchema, see [`SchemaDefinition` in `Complex type`](schema-understand.md#complex-type). + +## Authentication + +Pulsar currently supports three authentication schemes: [TLS](security-tls-authentication.md), [Athenz](security-athenz.md), and [Oauth2](security-oauth2.md). You can use the Pulsar Java client with all of them. + +### TLS Authentication + +To use [TLS](security-tls-authentication.md), `enableTls` method is deprecated and you need to use "pulsar+ssl://" in serviceUrl to enable, point your Pulsar client to a TLS cert path, and provide paths to cert and key files. + +The following is an example. + +```java + +Map authParams = new HashMap(); +authParams.put("tlsCertFile", "/path/to/client-cert.pem"); +authParams.put("tlsKeyFile", "/path/to/client-key.pem"); + +Authentication tlsAuth = AuthenticationFactory + .create(AuthenticationTls.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(tlsAuth) + .build(); + +``` + +### Athenz + +To use [Athenz](security-athenz.md) as an authentication provider, you need to [use TLS](#tls-authentication) and provide values for four parameters in a hash: + +* `tenantDomain` +* `tenantService` +* `providerDomain` +* `privateKey` + +You can also set an optional `keyId`. The following is an example. 
+ +```java + +Map authParams = new HashMap(); +authParams.put("tenantDomain", "shopping"); // Tenant domain name +authParams.put("tenantService", "some_app"); // Tenant service name +authParams.put("providerDomain", "pulsar"); // Provider domain name +authParams.put("privateKey", "file:///path/to/private.pem"); // Tenant private key path +authParams.put("keyId", "v1"); // Key id for the tenant private key (optional, default: "0") + +Authentication athenzAuth = AuthenticationFactory + .create(AuthenticationAthenz.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(athenzAuth) + .build(); + +``` + +> #### Supported pattern formats +> The `privateKey` parameter supports the following three pattern formats: +> * `file:///path/to/file` +> * `file:/path/to/file` +> * `data:application/x-pem-file;base64,` + +### Oauth2 + +The following example shows how to use [Oauth2](security-oauth2.md) as an authentication provider for the Pulsar Java client. + +You can use the factory method to configure authentication for Pulsar Java client. + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactoryOAuth2.clientCredentials(this.issuerUrl, this.credentialsUrl, this.audience)) + .build(); + +``` + +In addition, you can also use the encoded parameters to configure authentication for Pulsar Java client. 
+ +```java + +Authentication auth = AuthenticationFactory + .create(AuthenticationOAuth2.class.getName(), "{"type":"client_credentials","privateKey":"...","issuerUrl":"...","audience":"..."}"); +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication(auth) + .build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-node.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-node.md new file mode 100644 index 0000000000000..a023b51d8ceb0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-node.md @@ -0,0 +1,652 @@ +--- +id: client-libraries-node +title: The Pulsar Node.js client +sidebar_label: "Node.js" +original_id: client-libraries-node +--- + +The Pulsar Node.js client can be used to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Node.js. + +All the methods in [producers](#producers), [consumers](#consumers), and [readers](#readers) of a Node.js client are thread-safe. + +For 1.3.0 or later versions, [type definitions](https://github.com/apache/pulsar-client-node/blob/master/index.d.ts) used in TypeScript are available. + +## Installation + +You can install the [`pulsar-client`](https://www.npmjs.com/package/pulsar-client) library via [npm](https://www.npmjs.com/). + +### Requirements +Pulsar Node.js client library is based on the C++ client library. +Follow [these instructions](client-libraries-cpp.md#compilation) and install the Pulsar C++ client library. + +### Compatibility + +Compatibility between each version of the Node.js client and the C++ client is as follows: + +| Node.js client | C++ client | +| :------------- | :------------- | +| 1.0.0 | 2.3.0 or later | +| 1.1.0 | 2.4.0 or later | +| 1.2.0 | 2.5.0 or later | + +If an incompatible version of the C++ client is installed, you may fail to build or run this library. 
+ +### Installation using npm + +Install the `pulsar-client` library via [npm](https://www.npmjs.com/): + +```shell + +$ npm install pulsar-client + +``` + +:::note + +Also, this library works only in Node.js 10.x or later because it uses the [`node-addon-api`](https://github.com/nodejs/node-addon-api) module to wrap the C++ library. + +::: + +## Connection URLs +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here is an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you are using [TLS encryption](security-tls-transport.md) or [TLS Authentication](security-tls-authentication.md), the URL looks like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you first need a client object. You can create a client instance using a `new` operator and the `Client` method, passing in a client options object (more on configuration [below](#client-configuration)). + +Here is an example: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + await client.close(); +})(); + +``` + +### Client configuration + +The following configurable parameters are available for Pulsar clients: + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `serviceUrl` | The connection URL for the Pulsar cluster. See [above](#connection-urls) for more info. | | +| `authentication` | Configure the authentication provider. (default: no authentication). See [TLS Authentication](security-tls-authentication.md) for more info. 
| | +| `operationTimeoutSeconds` | The timeout for Node.js client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries occur until this threshold is reached, at which point the operation fails. | 30 | +| `ioThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker). | 1 | +| `messageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)). | 1 | +| `concurrentLookupRequest` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 50000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 50000 | +| `tlsTrustCertsFilePath` | The file path for the trusted TLS certificate. | | +| `tlsValidateHostname` | The boolean value of setup whether to enable TLS hostname verification. | `false` | +| `tlsAllowInsecureConnection` | The boolean value of setup whether the Pulsar client accepts untrusted TLS certificate from broker. | `false` | +| `statsIntervalInSeconds` | Interval between each stat info. Stats is activated with positive statsInterval. The value should be set to 1 second at least | 600 | +| `log` | A function that is used for logging. | `console.log` | + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Node.js producers using a producer configuration object. 
+ +Here is an example: + +```JavaScript + +const producer = await client.createProducer({ + topic: 'my-topic', // or 'my-tenant/my-namespace/my-topic' to specify topic's tenant and namespace +}); + +await producer.send({ + data: Buffer.from("Hello, Pulsar"), +}); + +await producer.close(); + +``` + +> #### Promise operation +> When you create a new Pulsar producer, the operation returns `Promise` object and get producer instance or an error through executor function. +> In this example, using await operator instead of executor function. + +### Producer operations + +Pulsar Node.js producers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `send(Object)` | Publishes a [message](#messages) to the producer's topic. When the message is successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object whose result is the message ID runs executor function. | `Promise` | +| `flush()` | Sends message from send queue to Pulsar broker. When the message is successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object runs executor function. | `Promise` | +| `close()` | Closes the producer and releases all resources allocated to it. Once `close()` is called, no more messages are accepted from the publisher. This method returns a Promise object. It runs the executor function when all pending publish requests are persisted by Pulsar. If an error is thrown, no pending writes are retried. | `Promise` | +| `getProducerName()` | Getter method of the producer name. | `string` | +| `getTopic()` | Getter method of the name of the topic. | `string` | + +### Producer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer publishes messages. The topic format is `` or `//`. For example, `sample/ns1/my-topic`. 
| | +| `producerName` | A name for the producer. If you do not explicitly assign a name, Pulsar automatically generates a globally unique name. If you choose to explicitly assign a name, it needs to be unique across *all* Pulsar clusters, otherwise the creation operation throws an error. | | +| `sendTimeoutMs` | When publishing a message to a topic, the producer waits for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). If a message is not acknowledged within the threshold set by this parameter, an error is thrown. If you set `sendTimeoutMs` to -1, the timeout is set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30000 | +| `initialSequenceId` | The initial sequence ID of the message. When producer send message, add sequence ID to message. The ID is increased each time to send. | | +| `maxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `send` method fails *unless* `blockIfQueueFull` is set to `true`. | 1000 | +| `maxPendingMessagesAcrossPartitions` | The maximum size of the sum of partition's pending queue. | 50000 | +| `blockIfQueueFull` | If set to `true`, the producer's `send` method waits when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `maxPendingMessages` parameter); if set to `false` (the default), `send` operations fails and throw a error when the queue is full. | `false` | +| `messageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-messaging.md#partitioned-topics)). This logic is applied only when no key is set on messages. 
The available options are: round robin (`RoundRobinDistribution`), or publishing all messages to a single partition (`UseSinglePartition`, the default). | `UseSinglePartition` | +| `hashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `JavaStringHash` (the equivalent of `String.hashCode()` in Java), `Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library). | `BoostHash` | +| `compressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), and [`Zlib`](https://zlib.net/), [ZSTD](https://github.com/facebook/zstd/), [SNAPPY](https://github.com/google/snappy/). | Compression None | +| `batchingEnabled` | If set to `true`, the producer send message as batch. | `true` | +| `batchingMaxPublishDelayMs` | The maximum time of delay sending message in batching. | 10 | +| `batchingMaxMessages` | The maximum size of sending message in each time of batching. | 1000 | +| `properties` | The metadata of producer. 
| | + +### Producer example + +This example creates a Node.js producer for the `my-topic` topic and sends 10 messages to that topic: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a producer + const producer = await client.createProducer({ + topic: 'my-topic', + }); + + // Send messages + for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); + } + await producer.flush(); + + await producer.close(); + await client.close(); +})(); + +``` + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Node.js consumers using a consumer configuration object. + +Here is an example: + +```JavaScript + +const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', +}); + +const msg = await consumer.receive(); +console.log(msg.getData().toString()); +consumer.acknowledge(msg); + +await consumer.close(); + +``` + +> #### Promise operation +> When you create a new Pulsar consumer, the operation returns `Promise` object and get consumer instance or an error through executor function. +> In this example, using await operator instead of executor function. + +### Consumer operations + +Pulsar Node.js consumers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `receive()` | Receives a single message from the topic. When the message is available, the Promise object run executor function and get message object. | `Promise` | +| `receive(Number)` | Receives a single message from the topic with specific timeout in milliseconds. 
| `Promise` | +| `acknowledge(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message object. | `void` | +| `acknowledgeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID object. | `void` | +| `acknowledgeCumulative(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `acknowledgeCumulative` method returns void, and send the ack to the broker asynchronously. After that, the messages are *not* redelivered to the consumer. Cumulative acking can not be used with a [shared](concepts-messaging.md#shared) subscription type. | `void` | +| `acknowledgeCumulativeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message ID. | `void` | +| `negativeAcknowledge(Message)`| [Negatively acknowledges](reference-terminology.md#negative-acknowledgment-nack) a message to the Pulsar broker by message object. | `void` | +| `negativeAcknowledgeId(MessageId)` | [Negatively acknowledges](reference-terminology.md#negative-acknowledgment-nack) a message to the Pulsar broker by message ID object. | `void` | +| `close()` | Closes the consumer, disabling its ability to receive messages from the broker. | `Promise` | +| `unsubscribe()` | Unsubscribes the subscription. | `Promise` | + +### Consumer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar topic on which the consumer establishes a subscription and listen for messages. | | +| `topics` | The array of topics. | | +| `topicsPattern` | The regular expression for topics. | | +| `subscription` | The subscription name for this consumer. 
| | +| `subscriptionType` | Available options are `Exclusive`, `Shared`, `Key_Shared`, and `Failover`. | `Exclusive` | +| `subscriptionInitialPosition` | Initial position at which to set the cursor when subscribing to a topic for the first time. | `SubscriptionInitialPosition.Latest` | +| `ackTimeoutMs` | Acknowledge timeout in milliseconds. | 0 | +| `nAckRedeliverTimeoutMs` | Delay to wait before redelivering messages that failed to be processed. | 60000 | +| `receiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000 | +| `receiverQueueSizeAcrossPartitions` | Set the max total receiver queue size across partitions. This setting is used to reduce the receiver queue size for individual partitions if the total exceeds this value. | 50000 | +| `consumerName` | The name of the consumer. Currently (v2.4.1), [failover](concepts-messaging.md#failover) mode uses the consumer name for ordering. | | +| `properties` | The metadata of consumer. | | +| `listener`| A listener that is called for a message received. | | +| `readCompacted`| If `readCompacted` is enabled, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic's message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (like failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`. | false | + +### Consumer example + +This example creates a Node.js consumer with the `my-subscription` subscription on the `my-topic` topic, receives messages, prints the content that arrive, and acknowledges each message to the Pulsar broker for 10 times: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a consumer + const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', + subscriptionType: 'Exclusive', + }); + + // Receive messages + for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); + } + + await consumer.close(); + await client.close(); +})(); + +``` + +Instead a consumer can be created with `listener` to process messages. + +```JavaScript + +// Create a consumer +const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', + subscriptionType: 'Exclusive', + listener: (msg, msgConsumer) => { + console.log(msg.getData().toString()); + msgConsumer.acknowledge(msg); + }, +}); + +``` + +:::note + +Pulsar Node.js client uses [AsyncWorker](https://github.com/nodejs/node-addon-api/blob/main/doc/async_worker). Asynchronous operations such as creating consumers/producers and receiving/sending messages are performed in worker threads. +Until completion of these operations, worker threads are blocked. +Since there are only 4 worker threads by default, a called method may never complete. +To avoid this situation, you can set `UV_THREADPOOL_SIZE` to increase the number of worker threads, or define `listener` instead of calling `receive()` many times. 
+ +::: + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recently unacked message). You can [configure](#reader-configuration) Node.js readers using a reader configuration object. + +Here is an example: + +```JavaScript + +const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), +}); + +const msg = await reader.readNext(); +console.log(msg.getData().toString()); + +await reader.close(); + +``` + +### Reader operations + +Pulsar Node.js readers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `readNext()` | Receives the next message on the topic (analogous to the `receive` method for [consumers](#consumer-operations)). When the message is available, the Promise object run executor function and get message object. | `Promise` | +| `readNext(Number)` | Receives a single message from the topic with specific timeout in milliseconds. | `Promise` | +| `hasNext()` | Return whether the broker has next message in target topic. | `Boolean` | +| `close()` | Closes the reader, disabling its ability to receive messages from the broker. | `Promise` | + +### Reader configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader establishes a subscription and listen for messages. | | +| `startMessageId` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `Pulsar.MessageId.earliest` (the earliest available message on the topic), `Pulsar.MessageId.latest` (the latest available message on the topic), or a message ID object for a position that is not earliest or latest. 
| | +| `receiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `readNext`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 | +| `readerName` | The name of the reader. | | +| `subscriptionRolePrefix` | The subscription role prefix. | | +| `readCompacted` | If `readCompacted` is enabled, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic's message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (like failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`. | `false` | + + +### Reader example + +This example creates a Node.js reader with the `my-topic` topic, reads messages, and prints the content that arrive for 10 times: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, + }); + + // Create a reader + const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), + }); + + // read messages + for (let i = 0; i < 10; i += 1) { + const msg = await reader.readNext(); + console.log(msg.getData().toString()); + } + + await reader.close(); + await client.close(); +})(); + +``` + +## Messages + +In Pulsar Node.js client, you have to construct producer message object for producer. + +Here is an example message: + +```JavaScript + +const msg = { + data: Buffer.from('Hello, Pulsar'), + partitionKey: 'key1', + properties: { + 'foo': 'bar', + }, + eventTimestamp: Date.now(), + replicationClusters: [ + 'cluster1', + 'cluster2', + ], +} + +await producer.send(msg); + +``` + +The following keys are available for producer message objects: + +| Parameter | Description | +| :-------- | :---------- | +| `data` | The actual data payload of the message. | +| `properties` | A Object for any application-specific metadata attached to the message. | +| `eventTimestamp` | The timestamp associated with the message. | +| `sequenceId` | The sequence ID of the message. | +| `partitionKey` | The optional key associated with the message (particularly useful for things like topic compaction). | +| `replicationClusters` | The clusters to which this message is replicated. 
Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. | +| `deliverAt` | The absolute timestamp at or after which the message is delivered. | | +| `deliverAfter` | The relative delay after which the message is delivered. | | + +### Message object operations + +In Pulsar Node.js client, you can receive (or read) message object as consumer (or reader). + +The message object have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `getTopicName()` | Getter method of topic name. | `String` | +| `getProperties()` | Getter method of properties. | `Array` | +| `getData()` | Getter method of message data. | `Buffer` | +| `getMessageId()` | Getter method of [message id object](#message-id-object-operations). | `Object` | +| `getPublishTimestamp()` | Getter method of publish timestamp. | `Number` | +| `getEventTimestamp()` | Getter method of event timestamp. | `Number` | +| `getRedeliveryCount()` | Getter method of redelivery count. | `Number` | +| `getPartitionKey()` | Getter method of partition key. | `String` | + +### Message ID object operations + +In Pulsar Node.js client, you can get message id object from message object. + +The message id object have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `serialize()` | Serialize the message id into a Buffer for storing. | `Buffer` | +| `toString()` | Get message id as String. | `String` | + +The client has static method of message id object. You can access it as `Pulsar.MessageId.someStaticMethod` too. + +The following static methods are available for the message id object: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `earliest()` | MessageId representing the earliest, or oldest available message stored in the topic. 
| `Object` | +| `latest()` | MessageId representing the latest, or last published message in the topic. | `Object` | +| `deserialize(Buffer)` | Deserialize a message id object from a Buffer. | `Object` | + +## End-to-end encryption + +[End-to-end encryption](cookbooks-encryption.md#docsNav) allows applications to encrypt messages at producers and decrypt at consumers. + +### Configuration + +If you want to use the end-to-end encryption feature in the Node.js client, you need to configure `publicKeyPath` and `privateKeyPath` for both producer and consumer. + +``` + +publicKeyPath: "./public.pem" +privateKeyPath: "./private.pem" + +``` + +### Tutorial + +This section provides step-by-step instructions on how to use the end-to-end encryption feature in the Node.js client. + +**Prerequisite** + +- Pulsar C++ client 2.7.1 or later + +**Step** + +1. Create both public and private key pairs. + + **Input** + + ```shell + + openssl genrsa -out private.pem 2048 + openssl rsa -in private.pem -pubout -out public.pem + + ``` + +2. Create a producer to send encrypted messages. + + **Input** + + ```nodejs + + const Pulsar = require('pulsar-client'); + + (async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, + }); + + // Create a producer + const producer = await client.createProducer({ + topic: 'persistent://public/default/my-topic', + sendTimeoutMs: 30000, + batchingEnabled: true, + publicKeyPath: "./public.pem", + privateKeyPath: "./private.pem", + encryptionKey: "encryption-key" + }); + + console.log(producer.ProducerConfig) + // Send messages + for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); + } + await producer.flush(); + + await producer.close(); + await client.close(); + })(); + + ``` + +3. Create a consumer to receive encrypted messages. 
+ + **Input** + + ```nodejs + + const Pulsar = require('pulsar-client'); + + (async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://172.25.0.3:6650', + operationTimeoutSeconds: 30 + }); + + // Create a consumer + const consumer = await client.subscribe({ + topic: 'persistent://public/default/my-topic', + subscription: 'sub1', + subscriptionType: 'Shared', + ackTimeoutMs: 10000, + publicKeyPath: "./public.pem", + privateKeyPath: "./private.pem" + }); + + console.log(consumer) + // Receive messages + for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); + } + + await consumer.close(); + await client.close(); + })(); + + ``` + +4. Run the consumer to receive encrypted messages. + + **Input** + + ```shell + + node consumer.js + + ``` + +5. In a new terminal tab, run the producer to produce encrypted messages. + + **Input** + + ```shell + + node producer.js + + ``` + + Now you can see the producer sends messages and the consumer receives messages successfully. + + **Output** + + This is from the producer side. + + ``` + + Sent message: my-message-0 + Sent message: my-message-1 + Sent message: my-message-2 + Sent message: my-message-3 + Sent message: my-message-4 + Sent message: my-message-5 + Sent message: my-message-6 + Sent message: my-message-7 + Sent message: my-message-8 + Sent message: my-message-9 + + ``` + + This is from the consumer side. 
+ + ``` + + my-message-0 + my-message-1 + my-message-2 + my-message-3 + my-message-4 + my-message-5 + my-message-6 + my-message-7 + my-message-8 + my-message-9 + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-python.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-python.md new file mode 100644 index 0000000000000..1000023799044 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-python.md @@ -0,0 +1,641 @@ +--- +id: client-libraries-python +title: Pulsar Python client +sidebar_label: "Python" +original_id: client-libraries-python +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar Python client library is a wrapper over the existing [C++ client library](client-libraries-cpp.md) and exposes all of the [same features](/api/cpp). You can find the code in the [Python directory](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/python) of the C++ client code. + +All the methods in producer, consumer, and reader of a Python client are thread-safe. + +[pdoc](https://github.com/BurntSushi/pdoc)-generated API docs for the Python client are available [here](/api/python). + +## Install + +You can install the [`pulsar-client`](https://pypi.python.org/pypi/pulsar-client) library either via [PyPi](https://pypi.python.org/pypi), using [pip](#installation-using-pip), or by building the library from [source](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp). + +### Install using pip + +To install the `pulsar-client` library as a pre-built package using the [pip](https://pip.pypa.io/en/stable/) package manager: + +```shell + +$ pip install pulsar-client==@pulsar:version_number@ + +``` + +### Optional dependencies +If you install the client libraries on Linux to support services like Pulsar functions or Avro serialization, you can install optional components alongside the `pulsar-client` library. 
+ +```shell + +# avro serialization +$ pip install pulsar-client[avro]=='@pulsar:version_number@' + +# functions runtime +$ pip install pulsar-client[functions]=='@pulsar:version_number@' + +# all optional components +$ pip install pulsar-client[all]=='@pulsar:version_number@' + +``` + +Installation via PyPi is available for the following Python versions: + +Platform | Supported Python versions +:--------|:------------------------- +MacOS
    10.13 (High Sierra), 10.14 (Mojave)
    | 2.7, 3.7, 3.8, 3.9 +Linux | 2.7, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9 + +### Install from source + +To install the `pulsar-client` library by building from source, follow [instructions](client-libraries-cpp.md#compilation) and compile the Pulsar C++ client library. That builds the Python binding for the library. + +To install the built Python bindings: + +```shell + +$ git clone https://github.com/apache/pulsar +$ cd pulsar/pulsar-client-cpp/python +$ sudo python setup.py install + +``` + +## API Reference + +The complete Python API reference is available at [api/python](/api/python). + +## Examples + +You can find a variety of Python code examples for the [pulsar-client](/pulsar-client-cpp/python) library. + +### Producer example + +The following example creates a Python producer for the `my-topic` topic and sends 10 messages on that topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') + +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('Hello-%d' % i).encode('utf-8')) + +client.close() + +``` + +### Consumer example + +The following example creates a consumer with the `my-subscription` subscription name on the `my-topic` topic, receives incoming messages, prints the content and ID of messages that arrive, and acknowledges each message to the Pulsar broker. + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') + +consumer = client.subscribe('my-topic', 'my-subscription') + +while True: + msg = consumer.receive() + try: + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except Exception: + # Message failed to be processed + consumer.negative_acknowledge(msg) + +client.close() + +``` + +This example shows how to configure negative acknowledgement. 
+ +```python + +from pulsar import Client, schema +client = Client('pulsar://localhost:6650') +consumer = client.subscribe('negative_acks','test',schema=schema.StringSchema()) +producer = client.create_producer('negative_acks',schema=schema.StringSchema()) +for i in range(10): + print('send msg "hello-%d"' % i) + producer.send_async('hello-%d' % i, callback=None) +producer.flush() +for i in range(10): + msg = consumer.receive() + consumer.negative_acknowledge(msg) + print('receive and nack msg "%s"' % msg.data()) +for i in range(10): + msg = consumer.receive() + consumer.acknowledge(msg) + print('receive and ack msg "%s"' % msg.data()) +try: + # No more messages expected + msg = consumer.receive(100) +except: + print("no more msg") + pass + +``` + +### Reader interface example + +You can use the Pulsar Python API to use the Pulsar [reader interface](concepts-clients.md#reader-interface). Here's an example: + +```python + +# MessageId taken from a previously fetched message +msg_id = msg.message_id() + +reader = client.create_reader('my-topic', msg_id) + +while True: + msg = reader.read_next() + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # No acknowledgment + +``` + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously. To use multi-topic subscriptions, you can supply a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. 
+ +The following is an example: + +```python + +import re +consumer = client.subscribe(re.compile('persistent://public/default/topic-*'), 'my-subscription') +while True: + msg = consumer.receive() + try: + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except Exception: + # Message failed to be processed + consumer.negative_acknowledge(msg) +client.close() + +``` + +## Schema + +### Supported schema types + +You can use different builtin schema types in Pulsar. All the definitions are in the `pulsar.schema` package. + +| Schema | Notes | +| ------ | ----- | +| `BytesSchema` | Get the raw payload as a `bytes` object. No serialization/deserialization are performed. This is the default schema mode | +| `StringSchema` | Encode/decode payload as a UTF-8 string. Uses `str` objects | +| `JsonSchema` | Require record definition. Serializes the record into standard JSON payload | +| `AvroSchema` | Require record definition. Serializes in AVRO format | + +### Schema definition reference + +The schema definition is done through a class that inherits from `pulsar.schema.Record`. + +This class has a number of fields which can be of either +`pulsar.schema.Field` type or another nested `Record`. All the +fields are specified in the `pulsar.schema` package. The fields +are matching the AVRO fields types. + +| Field Type | Python Type | Notes | +| ---------- | ----------- | ----- | +| `Boolean` | `bool` | | +| `Integer` | `int` | | +| `Long` | `int` | | +| `Float` | `float` | | +| `Double` | `float` | | +| `Bytes` | `bytes` | | +| `String` | `str` | | +| `Array` | `list` | Need to specify record type for items. | +| `Map` | `dict` | Key is always `String`. Need to specify value type. | + +Additionally, any Python `Enum` type can be used as a valid field type. + +#### Fields parameters + +When adding a field, you can use these parameters in the constructor. 
+ +| Argument | Default | Notes | +| ---------- | --------| ----- | +| `default` | `None` | Set a default value for the field. Eg: `a = Integer(default=5)` | +| `required` | `False` | Mark the field as "required". It is set in the schema accordingly. | + +#### Schema definition examples + +##### Simple definition + +```python + +class Example(Record): + a = String() + b = Integer() + c = Array(String()) + i = Map(String()) + +``` + +##### Using enums + +```python + +from enum import Enum + +class Color(Enum): + red = 1 + green = 2 + blue = 3 + +class Example(Record): + name = String() + color = Color + +``` + +##### Complex types + +```python + +class MySubRecord(Record): + x = Integer() + y = Long() + z = String() + +class Example(Record): + a = String() + sub = MySubRecord() + +``` + +##### Set namespace for Avro schema + +Set the namespace for Avro Record schema using the special field `_avro_namespace`. + +```python + +class NamespaceDemo(Record): + _avro_namespace = 'xxx.xxx.xxx' + x = String() + y = Integer() + +``` + +The schema definition is like this. + +``` + +{ + 'name': 'NamespaceDemo', 'namespace': 'xxx.xxx.xxx', 'type': 'record', 'fields': [ + {'name': 'x', 'type': ['null', 'string']}, + {'name': 'y', 'type': ['null', 'int']} + ] +} + +``` + +### Declare and validate schema + +You can send messages using `BytesSchema`, `StringSchema`, `AvroSchema`, and `JsonSchema`. + +Before the producer is created, the Pulsar broker validates that the existing topic schema is the correct type and that the format is compatible with the schema definition of a class. If the format of the topic schema is incompatible with the schema definition, an exception occurs in the producer creation. + +Once a producer is created with a certain schema definition, it only accepts objects that are instances of the declared schema class. + +Similarly, for a consumer or reader, the consumer returns an object (which is an instance of the schema record class) rather than raw bytes. 
+ +**Example** + +```python + +consumer = client.subscribe( + topic='my-topic', + subscription_name='my-subscription', + schema=AvroSchema(Example) ) + +while True: + msg = consumer.receive() + ex = msg.value() + try: + print("Received message a={} b={} c={}".format(ex.a, ex.b, ex.c)) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except Exception: + # Message failed to be processed + consumer.negative_acknowledge(msg) + +``` + +````mdx-code-block + + + + +You can send byte data using a `BytesSchema`. + +**Example** + +```python + +producer = client.create_producer( + 'bytes-schema-topic', + schema=BytesSchema()) +producer.send(b"Hello") + +consumer = client.subscribe( + 'bytes-schema-topic', + 'sub', + schema=BytesSchema()) +msg = consumer.receive() +data = msg.value() + +``` + + + + +You can send string data using a `StringSchema`. + +**Example** + +```python + +producer = client.create_producer( + 'string-schema-topic', + schema=StringSchema()) +producer.send("Hello") + +consumer = client.subscribe( + 'string-schema-topic', + 'sub', + schema=StringSchema()) +msg = consumer.receive() +str = msg.value() + +``` + + + + +You can declare an `AvroSchema` using one of the following methods. + +#### Method 1: Record + +You can declare an `AvroSchema` by passing a class that inherits +from `pulsar.schema.Record` and defines the fields as +class variables. + +**Example** + +```python + +class Example(Record): + a = Integer() + b = Integer() + +producer = client.create_producer( + 'avro-schema-topic', + schema=AvroSchema(Example)) +r = Example(a=1, b=2) +producer.send(r) + +consumer = client.subscribe( + 'avro-schema-topic', + 'sub', + schema=AvroSchema(Example)) +msg = consumer.receive() +e = msg.value() + +``` + +#### Method 2: JSON definition + +You can declare an `AvroSchema` using JSON. In this case, Avro schemas are defined using JSON. + +**Example** + +Below is an `AvroSchema` defined using a JSON file (_company.avsc_). 
+ +```json + +{ + "doc": "this is doc", + "namespace": "example.avro", + "type": "record", + "name": "Company", + "fields": [ + {"name": "name", "type": ["null", "string"]}, + {"name": "address", "type": ["null", "string"]}, + {"name": "employees", "type": ["null", {"type": "array", "items": { + "type": "record", + "name": "Employee", + "fields": [ + {"name": "name", "type": ["null", "string"]}, + {"name": "age", "type": ["null", "int"]} + ] + }}]}, + {"name": "labels", "type": ["null", {"type": "map", "values": "string"}]} + ] +} + +``` + +You can load a schema definition from file by using [`avro.schema`](http://avro.apache.org/docs/current/gettingstartedpython.html) or [`fastavro.schema`](https://fastavro.readthedocs.io/en/latest/schema.html#fastavro._schema_py.load_schema). + +If you use the "JSON definition" method to declare an `AvroSchema`, pay attention to the following points: + +- You need to use [Python dict](https://developers.google.com/edu/python/dict-files) to produce and consume messages, which is different from using the "Record" method. + +- When generating an `AvroSchema` object, set the `_record_cls` parameter to `None`. + +**Example** + +``` + +from fastavro.schema import load_schema +from pulsar.schema import * +schema_definition = load_schema("examples/company.avsc") +avro_schema = AvroSchema(None, schema_definition=schema_definition) +producer = client.create_producer( + topic=topic, + schema=avro_schema) +consumer = client.subscribe(topic, 'test', schema=avro_schema) +company = { + "name": "company-name" + str(i), + "address": 'xxx road xxx street ' + str(i), + "employees": [ + {"name": "user" + str(i), "age": 20 + i}, + {"name": "user" + str(i), "age": 30 + i}, + {"name": "user" + str(i), "age": 35 + i}, + ], + "labels": { + "industry": "software" + str(i), + "scale": ">100", + "funds": "1000000.0" + } +} +producer.send(company) +msg = consumer.receive() +# Users could get a dict object by `value()` method.
+msg.value() + +``` + + + + +#### Record + +You can declare a `JsonSchema` by passing a class that inherits +from `pulsar.schema.Record` and defines the fields as class variables. This is similar to using `AvroSchema`. The only difference is to use `JsonSchema` instead of `AvroSchema` when defining schema type as shown below. For how to use `AvroSchema` via record, see [here](client-libraries-python.md#method-1-record). + +``` + +producer = client.create_producer( + 'avro-schema-topic', + schema=JsonSchema(Example)) + +consumer = client.subscribe( + 'avro-schema-topic', + 'sub', + schema=JsonSchema(Example)) + +``` + + + + +```` + +## End-to-end encryption + +[End-to-end encryption](cookbooks-encryption.md#docsNav) allows applications to encrypt messages at producers and decrypt messages at consumers. + +### Configuration + +To use the end-to-end encryption feature in the Python client, you need to configure `publicKeyPath` and `privateKeyPath` for both producer and consumer. + +``` + +publicKeyPath: "./public.pem" +privateKeyPath: "./private.pem" + +``` + +### Tutorial + +This section provides step-by-step instructions on how to use the end-to-end encryption feature in the Python client. + +**Prerequisite** + +- Pulsar Python client 2.7.1 or later + +**Step** + +1. Create both public and private key pairs. + + **Input** + + ```shell + + openssl genrsa -out private.pem 2048 + openssl rsa -in private.pem -pubout -out public.pem + + ``` + +2. Create a producer to send encrypted messages. 
+ + **Input** + + ```python + + import pulsar + + publicKeyPath = "./public.pem" + privateKeyPath = "./private.pem" + crypto_key_reader = pulsar.CryptoKeyReader(publicKeyPath, privateKeyPath) + client = pulsar.Client('pulsar://localhost:6650') + producer = client.create_producer(topic='encryption', encryption_key='encryption', crypto_key_reader=crypto_key_reader) + producer.send('encryption message'.encode('utf8')) + print('sent message') + producer.close() + client.close() + + ``` + +3. Create a consumer to receive encrypted messages. + + **Input** + + ```python + + import pulsar + + publicKeyPath = "./public.pem" + privateKeyPath = "./private.pem" + crypto_key_reader = pulsar.CryptoKeyReader(publicKeyPath, privateKeyPath) + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe(topic='encryption', subscription_name='encryption-sub', crypto_key_reader=crypto_key_reader) + msg = consumer.receive() + print("Received msg '{}' id = '{}'".format(msg.data(), msg.message_id())) + consumer.close() + client.close() + + ``` + +4. Run the consumer to receive encrypted messages. + + **Input** + + ```shell + + python consumer.py + + ``` + +5. In a new terminal tab, run the producer to produce encrypted messages. + + **Input** + + ```shell + + python producer.py + + ``` + + Now you can see the producer sends messages and the consumer receives messages successfully. + + **Output** + + This is from the producer side. + + ``` + + sent message + + ``` + + This is from the consumer side. 
+ + ``` + + Received msg 'encryption message' id = '(0,0,-1,-1)' + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-rest.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-rest.md new file mode 100644 index 0000000000000..1b26eedc01836 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-rest.md @@ -0,0 +1,134 @@ +--- +id: client-libraries-rest +title: Pulsar REST +sidebar_label: "REST" +original_id: client-libraries-rest +--- + +Pulsar not only provides REST endpoints to manage resources in Pulsar clusters, but also provides methods to query the state for those resources. In addition, Pulsar REST provides a simple way to interact with Pulsar **without using client libraries**, which is convenient for applications to use HTTP to interact with Pulsar. + +## Connection + +To connect to Pulsar, you need to specify a URL. + +- Produce messages to non-partitioned or partitioned topics + + ``` + + brokerUrl:{8080/8081}/topics/{persistent/non-persistent}/{my-tenant}/{my-namespace}/{my-topic} + + ``` + +- Produce messages to specific partitions of partitioned topics + + ``` + + brokerUrl:{8080/8081}/topics/{persistent/non-persistent}/{my-tenant}/{my-namespace}/{my-topic}/partitions/{partition-number} + + ``` + +## Producer + +Currently, you can produce messages to the following destinations with tools like cURL or Postman via REST. + +- Non-partitioned or partitioned topics + +- Specific partitions of partitioned topics + +:::note + +You can only produce messages to **topics that already exist** in Pulsar via REST. + +::: + +Consuming and reading messages via REST will be supported in the future. + +### Message + +- Below is the structure of a request payload. + + Parameter|Required?|Description + |---|---|--- + `schemaVersion`|No| Schema version of existing schema used for this message

    You need to provide one of the following:

    - `schemaVersion`
    - `keySchema`/`valueSchema`

    If both of them are provided, then `schemaVersion` is used + `keySchema/valueSchema`|No|Key schema / Value schema used for this message + `producerName`|No|Producer name + `Messages[] SingleMessage`|Yes|Messages to be sent + +- Below is the structure of a message. + + Parameter|Required?|Type|Description + |---|---|---|--- + `payload`|Yes|`String`|Actual message payload

    Messages are sent in strings and encoded with given schemas on the server side + `properties`|No|`Map`|Custom properties + `key`|No|`String`|Partition key + `replicationClusters`|No|`List`|Clusters to which messages replicate + `eventTime`|No|`String`|Message event time + `sequenceId`|No|`long`|Message sequence ID + `disableReplication`|No|`boolean`|Whether to disable replication of messages + `deliverAt`|No|`long`|Deliver messages only at or after specified absolute timestamp + `deliverAfterMs`|No|`long`|Deliver messages only after specified relative delay (in milliseconds) + +### Schema + +- Currently, Primitive, Avro, JSON, and KeyValue schemas are supported. + +- For Primitive, Avro and JSON schemas, schemas should be provided as the full schema encoded as a string. + +- If the schema is not set, messages are encoded with string schema. + +### Example + +Below is an example of sending messages to topics using JSON schema via REST. + +Assume that you send messages representing the following class. + +```java + + class Seller { + public String state; + public String street; + public long zipCode; + } + + class PC { + public String brand; + public String model; + public int year; + public GPU gpu; + public Seller seller; + } + +``` + +Send messages to topics with JSON schema using the command below. 
+ +```shell + +curl --location --request POST 'brokerUrl:{8080/8081}/topics/{persistent/non-persistent}/{my-tenant}/{my-namespace}/{my-topic}' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "valueSchema": "{\"name\":\"\",\"schema\":\"eyJ0eXBlIjoicmVjb3JkIiwibmFtZSI6IlBDIiwibmFtZXNwYWNlIjoib3JnLmFwYWNoZS5wdWxzYXIuYnJva2VyLmFkbWluLlRvcGljc1Rlc3QiLCJmaWVsZHMiOlt7Im5hbWUiOiJicmFuZCIsInR5cGUiOlsibnVsbCIsInN0cmluZyJdLCJkZWZhdWx0IjpudWxsfSx7Im5hbWUiOiJncHUiLCJ0eXBlIjpbIm51bGwiLHsidHlwZSI6ImVudW0iLCJuYW1lIjoiR1BVIiwic3ltYm9scyI6WyJBTUQiLCJOVklESUEiXX1dLCJkZWZhdWx0IjpudWxsfSx7Im5hbWUiOiJtb2RlbCIsInR5cGUiOlsibnVsbCIsInN0cmluZyJdLCJkZWZhdWx0IjpudWxsfSx7Im5hbWUiOiJzZWxsZXIiLCJ0eXBlIjpbIm51bGwiLHsidHlwZSI6InJlY29yZCIsIm5hbWUiOiJTZWxsZXIiLCJmaWVsZHMiOlt7Im5hbWUiOiJzdGF0ZSIsInR5cGUiOlsibnVsbCIsInN0cmluZyJdLCJkZWZhdWx0IjpudWxsfSx7Im5hbWUiOiJzdHJlZXQiLCJ0eXBlIjpbIm51bGwiLCJzdHJpbmciXSwiZGVmYXVsdCI6bnVsbH0seyJuYW1lIjoiemlwQ29kZSIsInR5cGUiOiJsb25nIn1dfV0sImRlZmF1bHQiOm51bGx9LHsibmFtZSI6InllYXIiLCJ0eXBlIjoiaW50In1dfQ==\",\"type\":\"JSON\",\"properties\":{\"__jsr310ConversionEnabled\":\"false\",\"__alwaysAllowNull\":\"true\"},\"schemaDefinition\":\"{\\\"type\\\":\\\"record\\\",\\\"name\\\":\\\"PC\\\",\\\"namespace\\\":\\\"org.apache.pulsar.broker.admin.TopicsTest\\\",\\\"fields\\\":[{\\\"name\\\":\\\"brand\\\",\\\"type\\\":[\\\"null\\\",\\\"string\\\"],\\\"default\\\":null},{\\\"name\\\":\\\"gpu\\\",\\\"type\\\":[\\\"null\\\",{\\\"type\\\":\\\"enum\\\",\\\"name\\\":\\\"GPU\\\",\\\"symbols\\\":[\\\"AMD\\\",\\\"NVIDIA\\\"]}],\\\"default\\\":null},{\\\"name\\\":\\\"model\\\",\\\"type\\\":[\\\"null\\\",\\\"string\\\"],\\\"default\\\":null},{\\\"name\\\":\\\"seller\\\",\\\"type\\\":[\\\"null\\\",{\\\"type\\\":\\\"record\\\",\\\"name\\\":\\\"Seller\\\",\\\"fields\\\":[{\\\"name\\\":\\\"state\\\",\\\"type\\\":[\\\"null\\\",\\\"string\\\"],\\\"default\\\":null},{\\\"name\\\":\\\"street\\\",\\\"type\\\":[\\\"null\\\",\\\"string\\\"],\\\"default\\\":null},{\\\"name\\\":\\\"zipCod
e\\\",\\\"type\\\":\\\"long\\\"}]}],\\\"default\\\":null},{\\\"name\\\":\\\"year\\\",\\\"type\\\":\\\"int\\\"}]}\"}", + +// Schema data is just the base 64 encoded schemaDefinition. + + "producerName": "rest-producer", + "messages": [ + { + "key":"my-key", + "payload":"{\"brand\":\"dell\",\"model\":\"alienware\",\"year\":2021,\"gpu\":\"AMD\",\"seller\":{\"state\":\"WA\",\"street\":\"main street\",\"zipCode\":98004}}", + "eventTime":1603045262772, + "sequenceId":1 + }, + { + "key":"my-key", + "payload":"{\"brand\":\"asus\",\"model\":\"rog\",\"year\":2020,\"gpu\":\"NVIDIA\",\"seller\":{\"state\":\"CA\",\"street\":\"back street\",\"zipCode\":90232}}", + "eventTime":1603045262772, + "sequenceId":2 + } + ] +} +` +// Sample message + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries-websocket.md b/site2/website/versioned_docs/version-2.10.x/client-libraries-websocket.md new file mode 100644 index 0000000000000..145866e41644b --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries-websocket.md @@ -0,0 +1,664 @@ +--- +id: client-libraries-websocket +title: Pulsar WebSocket API +sidebar_label: "WebSocket" +original_id: client-libraries-websocket +--- + +Pulsar [WebSocket](https://developer.mozilla.org/en-US/docs/Web/API/WebSockets_API) API provides a simple way to interact with Pulsar using languages that do not have an official [client library](getting-started-clients.md). Through WebSocket, you can publish and consume messages and use features available on the [Client Features Matrix](https://github.com/apache/pulsar/wiki/Client-Features-Matrix) page. + + +> You can use Pulsar WebSocket API with any WebSocket client library. See examples for Python and Node.js [below](#client-examples). + +## Running the WebSocket service + +The standalone variant of Pulsar that we recommend using for [local development](getting-started-standalone.md) already has the WebSocket service enabled. 
+ +In non-standalone mode, there are two ways to deploy the WebSocket service: + +* [embedded](#embedded-with-a-pulsar-broker) with a Pulsar broker +* as a [separate component](#as-a-separate-component) + +### Embedded with a Pulsar broker + +In this mode, the WebSocket service will run within the same HTTP service that's already running in the broker. To enable this mode, set the [`webSocketServiceEnabled`](reference-configuration.md#broker-webSocketServiceEnabled) parameter in the [`conf/broker.conf`](reference-configuration.md#broker) configuration file in your installation. + +```properties + +webSocketServiceEnabled=true + +``` + +### As a separate component + +In this mode, the WebSocket service will be run from a Pulsar [broker](reference-terminology.md#broker) as a separate service. Configuration for this mode is handled in the [`conf/websocket.conf`](reference-configuration.md#websocket) configuration file. You'll need to set *at least* the following parameters: + +* [`configurationMetadataStoreUrl`](reference-configuration.md#websocket) +* [`webServicePort`](reference-configuration.md#websocket-webServicePort) +* [`clusterName`](reference-configuration.md#websocket-clusterName) + +Here's an example: + +```properties + +configurationMetadataStoreUrl=zk1:2181,zk2:2181,zk3:2181 +webServicePort=8080 +clusterName=my-cluster + +``` + +### Security settings + +To enable TLS encryption on WebSocket service: + +```properties + +tlsEnabled=true +tlsAllowInsecureConnection=false +tlsCertificateFilePath=/path/to/client-websocket.cert.pem +tlsKeyFilePath=/path/to/client-websocket.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +### Starting the broker + +When the configuration is set, you can start the service using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) tool: + +```shell + +$ bin/pulsar-daemon start websocket + +``` + +## API Reference + +Pulsar's WebSocket API offers three endpoints for [producing](#producer-endpoint) messages, 
[consuming](#consumer-endpoint) messages and [reading](#reader-endpoint) messages. + +All exchanges via the WebSocket API use JSON. + +### Authentication + +#### Browser javascript WebSocket client + +Use the query param `token` to transport the authentication token. + +```http + +ws://broker-service-url:8080/path?token=token + +``` + +### Producer endpoint + +The producer endpoint requires you to specify a tenant, namespace, and topic in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/producer/persistent/:tenant/:namespace/:topic + +``` + +##### Query param + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`sendTimeoutMillis` | long | no | Send timeout (default: 30 secs) +`batchingEnabled` | boolean | no | Enable batching of messages (default: false) +`batchingMaxMessages` | int | no | Maximum number of messages permitted in a batch (default: 1000) +`maxPendingMessages` | int | no | Set the max size of the internal-queue holding the messages (default: 1000) +`batchingMaxPublishDelay` | long | no | Time period within which the messages will be batched (default: 10ms) +`messageRoutingMode` | string | no | Message [routing mode](/api/client/index.html?org/apache/pulsar/client/api/ProducerConfiguration.MessageRoutingMode.html) for the partitioned producer: `SinglePartition`, `RoundRobinPartition` +`compressionType` | string | no | Compression [type](/api/client/index.html?org/apache/pulsar/client/api/CompressionType.html): `LZ4`, `ZLIB` +`producerName` | string | no | Specify the name for the producer. Pulsar enforces that only one producer with the same name can publish on a topic +`initialSequenceId` | long | no | Set the baseline for the sequence ids for messages published by the producer. 
+`hashingScheme` | string | no | [Hashing function](/api/client/org/apache/pulsar/client/api/ProducerConfiguration.HashingScheme.html) to use when publishing on a partitioned topic: `JavaStringHash`, `Murmur3_32Hash` +`token` | string | no | Authentication token, this is used for the browser javascript client + + +#### Publishing a message + +```json + +{ + "payload": "SGVsbG8gV29ybGQ=", + "properties": {"key1": "value1", "key2": "value2"}, + "context": "1" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`payload` | string | yes | Base-64 encoded payload +`properties` | key-value pairs | no | Application-defined properties +`context` | string | no | Application-defined request identifier +`key` | string | no | For partitioned topics, decides which partition to use +`replicationClusters` | array | no | Restrict replication to this list of [clusters](reference-terminology.md#cluster), specified by name + + +##### Example success response + +```json + +{ + "result": "ok", + "messageId": "CAAQAw==", + "context": "1" + } + +``` + +##### Example failure response + +```json + + { + "result": "send-error:3", + "errorMsg": "Failed to de-serialize from JSON", + "context": "1" + } + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`result` | string | yes | `ok` if successful or an error message if unsuccessful +`messageId` | string | yes | Message ID assigned to the published message +`context` | string | no | Application-defined request identifier + + +### Consumer endpoint + +The consumer endpoint requires you to specify a tenant, namespace, and topic, as well as a subscription, in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/consumer/persistent/:tenant/:namespace/:topic/:subscription + +``` + +##### Query param + +Key | Type | Required? 
| Explanation +:---|:-----|:----------|:----------- +`ackTimeoutMillis` | long | no | Set the timeout for unacked messages (default: 0) +`subscriptionType` | string | no | [Subscription type](/api/client/index.html?org/apache/pulsar/client/api/SubscriptionType.html): `Exclusive`, `Failover`, `Shared`, `Key_Shared` +`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000) +`consumerName` | string | no | Consumer name +`priorityLevel` | int | no | Define a [priority](/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setPriorityLevel-int-) for the consumer +`maxRedeliverCount` | int | no | Define a [maxRedeliverCount](/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#deadLetterPolicy-org.apache.pulsar.client.api.DeadLetterPolicy-) for the consumer (default: 0). Activates [Dead Letter Topic](https://github.com/apache/pulsar/wiki/PIP-22%3A-Pulsar-Dead-Letter-Topic) feature. +`deadLetterTopic` | string | no | Define a [deadLetterTopic](/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#deadLetterPolicy-org.apache.pulsar.client.api.DeadLetterPolicy-) for the consumer (default: {topic}-{subscription}-DLQ). Activates [Dead Letter Topic](https://github.com/apache/pulsar/wiki/PIP-22%3A-Pulsar-Dead-Letter-Topic) feature. +`pullMode` | boolean | no | Enable pull mode (default: false). See "Flow Control" below. +`negativeAckRedeliveryDelay` | int | no | When a message is negatively acknowledged, the delay time before the message is redelivered (in milliseconds). The default value is 60000. +`token` | string | no | Authentication token, this is used for the browser javascript client + +NB: these parameters (except `pullMode`) apply to the internal consumer of the WebSocket service. +So messages will be subject to the redelivery settings as soon as they get into the receive queue, +even if the client doesn't consume on the WebSocket. 
+ +##### Receiving messages + +Server will push messages on the WebSocket session: + +```json + +{ + "messageId": "CAMQADAA", + "payload": "hvXcJvHW7kOSrUn17P2q71RA5SdiXwZBqw==", + "properties": {}, + "publishTime": "2021-10-29T16:01:38.967-07:00", + "redeliveryCount": 0, + "encryptionContext": { + "keys": { + "client-rsa.pem": { + "keyValue": "jEuwS+PeUzmCo7IfLNxqoj4h7txbLjCQjkwpaw5AWJfZ2xoIdMkOuWDkOsqgFmWwxiecakS6GOZHs94x3sxzKHQx9Oe1jpwBg2e7L4fd26pp+WmAiLm/ArZJo6JotTeFSvKO3u/yQtGTZojDDQxiqFOQ1ZbMdtMZA8DpSMuq+Zx7PqLo43UdW1+krjQfE5WD+y+qE3LJQfwyVDnXxoRtqWLpVsAROlN2LxaMbaftv5HckoejJoB4xpf/dPOUqhnRstwQHf6klKT5iNhjsY4usACt78uILT0pEPd14h8wEBidBz/vAlC/zVMEqiDVzgNS7dqEYS4iHbf7cnWVCn3Hxw==", + "metadata": {} + } + }, + "param": "Tfu1PxVm6S9D3+Hk", + "compressionType": "NONE", + "uncompressedMessageSize": 0, + "batchSize": { + "empty": false, + "present": true + } + } +} + +``` + +Below are the parameters in the WebSocket consumer response. + +- General parameters + + Key | Type | Required? | Explanation + :---|:-----|:----------|:----------- + `messageId` | string | yes | Message ID + `payload` | string | yes | Base-64 encoded payload + `publishTime` | string | yes | Publish timestamp + `redeliveryCount` | number | yes | Number of times this message was already delivered + `properties` | key-value pairs | no | Application-defined properties + `key` | string | no | Original routing key set by producer + `encryptionContext` | EncryptionContext | no | Encryption context that consumers can use to decrypt received messages + `param` | string | no | Initialization vector for cipher (Base64 encoding) + `batchSize` | string | no | Number of entries in a message (if it is a batch message) + `uncompressedMessageSize` | string | no | Message size before compression + `compressionType` | string | no | Algorithm used to compress the message payload + +- `encryptionContext` related parameter + + Key | Type | Required? 
| Explanation + :---|:-----|:----------|:----------- + `keys` |key-EncryptionKey pairs | yes | Key in `key-EncryptionKey` pairs is an encryption key name. Value in `key-EncryptionKey` pairs is an encryption key object. + +- `encryptionKey` related parameters + + Key | Type | Required? | Explanation + :---|:-----|:----------|:----------- + `keyValue` | string | yes | Encryption key (Base64 encoding) + `metadata` | key-value pairs | no | Application-defined metadata + +#### Acknowledging the message + +Consumer needs to acknowledge the successful processing of the message to +have the Pulsar broker delete it. + +```json + +{ + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Negatively acknowledging messages + +```json + +{ + "type": "negativeAcknowledge", + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Flow control + +##### Push Mode + +By default (`pullMode=false`), the consumer endpoint will use the `receiverQueueSize` parameter both to size its +internal receive queue and to limit the number of unacknowledged messages that are passed to the WebSocket client. +In this mode, if you don't send acknowledgements, the Pulsar WebSocket service will stop sending messages after reaching +`receiverQueueSize` unacked messages sent to the WebSocket client. + +##### Pull Mode + +If you set `pullMode` to `true`, the WebSocket client will need to send `permit` commands to permit the +Pulsar WebSocket service to send more messages. + +```json + +{ + "type": "permit", + "permitMessages": 100 +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. 
Must be `permit` +`permitMessages`| int | yes | Number of messages to permit + +NB: in this mode it's possible to acknowledge messages in a different connection. + +#### Check if reach end of topic + +Consumer can check if it has reached end of topic by sending `isEndOfTopic` request. + +**Request** + +```json + +{ + "type": "isEndOfTopic" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. Must be `isEndOfTopic` + +**Response** + +```json + +{ + "endOfTopic": "true/false" + } + +``` + +### Reader endpoint + +The reader endpoint requires you to specify a tenant, namespace, and topic in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/reader/persistent/:tenant/:namespace/:topic + +``` + +##### Query param + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`readerName` | string | no | Reader name +`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000) +`messageId` | int or enum | no | Message ID to start from, `earliest` or `latest` (default: `latest`) +`token` | string | no | Authentication token, this is used for the browser javascript client + +##### Receiving messages + +Server will push messages on the WebSocket session: + +```json + +{ + "messageId": "CAAQAw==", + "payload": "SGVsbG8gV29ybGQ=", + "properties": {"key1": "value1", "key2": "value2"}, + "publishTime": "2016-08-30 16:45:57.785", + "redeliveryCount": 4 +} + +``` + +Key | Type | Required? 
| Explanation +:---|:-----|:----------|:----------- +`messageId` | string | yes | Message ID +`payload` | string | yes | Base-64 encoded payload +`publishTime` | string | yes | Publish timestamp +`redeliveryCount` | number | yes | Number of times this message was already delivered +`properties` | key-value pairs | no | Application-defined properties +`key` | string | no | Original routing key set by producer + +#### Acknowledging the message + +**In WebSocket**, Reader needs to acknowledge the successful processing of the message to +have the Pulsar WebSocket service update the number of pending messages. +If you don't send acknowledgements, Pulsar WebSocket service will stop sending messages after reaching the pendingMessages limit. + +```json + +{ + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Check if reach end of topic + +Consumer can check if it has reached end of topic by sending `isEndOfTopic` request. + +**Request** + +```json + +{ + "type": "isEndOfTopic" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. Must be `isEndOfTopic` + +**Response** + +```json + +{ + "endOfTopic": "true/false" + } + +``` + +### Error codes + +In case of error the server will close the WebSocket session using the +following error codes: + +Error Code | Error Message +:----------|:------------- +1 | Failed to create producer +2 | Failed to subscribe +3 | Failed to deserialize from JSON +4 | Failed to serialize to JSON +5 | Failed to authenticate client +6 | Client is not authorized +7 | Invalid payload encoding +8 | Unknown error + +> The application is responsible for re-establishing a new WebSocket session after a backoff period. + +## Client examples + +Below you'll find code examples for the Pulsar WebSocket API in [Python](#python) and [Node.js](#nodejs). 
+ +### Python + +This example uses the [`websocket-client`](https://pypi.python.org/pypi/websocket-client) package. You can install it using [pip](https://pypi.python.org/pypi/pip): + +```shell + +$ pip install websocket-client + +``` + +You can also download it from [PyPI](https://pypi.python.org/pypi/websocket-client). + +#### Python producer + +Here's an example Python producer that sends a simple message to a Pulsar [topic](reference-terminology.md#topic): + +```python + +import websocket, base64, json + +# If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/producer/persistent/public/default/my-topic' + +ws = websocket.create_connection(TOPIC) + +# encode message +s = "Hello World" +firstEncoded = s.encode("UTF-8") +binaryEncoded = base64.b64encode(firstEncoded) +payloadString = binaryEncoded.decode('UTF-8') + +# Send one message as JSON +ws.send(json.dumps({ + 'payload' : payloadString, + 'properties': { + 'key1' : 'value1', + 'key2' : 'value2' + }, + 'context' : 5 +})) + +response = json.loads(ws.recv()) +if response['result'] == 'ok': + print( 'Message published successfully') +else: + print('Failed to publish message:', response) +ws.close() + +``` + +#### Python consumer + +Here's an example Python consumer that listens on a Pulsar topic and prints the message ID whenever a message arrives: + +```python + +import websocket, base64, json + +# If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. 
+enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/consumer/persistent/public/default/my-topic/my-sub' + +ws = websocket.create_connection(TOPIC) + +while True: + msg = json.loads(ws.recv()) + if not msg: break + + print( "Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload']))) + + # Acknowledge successful processing + ws.send(json.dumps({'messageId' : msg['messageId']})) + +ws.close() + +``` + +#### Python reader + +Here's an example Python reader that listens on a Pulsar topic and prints the message ID whenever a message arrives: + +```python + +import websocket, base64, json + +# If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/reader/persistent/public/default/my-topic' +ws = websocket.create_connection(TOPIC) + +while True: + msg = json.loads(ws.recv()) + if not msg: break + + print ( "Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload']))) + + # Acknowledge successful processing + ws.send(json.dumps({'messageId' : msg['messageId']})) + +ws.close() + +``` + +### Node.js + +This example uses the [`ws`](https://websockets.github.io/ws/) package. You can install it using [npm](https://www.npmjs.com/): + +```shell + +$ npm install ws + +``` + +#### Node.js producer + +Here's an example Node.js producer that sends a simple message to a Pulsar topic: + +```javascript + +const WebSocket = require('ws'); + +// If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 
'wss' : 'ws'}://localhost:8080/ws/v2/producer/persistent/public/default/my-topic`; +const ws = new WebSocket(topic); + +var message = { + "payload" : new Buffer("Hello World").toString('base64'), + "properties": { + "key1" : "value1", + "key2" : "value2" + }, + "context" : "1" +}; + +ws.on('open', function() { + // Send one message + ws.send(JSON.stringify(message)); +}); + +ws.on('message', function(message) { + console.log('received ack: %s', message); +}); + +``` + +#### Node.js consumer + +Here's an example Node.js consumer that listens on the same topic used by the producer above: + +```javascript + +const WebSocket = require('ws'); + +// If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 'wss' : 'ws'}://localhost:8080/ws/v2/consumer/persistent/public/default/my-topic/my-sub`; +const ws = new WebSocket(topic); + +ws.on('message', function(message) { + var receiveMsg = JSON.parse(message); + console.log('Received: %s - payload: %s', message, new Buffer(receiveMsg.payload, 'base64').toString()); + var ackMsg = {"messageId" : receiveMsg.messageId}; + ws.send(JSON.stringify(ackMsg)); +}); + +``` + +#### NodeJS reader + +```javascript + +const WebSocket = require('ws'); + +// If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 
'wss' : 'ws'}://localhost:8080/ws/v2/reader/persistent/public/default/my-topic`; +const ws = new WebSocket(topic); + +ws.on('message', function(message) { + var receiveMsg = JSON.parse(message); + console.log('Received: %s - payload: %s', message, new Buffer(receiveMsg.payload, 'base64').toString()); + var ackMsg = {"messageId" : receiveMsg.messageId}; + ws.send(JSON.stringify(ackMsg)); +}); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/client-libraries.md b/site2/website/versioned_docs/version-2.10.x/client-libraries.md new file mode 100644 index 0000000000000..6cdc1e615c81f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/client-libraries.md @@ -0,0 +1,45 @@ +--- +id: client-libraries +title: Pulsar client libraries +sidebar_label: "Overview" +original_id: client-libraries +--- + +Pulsar supports the following client libraries: + +|Language|Documentation|Release note|Code repo +|---|---|---|--- +Java |- [User doc](client-libraries-java.md)

    - [API doc](/api/client/)|[Here](/release-notes/)|[Here](https://github.com/apache/pulsar/tree/master/pulsar-client) +C++ | - [User doc](client-libraries-cpp.md)

    - [API doc](/api/cpp/)|[Here](/release-notes/)|[Here](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp) +Python | - [User doc](client-libraries-python.md)

    - [API doc](/api/python/)|[Here](/release-notes/)|[Here](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/python) +WebSocket| [User doc](client-libraries-websocket.md) | [Here](/release-notes/)|[Here](https://github.com/apache/pulsar/tree/master/pulsar-websocket) +Go client|[User doc](client-libraries-go.md)|[Here](https://github.com/apache/pulsar-client-go/blob/master/CHANGELOG) |[Here](https://github.com/apache/pulsar-client-go) +Node.js|[User doc](client-libraries-node.md)|[Here](https://github.com/apache/pulsar-client-node/releases) |[Here](https://github.com/apache/pulsar-client-node) +C# |[User doc](client-libraries-dotnet.md)| [Here](https://github.com/apache/pulsar-dotpulsar/blob/master/CHANGELOG)|[Here](https://github.com/apache/pulsar-dotpulsar) + +:::note + +- The code repos of **Java, C++, Python,** and **WebSocket** clients are hosted in the [Pulsar main repo](https://github.com/apache/pulsar) and these clients are released with Pulsar, so their release notes are parts of [Pulsar release note](/release-notes/). +- The code repos of **Go, Node.js,** and **C#** clients are hosted outside of the [Pulsar main repo](https://github.com/apache/pulsar) and these clients are not released with Pulsar, so they have independent release notes. + +::: + +## Feature matrix +Pulsar client feature matrix for different languages is listed on [Pulsar Feature Matrix (Client and Function)](https://docs.google.com/spreadsheets/d/1YHYTkIXR8-Ql103u-IMI18TXLlGStK8uJjDsOOA0T20/edit#gid=1784579914) page. + +## Third-party clients + +Besides the official released clients, multiple projects on developing Pulsar clients are available in different languages. + +> If you have developed a new Pulsar client, feel free to submit a pull request and add your client to the list below. 
+ +| Language | Project | Maintainer | License | Description | +|----------|---------|------------|---------|-------------| +| Go | [pulsar-client-go](https://github.com/Comcast/pulsar-client-go) | [Comcast](https://github.com/Comcast) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | A native golang client | +| Go | [go-pulsar](https://github.com/t2y/go-pulsar) | [t2y](https://github.com/t2y) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | +| Haskell | [supernova](https://github.com/cr-org/supernova) | [Chatroulette](https://github.com/cr-org) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Native Pulsar client for Haskell | +| Scala | [neutron](https://github.com/cr-org/neutron) | [Chatroulette](https://github.com/cr-org) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Purely functional Apache Pulsar client for Scala built on top of Fs2 | +| Scala | [pulsar4s](https://github.com/sksamuel/pulsar4s) | [sksamuel](https://github.com/sksamuel) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Idiomatic, typesafe, and reactive Scala client for Apache Pulsar | +| Rust | [pulsar-rs](https://github.com/wyyerd/pulsar-rs) | [Wyyerd Group](https://github.com/wyyerd) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Future-based Rust bindings for Apache Pulsar | +| .NET | [pulsar-client-dotnet](https://github.com/fsharplang-ru/pulsar-client-dotnet) | [Lanayx](https://github.com/Lanayx) | [![GitHub](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT) | Native .NET client for C#/F#/VB | +| Node.js | 
[pulsar-flex](https://github.com/ayeo-flex-org/pulsar-flex) | [Daniel Sinai](https://github.com/danielsinai), [Ron Farkash](https://github.com/ronfarkash), [Gal Rosenberg](https://github.com/galrose)| [![GitHub](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT) | Native Nodejs client | diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-architecture-overview.md b/site2/website/versioned_docs/version-2.10.x/concepts-architecture-overview.md new file mode 100644 index 0000000000000..5f2fb2ea99167 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-architecture-overview.md @@ -0,0 +1,176 @@ +--- +id: concepts-architecture-overview +title: Architecture Overview +sidebar_label: "Architecture" +original_id: concepts-architecture-overview +--- + +At the highest level, a Pulsar instance is composed of one or more Pulsar clusters. Clusters within an instance can [replicate](concepts-replication.md) data amongst themselves. + +In a Pulsar cluster: + +* One or more brokers handles and load balances incoming messages from producers, dispatches messages to consumers, communicates with the Pulsar configuration store to handle various coordination tasks, stores messages in BookKeeper instances (aka bookies), relies on a cluster-specific ZooKeeper cluster for certain tasks, and more. +* A BookKeeper cluster consisting of one or more bookies handles [persistent storage](#persistent-storage) of messages. +* A ZooKeeper cluster specific to that cluster handles coordination tasks between Pulsar clusters. + +The diagram below provides an illustration of a Pulsar cluster: + +![Pulsar architecture diagram](/assets/pulsar-system-architecture.png) + +At the broader instance level, an instance-wide ZooKeeper cluster called the configuration store handles coordination tasks involving multiple clusters, for example [geo-replication](concepts-replication.md). 
+ +## Brokers + +The Pulsar message broker is a stateless component that's primarily responsible for running two other components: + +* An HTTP server that exposes a {@inject: rest:REST:/} API for both administrative tasks and [topic lookup](concepts-clients.md#client-setup-phase) for producers and consumers. The producers connect to the brokers to publish messages and the consumers connect to the brokers to consume the messages. +* A dispatcher, which is an asynchronous TCP server over a custom [binary protocol](developing-binary-protocol.md) used for all data transfers + +Messages are typically dispatched out of a [managed ledger](#managed-ledgers) cache for the sake of performance, *unless* the backlog exceeds the cache size. If the backlog grows too large for the cache, the broker will start reading entries from BookKeeper. + +Finally, to support geo-replication on global topics, the broker manages replicators that tail the entries published in the local region and republish them to the remote region using the Pulsar [Java client library](client-libraries-java.md). + +> For a guide to managing Pulsar brokers, see the [brokers](admin-api-brokers.md) guide. + +## Clusters + +A Pulsar instance consists of one or more Pulsar *clusters*. Clusters, in turn, consist of: + +* One or more Pulsar [brokers](#brokers) +* A ZooKeeper quorum used for cluster-level configuration and coordination +* An ensemble of bookies used for [persistent storage](#persistent-storage) of messages + +Clusters can replicate amongst themselves using [geo-replication](concepts-replication.md). + +> For a guide to managing Pulsar clusters, see the [clusters](admin-api-clusters.md) guide. + +## Metadata store + +The Pulsar metadata store maintains all the metadata of a Pulsar cluster, such as topic metadata, schema, broker load data, and so on. Pulsar uses [Apache ZooKeeper](https://zookeeper.apache.org/) for metadata storage, cluster configuration, and coordination. 
The Pulsar metadata store can be deployed on a separate ZooKeeper cluster or deployed on an existing ZooKeeper cluster. You can use one ZooKeeper cluster for both Pulsar metadata store and BookKeeper metadata store. If you want to deploy Pulsar brokers connected to an existing BookKeeper cluster, you need to deploy separate ZooKeeper clusters for Pulsar metadata store and BookKeeper metadata store respectively. + +> Pulsar also supports more metadata backend services, including [ETCD](https://etcd.io/) and [RocksDB](http://rocksdb.org/) (for standalone Pulsar only). + + +In a Pulsar instance: + +* A configuration store quorum stores configuration for tenants, namespaces, and other entities that need to be globally consistent. +* Each cluster has its own local ZooKeeper ensemble that stores cluster-specific configuration and coordination such as which brokers are responsible for which topics as well as ownership metadata, broker load reports, BookKeeper ledger metadata, and more. + +## Configuration store + +The configuration store maintains all the configurations of a Pulsar instance, such as clusters, tenants, namespaces, partitioned topic related configurations, and so on. A Pulsar instance can have a single local cluster, multiple local clusters, or multiple cross-region clusters. Consequently, the configuration store can share the configurations across multiple clusters under a Pulsar instance. The configuration store can be deployed on a separate ZooKeeper cluster or deployed on an existing ZooKeeper cluster. + +## Persistent storage + +Pulsar provides guaranteed message delivery for applications. If a message successfully reaches a Pulsar broker, it will be delivered to its intended target. + +This guarantee requires that non-acknowledged messages are stored in a durable manner until they can be delivered to and acknowledged by consumers. This mode of messaging is commonly called *persistent messaging*. 
In Pulsar, N copies of all messages are stored and synced on disk, for example 4 copies across two servers with mirrored [RAID](https://en.wikipedia.org/wiki/RAID) volumes on each server. + +### Apache BookKeeper + +Pulsar uses a system called [Apache BookKeeper](http://bookkeeper.apache.org/) for persistent message storage. BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that provides a number of crucial advantages for Pulsar: + +* It enables Pulsar to utilize many independent logs, called [ledgers](#ledgers). Multiple ledgers can be created for topics over time. +* It offers very efficient storage for sequential data that handles entry replication. +* It guarantees read consistency of ledgers in the presence of various system failures. +* It offers even distribution of I/O across bookies. +* It's horizontally scalable in both capacity and throughput. Capacity can be immediately increased by adding more bookies to a cluster. +* Bookies are designed to handle thousands of ledgers with concurrent reads and writes. By using multiple disk devices---one for journal and another for general storage--bookies are able to isolate the effects of read operations from the latency of ongoing write operations. + +In addition to message data, *cursors* are also persistently stored in BookKeeper. Cursors are [subscription](reference-terminology.md#subscription) positions for [consumers](reference-terminology.md#consumer). BookKeeper enables Pulsar to store consumer position in a scalable fashion. + +At the moment, Pulsar supports persistent message storage. This accounts for the `persistent` in all topic names. Here's an example: + +```http + +persistent://my-tenant/my-namespace/my-topic + +``` + +> Pulsar also supports ephemeral ([non-persistent](concepts-messaging.md#non-persistent-topics)) message storage. 
+ + +You can see an illustration of how brokers and bookies interact in the diagram below: + +![Brokers and bookies](/assets/broker-bookie.png) + + +### Ledgers + +A ledger is an append-only data structure with a single writer that is assigned to multiple BookKeeper storage nodes, or bookies. Ledger entries are replicated to multiple bookies. Ledgers themselves have very simple semantics: + +* A Pulsar broker can create a ledger, append entries to the ledger, and close the ledger. +* After the ledger has been closed---either explicitly or because the writer process crashed---it can then be opened only in read-only mode. +* Finally, when entries in the ledger are no longer needed, the whole ledger can be deleted from the system (across all bookies). + +#### Ledger read consistency + +The main strength of BookKeeper is that it guarantees read consistency in ledgers in the presence of failures. Since the ledger can only be written to by a single process, that process is free to append entries very efficiently, without needing to obtain consensus. After a failure, the ledger will go through a recovery process that will finalize the state of the ledger and establish which entry was last committed to the log. After that point, all readers of the ledger are guaranteed to see the exact same content. + +#### Managed ledgers + +Given that BookKeeper ledgers provide a single log abstraction, a library was developed on top of the ledger called the *managed ledger* that represents the storage layer for a single topic. A managed ledger represents the abstraction of a stream of messages with a single writer that keeps appending at the end of the stream and multiple cursors that are consuming the stream, each with its own associated position. + +Internally, a single managed ledger uses multiple BookKeeper ledgers to store the data. There are two reasons to have multiple ledgers: + +1. After a failure, a ledger is no longer writable and a new one needs to be created. +2. 
A ledger can be deleted when all cursors have consumed the messages it contains. This allows for periodic rollover of ledgers. + +### Journal storage + +In BookKeeper, *journal* files contain BookKeeper transaction logs. Before making an update to a [ledger](#ledgers), a bookie needs to ensure that a transaction describing the update is written to persistent (non-volatile) storage. A new journal file is created once the bookie starts or the older journal file reaches the journal file size threshold (configured using the [`journalMaxSizeMB`](reference-configuration.md#bookkeeper-journalMaxSizeMB) parameter). + +## Pulsar proxy + +One way for Pulsar clients to interact with a Pulsar [cluster](#clusters) is by connecting to Pulsar message [brokers](#brokers) directly. In some cases, however, this kind of direct connection is either infeasible or undesirable because the client doesn't have direct access to broker addresses. If you're running Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform, for example, then direct client connections to brokers are likely not possible. + +The **Pulsar proxy** provides a solution to this problem by acting as a single gateway for all of the brokers in a cluster. If you run the Pulsar proxy (which, again, is optional), all client connections with the Pulsar cluster will flow through the proxy rather than communicating with brokers. + +> For the sake of performance and fault tolerance, you can run as many instances of the Pulsar proxy as you'd like. + +Architecturally, the Pulsar proxy gets all the information it requires from ZooKeeper. When starting the proxy on a machine, you only need to provide metadata store connection strings for the cluster-specific and instance-wide configuration store clusters. 
Here's an example: + +```bash + +$ cd /path/to/pulsar/directory +$ bin/pulsar proxy \ + --metadata-store zk:my-zk-1:2181,my-zk-2:2181,my-zk-3:2181 \ + --configuration-metadata-store zk:my-zk-1:2181,my-zk-2:2181,my-zk-3:2181 + +``` + +> #### Pulsar proxy docs +> For documentation on using the Pulsar proxy, see the [Pulsar proxy admin documentation](administration-proxy.md). + + +Some important things to know about the Pulsar proxy: + +* Connecting clients don't need to provide *any* specific configuration to use the Pulsar proxy. You won't need to update the client configuration for existing applications beyond updating the IP used for the service URL (for example, if you're running a load balancer over the Pulsar proxy). +* [TLS encryption](security-tls-transport.md) and [authentication](security-tls-authentication.md) are supported by the Pulsar proxy. + +## Service discovery + +[Clients](getting-started-clients.md) connecting to Pulsar brokers need to be able to communicate with an entire Pulsar instance using a single URL. + +You can use your own service discovery system if you'd like. If you use your own system, there is just one requirement: when a client performs an HTTP request to an endpoint, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to *some* active broker in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means. + +The diagram below illustrates Pulsar service discovery: + +![Pulsar service discovery](/assets/pulsar-service-discovery.png) + +In this diagram, the Pulsar cluster is addressable via a single DNS name: `pulsar-cluster.acme.com`. A [Python client](client-libraries-python.md), for example, could access this Pulsar cluster like this: + +```python + +from pulsar import Client + +client = Client('pulsar://pulsar-cluster.acme.com:6650') + +``` + +:::note + +In Pulsar, each topic is handled by only one broker. 
Initial requests from a client to read, update or delete a topic are sent to a broker that may not be the topic owner. If the broker cannot handle the request for this topic, it redirects the request to the appropriate broker. + +::: + diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-authentication.md b/site2/website/versioned_docs/version-2.10.x/concepts-authentication.md new file mode 100644 index 0000000000000..f6307890c904a --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-authentication.md @@ -0,0 +1,9 @@ +--- +id: concepts-authentication +title: Authentication and Authorization +sidebar_label: "Authentication and Authorization" +original_id: concepts-authentication +--- + +Pulsar supports a pluggable [authentication](security-overview.md) mechanism which can be configured at the proxy and/or the broker. Pulsar also supports a pluggable [authorization](security-authorization.md) mechanism. These mechanisms work together to identify the client and its access rights on topics, namespaces and tenants. + diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-clients.md b/site2/website/versioned_docs/version-2.10.x/concepts-clients.md new file mode 100644 index 0000000000000..4040624f7d636 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-clients.md @@ -0,0 +1,92 @@ +--- +id: concepts-clients +title: Pulsar Clients +sidebar_label: "Clients" +original_id: concepts-clients +--- + +Pulsar exposes a client API with language bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md), [C++](client-libraries-cpp.md) and [C#](client-libraries-dotnet.md). The client API optimizes and encapsulates Pulsar's client-broker communication protocol and exposes a simple and intuitive API for use by applications. 
+ +Under the hood, the current official Pulsar client libraries support transparent reconnection and/or connection failover to brokers, queuing of messages until acknowledged by the broker, and heuristics such as connection retries with backoff. + +> **Custom client libraries** +> If you'd like to create your own client library, we recommend consulting the documentation on Pulsar's custom [binary protocol](developing-binary-protocol.md). + + +## Client setup phase + +Before an application creates a producer/consumer, the Pulsar client library needs to initiate a setup phase including two steps: + +1. The client attempts to determine the owner of the topic by sending an HTTP lookup request to the broker. The request could reach one of the active brokers which, by looking at the (cached) ZooKeeper metadata, knows who is serving the topic or, in case nobody is serving it, tries to assign it to the least loaded broker. +1. Once the client library has the broker address, it creates a TCP connection (or reuses an existing connection from the pool) and authenticates it. Within this connection, the client and broker exchange binary commands from a custom protocol. At this point, the client sends a command to create a producer/consumer to the broker, which will comply after having validated the authorization policy. + +Whenever the TCP connection breaks, the client immediately re-initiates this setup phase and keeps trying with exponential backoff to re-establish the producer or consumer until the operation succeeds. + +## Reader interface + +In Pulsar, the "standard" [consumer interface](concepts-messaging.md#consumers) involves using consumers to listen on [topics](reference-terminology.md#topic), process incoming messages, and finally acknowledge those messages when they are processed. 
Whenever a new subscription is created, it is initially positioned at the end of the topic (by default), and consumers associated with that subscription begin reading with the first message created afterwards. Whenever a consumer connects to a topic using a pre-existing subscription, it begins reading from the earliest message un-acked within that subscription. In summary, with the consumer interface, subscription cursors are automatically managed by Pulsar in response to [message acknowledgements](concepts-messaging.md#acknowledgement). + +The **reader interface** for Pulsar enables applications to manually manage cursors. When you use a reader to connect to a topic---rather than a consumer---you need to specify *which* message the reader begins reading from when it connects to a topic. When connecting to a topic, the reader interface enables you to begin with: + +* The **earliest** available message in the topic +* The **latest** available message in the topic +* Some other message between the earliest and the latest. If you select this option, you'll need to explicitly provide a message ID. Your application will be responsible for "knowing" this message ID in advance, perhaps fetching it from a persistent data store or cache. + +The reader interface is helpful for use cases like using Pulsar to provide effectively-once processing semantics for a stream processing system. For this use case, it's essential that the stream processing system be able to "rewind" topics to a specific message and begin reading there. The reader interface provides Pulsar clients with the low-level abstraction necessary to "manually position" themselves within a topic. + +Internally, the reader interface is implemented as a consumer using an exclusive, non-durable subscription to the topic with a randomly-allocated name. 
+ +[ **IMPORTANT** ] + +Unlike subscriptions/consumers, readers are non-durable in nature and do not prevent data in a topic from being deleted; thus, it is ***strongly*** advised that [data retention](cookbooks-retention-expiry.md) be configured. If data retention for a topic is not configured for an adequate amount of time, messages that the reader has not yet read might be deleted. This causes the readers to essentially skip messages. Configuring the data retention for a topic guarantees the reader a certain duration to read a message. + +Please also note that a reader can have a "backlog", but the metric is only used for users to know how far behind the reader is. The metric is not considered for any backlog quota calculations. + +![The Pulsar consumer and reader interfaces](/assets/pulsar-reader-consumer-interfaces.png) + +Here's a Java example that begins reading from the earliest available message on a topic: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageId; +import org.apache.pulsar.client.api.Reader; + +// Create a reader on a topic and for a specific message (and onward) +Reader reader = pulsarClient.newReader() + .topic("reader-api-test") + .startMessageId(MessageId.earliest) + .create(); + +while (true) { + Message message = reader.readNext(); + + // Process the message +} + +``` + +To create a reader that reads from the latest available message: + +```java + +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.latest) + .create(); + +``` + +To create a reader that reads from some message between the earliest and the latest: + +```java + +byte[] msgIdBytes = // Some byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-messaging.md 
b/site2/website/versioned_docs/version-2.10.x/concepts-messaging.md new file mode 100644 index 0000000000000..b1d1483232ebd --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-messaging.md @@ -0,0 +1,989 @@ +--- +id: concepts-messaging +title: Messaging +sidebar_label: "Messaging" +original_id: concepts-messaging +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar is built on the [publish-subscribe](https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern) pattern (often abbreviated to pub-sub). In this pattern, [producers](#producers) publish messages to [topics](#topics); [consumers](#consumers) [subscribe](#subscription-types) to those topics, process incoming messages, and send [acknowledgements](#acknowledgement) to the broker when processing is finished. + +When a subscription is created, Pulsar [retains](concepts-architecture-overview.md#persistent-storage) all messages, even if the consumer is disconnected. The retained messages are discarded only when a consumer acknowledges that all these messages are processed successfully. + +If the consumption of a message fails and you want this message to be consumed again, you can enable [message redelivery mechanism](#message-redelivery) to request the broker to resend this message. + +## Messages + +Messages are the basic "unit" of Pulsar. The following table lists the components of messages. + +Component | Description +:---------|:------- +Value / data payload | The data carried by the message. All Pulsar messages contain raw bytes, although message data can also conform to data [schemas](schema-get-started.md). +Key | Messages are optionally tagged with keys, which is useful for things like [topic compaction](concepts-topic-compaction.md). +Properties | An optional key/value map of user-defined properties. +Producer name | The name of the producer who produces the message. 
If you do not specify a producer name, the default name is used. +Topic name | The name of the topic that the message is published to. +Schema version | The version number of the schema that the message is produced with. +Sequence ID | Each Pulsar message belongs to an ordered sequence on its topic. The sequence ID of a message is initially assigned by its producer, indicating its order in that sequence, and can also be customized.
    Sequence ID can be used for message deduplication. If `brokerDeduplicationEnabled` is set to `true`, the sequence ID of each message is unique within a producer of a topic (non-partitioned) or a partition. +Message ID | The message ID of a message is assigned by bookies as soon as the message is persistently stored. Message ID indicates a message’s specific position in a ledger and is unique within a Pulsar cluster. +Publish time | The timestamp of when the message is published. The timestamp is automatically applied by the producer. +Event time | An optional timestamp attached to a message by applications. For example, applications attach a timestamp on when the message is processed. If nothing is set to event time, the value is `0`. + +The default size of a message is 5 MB. You can configure the max size of a message with the following configurations. + +- In the `broker.conf` file. + + ```bash + + # The max size of a message (in bytes). + maxMessageSize=5242880 + + ``` + +- In the `bookkeeper.conf` file. + + ```bash + + # The max size of the netty frame (in bytes). Any messages received larger than this value are rejected. The default value is 5 MB. + nettyMaxFrameSizeBytes=5253120 + + ``` + +> For more information on Pulsar messages, see Pulsar [binary protocol](developing-binary-protocol.md). + +## Producers + +A producer is a process that attaches to a topic and publishes messages to a Pulsar [broker](reference-terminology.md#broker). The Pulsar broker processes the messages. + +### Send modes + +Producers send messages to brokers synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:-----------|-----------| +| Sync send | The producer waits for an acknowledgement from the broker after sending every message. If the acknowledgment is not received, the producer treats the sending operation as a failure. | +| Async send | The producer puts a message in a blocking queue and returns immediately. 
The client library sends the message to the broker in the background. If the queue is full (you can [configure](reference-configuration.md#broker) the maximum size), the producer is blocked or fails immediately when calling the API, depending on arguments passed to the producer. | + +### Access mode + +You can have different types of access modes on topics for producers. + +|Access mode | Description +|---|--- +`Shared`|Multiple producers can publish on a topic.

    This is the **default** setting. +`Exclusive`|Only one producer can publish on a topic.

    If there is already a producer connected, other producers trying to publish on this topic get errors immediately.

    The “old” producer is evicted and a “new” producer is selected to be the next exclusive producer if the “old” producer experiences a network partition with the broker. +`WaitForExclusive`|If there is already a producer connected, the producer creation is pending (rather than timing out) until the producer gets the `Exclusive` access.

    The producer that succeeds in becoming the exclusive one is treated as the leader. Consequently, if you want to implement the leader election scheme for your application, you can use this access mode. + +:::note + +Once an application creates a producer with `Exclusive` or `WaitForExclusive` access mode successfully, the instance of this application is guaranteed to be the **only writer** to the topic. Any other producers trying to produce messages on this topic will either get errors immediately or have to wait until they get the `Exclusive` access. +For more information, see [PIP 68: Exclusive Producer](https://github.com/apache/pulsar/wiki/PIP-68:-Exclusive-Producer). + +::: + +You can set producer access mode through the Java Client API. For more information, see `ProducerAccessMode` in the [ProducerBuilder.java](https://github.com/apache/pulsar/blob/fc5768ca3bbf92815d142fe30e6bfad70a1b4fc6/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/ProducerBuilder.java) file. + + +### Compression + +You can compress messages published by producers during transportation. Pulsar currently supports the following types of compression: + +* [LZ4](https://github.com/lz4/lz4) +* [ZLIB](https://zlib.net/) +* [ZSTD](https://facebook.github.io/zstd/) +* [SNAPPY](https://google.github.io/snappy/) + +### Batching + +When batching is enabled, the producer accumulates and sends a batch of messages in a single request. The batch size is defined by the maximum number of messages and the maximum publish latency. Therefore, the backlog size represents the total number of batches instead of the total number of messages. + +In Pulsar, batches are tracked and stored as single units rather than as individual messages. The consumer unbundles a batch into individual messages. However, scheduled messages (configured through the `deliverAt` or the `deliverAfter` parameter) are always sent as individual messages even if batching is enabled. 
+ +In general, a batch is acknowledged when all of its messages are acknowledged by a consumer. It means that when **not all** batch messages are acknowledged, then unexpected failures, negative acknowledgements, or acknowledgement timeouts can result in a redelivery of all messages in this batch. + +To avoid redelivering acknowledged messages in a batch to the consumer, Pulsar introduces batch index acknowledgement since Pulsar 2.6.0. When batch index acknowledgement is enabled, the consumer filters out the batch index that has been acknowledged and sends the batch index acknowledgement request to the broker. The broker maintains the batch index acknowledgement status and tracks the acknowledgement status of each batch index to avoid dispatching acknowledged messages to the consumer. The batch is deleted when all indices of the messages in it are acknowledged. + +By default, batch index acknowledgement is disabled (`acknowledgmentAtBatchIndexLevelEnabled=false`). You can enable batch index acknowledgement by setting the `acknowledgmentAtBatchIndexLevelEnabled` parameter to `true` at the broker side. Enabling batch index acknowledgement results in more memory overheads. + +### Chunking +Message chunking enables Pulsar to process large payload messages by splitting the message into chunks at the producer side and aggregating chunked messages at the consumer side. + +With message chunking enabled, when the size of a message exceeds the allowed maximum payload size (the `maxMessageSize` parameter of broker), the workflow of messaging is as follows: +1. The producer splits the original message into chunked messages and publishes them with chunked metadata to the broker separately and in order. +2. The broker stores the chunked messages in one managed-ledger in the same way as that of ordinary messages, and it uses the `chunkedMessageRate` parameter to record chunked message rate on the topic. +3. 
The consumer buffers the chunked messages and aggregates them into the receiver queue when it receives all the chunks of a message. +4. The client consumes the aggregated message from the receiver queue. + +**Limitations:** +- Chunking is only available for persisted topics. +- Chunking is only available for the exclusive and failover subscription types. +- Chunking cannot be enabled simultaneously with batching. + +#### Handle consecutive chunked messages with one ordered consumer + +The following figure shows a topic with one producer which publishes a large message payload in chunked messages along with regular non-chunked messages. The producer publishes message M1 in three chunks labeled M1-C1, M1-C2 and M1-C3. The broker stores all the three chunked messages in the managed-ledger and dispatches them to the ordered (exclusive/failover) consumer in the same order. The consumer buffers all the chunked messages in memory until it receives all the chunked messages, aggregates them into one message and then hands over the original message M1 to the client. + +![](/assets/chunking-01.png) + +#### Handle interwoven chunked messages with one ordered consumer + +When multiple producers publish chunked messages into a single topic, the broker stores all the chunked messages coming from different producers in the same managed-ledger. The chunked messages in the managed-ledger can be interwoven with each other. As shown below, Producer 1 publishes message M1 in three chunks M1-C1, M1-C2 and M1-C3. Producer 2 publishes message M2 in three chunks M2-C1, M2-C2 and M2-C3. All chunked messages of the specific message are still in order but might not be consecutive in the managed-ledger. + +![](/assets/chunking-02.png) + +:::note + +In this case, interwoven chunked messages may bring some memory pressure to the consumer because the consumer keeps a separate buffer for each large message to aggregate all its chunks in one message. 
You can limit the maximum number of chunked messages a consumer maintains concurrently by configuring the `maxPendingChunkedMessage` parameter. When the threshold is reached, the consumer drops pending messages by silently acknowledging them or asking the broker to redeliver them later, optimizing memory utilization. + +::: + +#### Enable Message Chunking + +**Prerequisite:** Disable batching by setting the `enableBatching` parameter to `false`. + +The message chunking feature is OFF by default. +To enable message chunking, set the `chunkingEnabled` parameter to `true` when creating a producer. + +:::note + +If the consumer fails to receive all chunks of a message within a specified time period, it expires incomplete chunks. The default value is 1 minute. For more information about the `expireTimeOfIncompleteChunkedMessage` parameter, refer to [org.apache.pulsar.client.api](/api/client/). + +::: + +## Consumers + +A consumer is a process that attaches to a topic via a subscription and then receives messages. + +A consumer sends a [flow permit request](developing-binary-protocol.md#flow-control) to a broker to get messages. There is a queue at the consumer side to receive messages pushed from the broker. You can configure the queue size with the [`receiverQueueSize`](client-libraries-java.md#configure-consumer) parameter (the default size is `1000`). Each time `consumer.receive()` is called, a message is dequeued from the buffer. + +### Receive modes + +Messages are received from [brokers](reference-terminology.md#broker) either synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:--------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sync receive | A sync receive is blocked until a message is available. 
| +| Async receive | An async receive returns immediately with a future value—for example, a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) in Java—that completes once a new message is available. | + +### Listeners + +Client libraries provide listener implementation for consumers. For example, the [Java client](client-libraries-java.md) provides a {@inject: javadoc:MessageListener:/client/org/apache/pulsar/client/api/MessageListener} interface. In this interface, the `received` method is called whenever a new message is received. + +### Acknowledgement + +The consumer sends an acknowledgement request to the broker after it consumes a message successfully. Then, this consumed message will be permanently stored, and be deleted only after all the subscriptions have acknowledged it. If you want to store the messages that have been acknowledged by a consumer, you need to configure the [message retention policy](concepts-messaging.md#message-retention-and-expiry). + +For batch messages, you can enable batch index acknowledgement to avoid dispatching acknowledged messages to the consumer. For details about batch index acknowledgement, see [batching](#batching). + +Messages can be acknowledged in one of the following two ways: + +- Being acknowledged individually. With individual acknowledgement, the consumer acknowledges each message and sends an acknowledgement request to the broker. +- Being acknowledged cumulatively. With cumulative acknowledgement, the consumer **only** acknowledges the last message it received. All messages in the stream up to (and including) the provided message are not redelivered to that consumer. + +If you want to acknowledge messages individually, you can use the following API. + +```java + +consumer.acknowledge(msg); + +``` + +If you want to acknowledge messages cumulatively, you can use the following API. 
+ +```java + +consumer.acknowledgeCumulative(msg); + +``` + +:::note + +Cumulative acknowledgement cannot be used in [Shared subscription type](#subscription-types), because Shared subscription type involves multiple consumers which have access to the same subscription. In Shared subscription type, messages are acknowledged individually. + +::: + +### Negative acknowledgement + +The [negative acknowledgement](#negative-acknowledgement) mechanism allows you to send a notification to the broker indicating the consumer did not process a message. When a consumer fails to consume a message and needs to re-consume it, the consumer sends a negative acknowledgement (nack) to the broker, triggering the broker to redeliver this message to the consumer. + +Messages are negatively acknowledged individually or cumulatively, depending on the consumption subscription type. + +In Exclusive and Failover subscription types, consumers only negatively acknowledge the last message they receive. + +In Shared and Key_Shared subscription types, consumers can negatively acknowledge messages individually. + +Be aware that negative acknowledgments on ordered subscription types, such as Exclusive, Failover and Key_Shared, might cause failed messages being sent to consumers out of the original order. + +If you are going to use negative acknowledgment on a message, make sure it is negatively acknowledged before the acknowledgment timeout. + +Use the following API to negatively acknowledge message consumption. 
+ +```java + +Consumer consumer = pulsarClient.newConsumer() + .topic(topic) + .subscriptionName("sub-negative-ack") + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .negativeAckRedeliveryDelay(2, TimeUnit.SECONDS) // the default value is 1 min + .subscribe(); + +Message message = consumer.receive(); + +// call the API to send negative acknowledgement +consumer.negativeAcknowledge(message); + +message = consumer.receive(); +consumer.acknowledge(message); + +``` + +To redeliver messages with different delays, you can use the **redelivery backoff mechanism** by setting the number of retries to deliver the messages. +Use the following API to enable `Negative Redelivery Backoff`. + +```java + +Consumer consumer = pulsarClient.newConsumer() + .topic(topic) + .subscriptionName("sub-negative-ack") + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .negativeAckRedeliveryBackoff(MultiplierRedeliveryBackoff.builder() + .minDelayMs(1000) + .maxDelayMs(60 * 1000) + .build()) + .subscribe(); + +``` + +The message redelivery behavior should be as follows. + +Redelivery count | Redelivery delay +:--------------------|:----------- +1 | 10 + 1 seconds +2 | 10 + 2 seconds +3 | 10 + 4 seconds +4 | 10 + 8 seconds +5 | 10 + 16 seconds +6 | 10 + 32 seconds +7 | 10 + 60 seconds +8 | 10 + 60 seconds +:::note + +If batching is enabled, all messages in one batch are redelivered to the consumer. + +::: + +### Acknowledgement timeout + +The acknowledgement timeout mechanism allows you to set a time range during which the client tracks the unacknowledged messages. After this acknowledgement timeout (`ackTimeout`) period, the client sends `redeliver unacknowledged messages` request to the broker, thus the broker resends the unacknowledged messages to the consumer. 
+ +You can configure the acknowledgement timeout mechanism to redeliver the message if it is not acknowledged after `ackTimeout` or to execute a timer task to check the acknowledgement timeout messages during every `ackTimeoutTickTime` period. + +You can also use the redelivery backoff mechanism to redeliver messages with different delays by setting the number +of times the messages are retried. + +If you want to use redelivery backoff, you can use the following API. + +```java + +consumer.ackTimeout(10, TimeUnit.SECOND) + .ackTimeoutRedeliveryBackoff(MultiplierRedeliveryBackoff.builder() + .minDelayMs(1000) + .maxDelayMs(60000) + .multiplier(2).build()) + +``` + +The message redelivery behavior should be as follows. + +Redelivery count | Redelivery delay +:--------------------|:----------- +1 | 10 + 1 seconds +2 | 10 + 2 seconds +3 | 10 + 4 seconds +4 | 10 + 8 seconds +5 | 10 + 16 seconds +6 | 10 + 32 seconds +7 | 10 + 60 seconds +8 | 10 + 60 seconds + +:::note + +- If batching is enabled, all messages in one batch are redelivered to the consumer. +- Compared with acknowledgement timeout, negative acknowledgement is preferred. First, it is difficult to set a timeout value. Second, a broker resends messages when the message processing time exceeds the acknowledgement timeout, but these messages might not need to be re-consumed. + +::: + +Use the following API to enable acknowledgement timeout. + +```java + +Consumer consumer = pulsarClient.newConsumer() + .topic(topic) + .ackTimeout(2, TimeUnit.SECONDS) // the default value is 0 + .ackTimeoutTickTime(1, TimeUnit.SECONDS) + .subscriptionName("sub") + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + +Message message = consumer.receive(); + +// wait at least 2 seconds +message = consumer.receive(); +consumer.acknowledge(message); + +``` + +### Retry letter topic + +The retry letter topic allows you to store the messages that failed to be consumed and retry consuming them later. 
With this method, you can customize the interval at which the messages are redelivered. Consumers on the original topic are automatically subscribed to the retry letter topic as well. Once the maximum number of retries has been reached, the unconsumed messages are moved to a [dead letter topic](#dead-letter-topic) for manual processing. + +The diagram below illustrates the concept of the retry letter topic. +![](/assets/retry-letter-topic.svg) + +The intention of using retry letter topic is different from using [delayed message delivery](#delayed-message-delivery), even though both are aiming to consume a message later. Retry letter topic serves failure handling through message redelivery to ensure critical data is not lost, while delayed message delivery is intended to deliver a message with a specified time of delay. + +By default, automatic retry is disabled. You can set `enableRetry` to `true` to enable automatic retry on the consumer. + +Use the following API to consume messages from a retry letter topic. When the value of `maxRedeliverCount` is reached, the unconsumed messages are moved to a dead letter topic. + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .enableRetry(true) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .build()) + .subscribe(); + +``` + +The default retry letter topic uses this format: + +``` + +<topicname>-<subscriptionname>-RETRY + +``` + +Use the Java client to specify the name of the retry letter topic. 
+ +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .enableRetry(true) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .retryLetterTopic("my-retry-letter-topic-name") + .build()) + .subscribe(); + +``` + +The messages in the retry letter topic contain some special properties that are automatically created by the client. + +Special property | Description +:--------------------|:----------- +`REAL_TOPIC` | The real topic name. +`ORIGIN_MESSAGE_ID` | The origin message ID. It is crucial for message tracking. +`RECONSUMETIMES` | The number of retries to consume messages. +`DELAY_TIME` | Message retry interval in milliseconds. + +**Example** + +``` + +REAL_TOPIC = persistent://public/default/my-topic +ORIGIN_MESSAGE_ID = 1:0:-1:0 +RECONSUMETIMES = 6 +DELAY_TIME = 3000 + +``` + +Use the following API to store the messages in a retrial queue. + +```java + +consumer.reconsumeLater(msg, 3, TimeUnit.SECONDS); + +``` + +Use the following API to add custom properties for the `reconsumeLater` function. In the next attempt to consume, the custom properties can be obtained from `message#getProperty`. + +```java + +Map customProperties = new HashMap(); +customProperties.put("custom-key-1", "custom-value-1"); +customProperties.put("custom-key-2", "custom-value-2"); +consumer.reconsumeLater(msg, customProperties, 3, TimeUnit.SECONDS); + +``` + +:::note + +* Currently, retry letter topic is enabled in Shared subscription types. +* Compared with negative acknowledgment, retry letter topic is more suitable for messages that require a large number of retries with a configurable retry interval, because messages in the retry letter topic are persisted to BookKeeper, while messages that need to be retried due to negative acknowledgment are cached on the client side. 
+ +::: + +### Dead letter topic + +The dead letter topic allows you to continue message consumption even when some messages are not consumed successfully. The messages that fail to be consumed are stored in a specific topic, which is called the dead letter topic. You can decide how to handle the messages in the dead letter topic. + +Enable dead letter topic in a Java client using the default dead letter topic. + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .build()) + .subscribe(); + +``` + +The default dead letter topic uses this format: + +``` + +<topicname>-<subscriptionname>-DLQ + +``` + +Use the Java client to specify the name of the dead letter topic. + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .deadLetterTopic("my-dead-letter-topic-name") + .build()) + .subscribe(); + +``` + +By default, there is no subscription during a DLQ topic creation. Without a just-in-time subscription to the DLQ topic, you may lose messages. To automatically create an initial subscription for the DLQ, you can specify the `initialSubscriptionName` parameter. If this parameter is set but the broker's `allowAutoSubscriptionCreation` is disabled, the DLQ producer will fail to be created. 
+ +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .deadLetterTopic("my-dead-letter-topic-name") + .initialSubscriptionName("init-sub") + .build()) + .subscribe(); + +``` + +Dead letter topic serves message redelivery, which is triggered by [acknowledgement timeout](#acknowledgement-timeout), [negative acknowledgement](#negative-acknowledgement), or [retry letter topic](#retry-letter-topic). + +:::note + +* Currently, dead letter topic is enabled in Shared and Key_Shared subscription types. + +::: + +## Topics + +As in other pub-sub systems, topics in Pulsar are named channels for transmitting messages from producers to consumers. Topic names are URLs that have a well-defined structure: + +```http + +{persistent|non-persistent}://tenant/namespace/topic + +``` + +Topic name component | Description +:--------------------|:----------- +`persistent` / `non-persistent` | This identifies the type of topic. Pulsar supports two kinds of topics: [persistent](concepts-architecture-overview.md#persistent-storage) and [non-persistent](#non-persistent-topics). The default is persistent, so if you do not specify a type, the topic is persistent. With persistent topics, all messages are durably persisted on disks (if the broker is not standalone, messages are durably persisted on multiple disks), whereas data for non-persistent topics is not persisted to storage disks. +`tenant` | The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters. +`namespace` | The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the [namespace](#namespaces) level. Each tenant has one or multiple namespaces. +`topic` | The final part of the name. 
Topic names have no special meaning in a Pulsar instance. + +> **No need to explicitly create new topics** +> You do not need to explicitly create topics in Pulsar. If a client attempts to write or receive messages to/from a topic that does not yet exist, Pulsar creates that topic under the namespace provided in the [topic name](#topics) automatically. +> If no tenant or namespace is specified when a client creates a topic, the topic is created in the default tenant and namespace. You can also create a topic in a specified tenant and namespace, such as `persistent://my-tenant/my-namespace/my-topic`. `persistent://my-tenant/my-namespace/my-topic` means the `my-topic` topic is created in the `my-namespace` namespace of the `my-tenant` tenant. + +## Namespaces + +A namespace is a logical nomenclature within a tenant. A tenant creates multiple namespaces via the [admin API](admin-api-namespaces.md#create). For instance, a tenant with different applications can create a separate namespace for each application. A namespace allows the application to create and manage a hierarchy of topics. For example, `my-tenant/app1` is a namespace for the application `app1` of the tenant `my-tenant`. You can create any number of [topics](#topics) under the namespace. + +## Subscriptions + +A subscription is a named configuration rule that determines how messages are delivered to consumers. Four subscription types are available in Pulsar: [exclusive](#exclusive), [shared](#shared), [failover](#failover), and [key_shared](#key_shared). These types are illustrated in the figure below. + +![Subscription types](/assets/pulsar-subscription-types.png) + +> **Pub-Sub or Queuing** +> In Pulsar, you can use different subscriptions flexibly. +> * If you want to achieve traditional "fan-out pub-sub messaging" among consumers, specify a unique subscription name for each consumer. This is the exclusive subscription type. 
+> * If you want to achieve "message queuing" among consumers, share the same subscription name among multiple consumers(shared, failover, key_shared). +> * If you want to achieve both effects simultaneously, combine exclusive subscription type with other subscription types for consumers. + +### Subscription types + +When a subscription has no consumers, its subscription type is undefined. The type of a subscription is defined when a consumer connects to it, and the type can be changed by restarting all consumers with a different configuration. + +#### Exclusive + +In *Exclusive* type, only a single consumer is allowed to attach to the subscription. If multiple consumers subscribe to a topic using the same subscription, an error occurs. + +In the diagram below, only **Consumer A-0** is allowed to consume messages. + +> Exclusive is the default subscription type. + +![Exclusive subscriptions](/assets/pulsar-exclusive-subscriptions.png) + +#### Failover + +In *Failover* type, multiple consumers can attach to the same subscription. A master consumer is picked for non-partitioned topic or each partition of partitioned topic and receives messages. When the master consumer disconnects, all (non-acknowledged and subsequent) messages are delivered to the next consumer in line. + +For partitioned topics, broker will sort consumers by priority level and lexicographical order of consumer name. Then broker will try to evenly assigns topics to consumers with the highest priority level. + +For non-partitioned topic, broker will pick consumer in the order they subscribe to the non partitioned topic. + +In the diagram below, **Consumer-B-0** is the master consumer while **Consumer-B-1** would be the next consumer in line to receive messages if **Consumer-B-0** is disconnected. + +![Failover subscriptions](/assets/pulsar-failover-subscriptions.png) + +#### Shared + +In *shared* or *round robin* type, multiple consumers can attach to the same subscription. 
Messages are delivered in a round robin distribution across consumers, and any given message is delivered to only one consumer. When a consumer disconnects, all the messages that were sent to it and not acknowledged will be rescheduled for sending to the remaining consumers. + +In the diagram below, **Consumer-C-1** and **Consumer-C-2** are able to subscribe to the topic, but **Consumer-C-3** and others could as well. + +> **Limitations of Shared type** +> When using Shared type, be aware that: +> * Message ordering is not guaranteed. +> * You cannot use cumulative acknowledgment with Shared type. + +![Shared subscriptions](/assets/pulsar-shared-subscriptions.png) + +#### Key_Shared + +In *Key_Shared* type, multiple consumers can attach to the same subscription. Messages are distributed across consumers, and messages with the same key or the same ordering key are delivered to only one consumer. No matter how many times the message is re-delivered, it is delivered to the same consumer. When a consumer connects or disconnects, the consumer that serves some message keys may change. + +![Key_Shared subscriptions](/assets/pulsar-key-shared-subscriptions.png) + +Note that when the consumers are using the Key_Shared subscription type, you need to **disable batching** or **use key-based batching** for the producers. There are two reasons why the key-based batching is necessary for Key_Shared subscription type: +1. The broker dispatches messages according to the keys of the messages, but the default batching approach might fail to pack the messages with the same key to the same batch. +2. Since it is the consumers instead of the broker who dispatch the messages from the batches, the key of the first message in one batch is considered as the key of all messages in this batch, thereby leading to context errors. + +The key-based batching aims at resolving the above-mentioned issues. 
This batching method ensures that the producers pack the messages with the same key to the same batch. The messages without a key are packed into one batch and this batch has no key. When the broker dispatches messages from this batch, it uses `NON_KEY` as the key. In addition, each consumer is associated with **only one** key and should receive **only one message batch** for the connected key. By default, you can limit batching by configuring the number of messages that producers are allowed to send. + +Below are examples of enabling the key-based batching under the Key_Shared subscription type, with `client` being the Pulsar client that you created. + +````mdx-code-block + + + +``` + +Producer producer = client.newProducer() + .topic("my-topic") + .batcherBuilder(BatcherBuilder.KEY_BASED) + .create(); + +``` + + + + +``` + +ProducerConfiguration producerConfig; +producerConfig.setBatchingType(ProducerConfiguration::BatchingType::KeyBasedBatching); +Producer producer; +client.createProducer("my-topic", producerConfig, producer); + +``` + + + + +``` + +producer = client.create_producer(topic='my-topic', batching_type=pulsar.BatchingType.KeyBased) + +``` + + + + +```` + +> **Limitations of Key_Shared type** +> When you use Key_Shared type, be aware that: +> * You need to specify a key or orderingKey for messages. +> * You cannot use cumulative acknowledgment with Key_Shared type. + +### Subscription modes + +#### What is a subscription mode + +The subscription mode indicates the cursor type. + +- When a subscription is created, an associated cursor is created to record the last consumed position. + +- When a consumer of the subscription restarts, it can continue consuming from the last message it consumes. + +Subscription mode | Description | Note +|---|---|--- +`Durable`|The cursor is durable, which retains messages and persists the current position.
    If a broker restarts from a failure, it can recover the cursor from the persistent storage (BookKeeper), so that messages can continue to be consumed from the last consumed position.|`Durable` is the **default** subscription mode. +`NonDurable`|The cursor is non-durable.
    Once a broker stops, the cursor is lost and can never be recovered, so that messages **can not** continue to be consumed from the last consumed position.|Reader’s subscription mode is `NonDurable` in nature and it does not prevent data in a topic from being deleted. Reader’s subscription mode **can not** be changed. + +A [subscription](#subscriptions) can have one or more consumers. When a consumer subscribes to a topic, it must specify the subscription name. A durable subscription and a non-durable subscription can have the same name; they are independent of each other. If a consumer specifies a subscription which does not exist before, the subscription is automatically created. + +#### When to use + +By default, messages of a topic without any durable subscriptions are marked as deleted. If you want to prevent the messages being marked as deleted, you can create a durable subscription for this topic. In this case, only acknowledged messages are marked as deleted. For more information, see [message retention and expiry](cookbooks-retention-expiry.md). + +#### How to use + +After a consumer is created, the default subscription mode of the consumer is `Durable`. You can change the subscription mode to `NonDurable` by making changes to the consumer’s configuration. + +````mdx-code-block + + + + +```java + + Consumer consumer = pulsarClient.newConsumer() + .topic("my-topic") + .subscriptionName("my-sub") + .subscriptionMode(SubscriptionMode.Durable) + .subscribe(); + +``` + + + + +```java + + Consumer consumer = pulsarClient.newConsumer() + .topic("my-topic") + .subscriptionName("my-sub") + .subscriptionMode(SubscriptionMode.NonDurable) + .subscribe(); + +``` + + + + +```` + +For how to create, check, or delete a durable subscription, see [manage subscriptions](admin-api-topics.md/#manage-subscriptions). 
+ +## Multi-topic subscriptions + +When a consumer subscribes to a Pulsar topic, by default it subscribes to one specific topic, such as `persistent://public/default/my-topic`. As of Pulsar version 1.23.0-incubating, however, Pulsar consumers can simultaneously subscribe to multiple topics. You can define a list of topics in two ways: + +* On the basis of a [**reg**ular **ex**pression](https://en.wikipedia.org/wiki/Regular_expression) (regex), for example `persistent://public/default/finance-.*` +* By explicitly defining a list of topics + +> When subscribing to multiple topics by regex, all topics must be in the same [namespace](#namespaces). + +When subscribing to multiple topics, the Pulsar client automatically makes a call to the Pulsar API to discover the topics that match the regex pattern/list, and then subscribe to all of them. If any of the topics do not exist, the consumer auto-subscribes to them once the topics are created. + +> **No ordering guarantees across multiple topics** +> When a producer sends messages to a single topic, all messages are guaranteed to be read from that topic in the same order. However, these guarantees do not hold across multiple topics. So when a producer sends message to multiple topics, the order in which messages are read from those topics is not guaranteed to be the same. + +The following are multi-topic subscription examples for Java. 
+ +```java + +import java.util.regex.Pattern; + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient pulsarClient = // Instantiate Pulsar client object + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*"); +Consumer allTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(allTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*"); +Consumer someTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(someTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +``` + +For code examples, see [Java](client-libraries-java.md#multi-topic-subscriptions). + +## Partitioned topics + +Normal topics are served only by a single broker, which limits the maximum throughput of the topic. *Partitioned topics* are a special type of topic that are handled by multiple brokers, thus allowing for higher throughput. + +A partitioned topic is actually implemented as N internal topics, where N is the number of partitions. When publishing messages to a partitioned topic, each message is routed to one of several brokers. The distribution of partitions across brokers is handled automatically by Pulsar. + +The diagram below illustrates this: + +![](/assets/partitioning.png) + +The **Topic1** topic has five partitions (**P0** through **P4**) split across three brokers. Because there are more partitions than brokers, two brokers handle two partitions a piece, while the third handles only one (again, Pulsar handles this distribution of partitions automatically). + +Messages for this topic are broadcast to two consumers. 
The [routing mode](#routing-modes) determines which partition each message should be published to, while the [subscription type](#subscription-types) determines which messages go to which consumers. + +Decisions about routing and subscription modes can be made separately in most cases. In general, throughput concerns should guide partitioning/routing decisions while subscription decisions should be guided by application semantics. + +There is no difference between partitioned topics and normal topics in terms of how subscription types work, as partitioning only determines what happens between when a message is published by a producer and processed and acknowledged by a consumer. + +Partitioned topics need to be explicitly created via the [admin API](admin-api-overview.md). The number of partitions can be specified when creating the topic. + +### Routing modes + +When publishing to partitioned topics, you must specify a *routing mode*. The routing mode determines which partition---that is, which internal topic---each message should be published to. + +There are three {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} available: + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer will publish messages across all partitions in round-robin fashion to achieve maximum throughput. Please note that round-robin is not done per individual message but rather it's set to the same boundary of batching delay, to ensure batching is effective. If a key is specified on the message, the partitioned producer will hash the key and assign the message to a particular partition. This is the default mode. +`SinglePartition` | If no key is provided, the producer will randomly pick one single partition and publish all the messages into that partition. If a key is specified on the message, the partitioned producer will hash the key and assign the message to a particular partition. 
+`CustomPartition` | Use custom message router implementation that will be called to determine the partition for a particular message. User can create a custom routing mode by using the [Java client](client-libraries-java.md) and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface. + +### Ordering guarantee + +The ordering of messages is related to MessageRoutingMode and Message Key. Usually, user would want an ordering of Per-key-partition guarantee. + +If there is a key attached to message, the messages will be routed to corresponding partitions based on the hashing scheme specified by {@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} in {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder}, when using either `SinglePartition` or `RoundRobinPartition` mode. + +Ordering guarantee | Description | Routing Mode and Key +:------------------|:------------|:------------ +Per-key-partition | All the messages with the same key will be in order and be placed in same partition. | Use either `SinglePartition` or `RoundRobinPartition` mode, and Key is provided by each message. +Per-producer | All the messages from the same producer will be in order. | Use `SinglePartition` mode, and no Key is provided for each message. + +### Hashing scheme + +{@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} is an enum that represent sets of standard hashing functions available when choosing the partition to use for a particular message. + +There are 2 types of standard hashing functions available: `JavaStringHash` and `Murmur3_32Hash`. +The default hashing function for producer is `JavaStringHash`. +Please pay attention that `JavaStringHash` is not useful when producers can be from different multiple language clients, under this use case, it is recommended to use `Murmur3_32Hash`. 
+ + + +## Non-persistent topics + + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](concepts-architecture-overview.md#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. When using non-persistent delivery, killing a Pulsar broker or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http + +non-persistent://tenant/namespace/topic + +``` + +> For more info on using non-persistent topics, see the [Non-persistent messaging cookbook](cookbooks-non-persistent.md). + +In non-persistent topics, brokers immediately deliver messages to all connected subscribers *without persisting them* in [BookKeeper](concepts-architecture-overview.md#persistent-storage). If a subscriber is disconnected, the broker will not be able to deliver those in-transit messages, and subscribers will never be able to receive those messages again. Eliminating the persistent storage step makes messaging on non-persistent topics slightly faster than on persistent topics in some cases, but with the caveat that some of the core benefits of Pulsar are lost. + +> With non-persistent topics, message data lives only in memory. If a message broker fails or message data can otherwise not be retrieved from memory, your message data may be lost. Use non-persistent topics only if you're *certain* that your use case requires it and can sustain it. + +By default, non-persistent topics are enabled on Pulsar brokers. You can disable them in the broker's [configuration](reference-configuration.md#broker-enableNonPersistentTopics). 
You can manage non-persistent topics using the `pulsar-admin topics` command. For more information, see [`pulsar-admin`](/tools/pulsar-admin/). + +### Performance + +Non-persistent messaging is usually faster than persistent messaging because brokers don't persist messages and immediately send acks back to the producer as soon as that message is delivered to connected brokers. Producers thus see comparatively low publish latency with non-persistent topic. + +### Client API + +Producers and consumers can connect to non-persistent topics in the same way as persistent topics, with the crucial difference that the topic name must start with `non-persistent`. All three subscription types---[exclusive](#exclusive), [shared](#shared), and [failover](#failover)---are supported for non-persistent topics. + +Here's an example [Java consumer](client-libraries-java.md#consumers) for a non-persistent topic: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +String npTopic = "non-persistent://public/default/my-topic"; +String subscriptionName = "my-subscription-name"; + +Consumer consumer = client.newConsumer() + .topic(npTopic) + .subscriptionName(subscriptionName) + .subscribe(); + +``` + +Here's an example [Java producer](client-libraries-java.md#producer) for the same non-persistent topic: + +```java + +Producer producer = client.newProducer() + .topic(npTopic) + .create(); + +``` + + +## System topic + +System topic is a predefined topic for internal use within Pulsar. It can be either persistent or non-persistent topic. + +System topics serve to implement certain features and eliminate dependencies on third-party components, such as transactions, heartbeat detections, topic-level policies, and resource group services. System topics empower the implementation of these features to be simplified, dependent, and flexible. 
Take heartbeat detection as an example: you can leverage the system topic for healthcheck to internally enable a producer/reader to produce/consume messages under the heartbeat namespace, which can detect whether the current service is still alive. + +There are diverse system topics depending on namespaces. The following table outlines the available system topics for each specific namespace. + +| Namespace | TopicName | Domain | Count | Usage | +|-----------|-----------|--------|-------|-------| +| pulsar/system | `transaction_coordinator_assign_${id}` | Persistent | Default 16 | Transaction coordinator | +| pulsar/system | `_transaction_log${tc_id}` | Persistent | Default 16 | Transaction log | +| pulsar/system | `resource-usage` | Non-persistent | Default 4 | Resource group service | +| host/port | `heartbeat` | Persistent | 1 | Heartbeat detection | +| User-defined-ns | [`__change_events`](concepts-multi-tenancy.md#namespace-change-events-and-topic-level-policies) | Persistent | Default 4 | Topic events | +| User-defined-ns | `__transaction_buffer_snapshot` | Persistent | One per namespace | Transaction buffer snapshots | +| User-defined-ns | `${topicName}__transaction_pending_ack` | Persistent | One per every topic subscription acknowledged with transactions | Acknowledgements with transactions | + +:::note + +* You cannot create any system topics. +* By default, system topics are disabled. To enable system topics, you need to change the following configurations in the `conf/broker.conf` or `conf/standalone.conf` file. + + ```conf + systemTopicEnabled=true + topicLevelPoliciesEnabled=true + ``` + +::: + + +## Message redelivery + +Apache Pulsar supports graceful failure handling and ensures critical data is not lost. Software will always have unexpected conditions and at times messages may not be delivered successfully.
Therefore, it is important to have a built-in mechanism that handles failure, particularly in asynchronous messaging as highlighted in the following examples. + +- Consumers get disconnected from the database or the HTTP server. When this happens, the database is temporarily offline while the consumer is writing the data to it and the external HTTP server that the consumer calls is momentarily unavailable. +- Consumers get disconnected from a broker due to consumer crashes, broken connections, etc. As a consequence, the unacknowledged messages are delivered to other available consumers. + +Apache Pulsar avoids these and other message delivery failures using at-least-once delivery semantics that ensure Pulsar processes a message at least once. + +To utilize message redelivery, you need to enable this mechanism before the broker can resend the unacknowledged messages in Apache Pulsar client. You can activate the message redelivery mechanism in Apache Pulsar using three methods. + +- [Negative Acknowledgment](#negative-acknowledgement) +- [Acknowledgement Timeout](#acknowledgement-timeout) +- [Retry letter topic](#retry-letter-topic) + + +## Message retention and expiry + +By default, Pulsar message brokers: + +* immediately delete *all* messages that have been acknowledged by a consumer, and +* [persistently store](concepts-architecture-overview.md#persistent-storage) all unacknowledged messages in a message backlog. + +Pulsar has two features, however, that enable you to override this default behavior: + +* Message **retention** enables you to store messages that have been acknowledged by a consumer +* Message **expiry** enables you to set a time to live (TTL) for messages that have not yet been acknowledged + +> All message retention and expiry is managed at the [namespace](#namespaces) level. For a how-to, see the [Message retention and expiry](cookbooks-retention-expiry.md) cookbook.
+ +The diagram below illustrates both concepts: + +![Message retention and expiry](/assets/retention-expiry.png) + +With message retention, shown at the top, a retention policy applied to all topics in a namespace dictates that some messages are durably stored in Pulsar even though they've already been acknowledged. Acknowledged messages that are not covered by the retention policy are deleted. Without a retention policy, *all* of the acknowledged messages would be deleted. + +With message expiry, shown at the bottom, some messages are deleted, even though they haven't been acknowledged, because they've expired according to the TTL applied to the namespace (for example because a TTL of 5 minutes has been applied and the messages haven't been acknowledged but are 10 minutes old). + +## Message deduplication + +Message duplication occurs when a message is [persisted](concepts-architecture-overview.md#persistent-storage) by Pulsar more than once. Message deduplication is an optional Pulsar feature that prevents unnecessary message duplication by processing each message only once, even if the message is received more than once. + +The following diagram illustrates what happens when message deduplication is disabled vs. enabled: + +![Pulsar message deduplication](/assets/message-deduplication.png) + + +Message deduplication is disabled in the scenario shown at the top. Here, a producer publishes message 1 on a topic; the message reaches a Pulsar broker and is [persisted](concepts-architecture-overview.md#persistent-storage) to BookKeeper. The producer then sends message 1 again (in this case due to some retry logic), and the message is received by the broker and stored in BookKeeper again, which means that duplication has occurred. + +In the second scenario at the bottom, the producer publishes message 1, which is received by the broker and persisted, as in the first scenario. 
When the producer attempts to publish the message again, however, the broker knows that it has already seen message 1 and thus does not persist the message. + +> Message deduplication is handled at the namespace level or the topic level. For more instructions, see the [message deduplication cookbook](cookbooks-deduplication.md). + + +### Producer idempotency + +The other available approach to message deduplication is to ensure that each message is *only produced once*. This approach is typically called **producer idempotency**. The drawback of this approach is that it defers the work of message deduplication to the application. In Pulsar, this is handled at the [broker](reference-terminology.md#broker) level, so you do not need to modify your Pulsar client code. Instead, you only need to make administrative changes. For details, see [Managing message deduplication](cookbooks-deduplication.md). + +### Deduplication and effectively-once semantics + +Message deduplication makes Pulsar an ideal messaging system to be used in conjunction with stream processing engines (SPEs) and other systems seeking to provide effectively-once processing semantics. Messaging systems that do not offer automatic message deduplication require the SPE or other system to guarantee deduplication, which means that strict message ordering comes at the cost of burdening the application with the responsibility of deduplication. With Pulsar, strict ordering guarantees come at no application-level cost. + +> You can find more in-depth information in [this post](https://www.splunk.com/en_us/blog/it/exactly-once-is-not-exactly-the-same.html). + +## Delayed message delivery +Delayed message delivery enables you to consume a message later. In this mechanism, a message is stored in BookKeeper. The `DelayedDeliveryTracker` maintains the time index (time -> messageId) in memory after the message is published to a broker. This message will be delivered to a consumer once the specified delay is over. 
+ +Delayed message delivery only works in Shared subscription type. In Exclusive and Failover subscription types, the delayed message is dispatched immediately. + +The diagram below illustrates the concept of delayed message delivery: + +![Delayed Message Delivery](/assets/message_delay.png) + +A broker saves a message without any check. When a consumer consumes a message, if the message is set to delay, then the message is added to `DelayedDeliveryTracker`. A subscription checks and gets timeout messages from `DelayedDeliveryTracker`. + +### Broker +Delayed message delivery is enabled by default. You can change it in the broker configuration file as below: + +``` + +# Whether to enable the delayed delivery for messages. +# If disabled, messages are immediately delivered and there is no tracking overhead. +delayedDeliveryEnabled=true + +# Control the ticking time for the retry of delayed message delivery, +# affecting the accuracy of the delivery time compared to the scheduled time. +# Default is 1 second. +delayedDeliveryTickTimeMillis=1000 + +``` + +### Producer +The following is an example of delayed message delivery for a producer in Java: + +```java + +// message to be delivered at the configured delay interval +producer.newMessage().deliverAfter(3L, TimeUnit.MINUTES).value("Hello Pulsar!").send(); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-multi-tenancy.md b/site2/website/versioned_docs/version-2.10.x/concepts-multi-tenancy.md new file mode 100644 index 0000000000000..93a59557b2efc --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-multi-tenancy.md @@ -0,0 +1,67 @@ +--- +id: concepts-multi-tenancy +title: Multi Tenancy +sidebar_label: "Multi Tenancy" +original_id: concepts-multi-tenancy +--- + +Pulsar was created from the ground up as a multi-tenant system. To support multi-tenancy, Pulsar has a concept of tenants.
Tenants can be spread across clusters and can each have their own [authentication and authorization](security-overview.md) scheme applied to them. They are also the administrative unit at which storage quotas, [message TTL](cookbooks-retention-expiry.md#time-to-live-ttl), and isolation policies can be managed. + +The multi-tenant nature of Pulsar is reflected most visibly in topic URLs, which have this structure: + +```http + +persistent://tenant/namespace/topic + +``` + +As you can see, the tenant is the most basic unit of categorization for topics (more fundamental than the namespace and topic name). + +## Tenants + +To each tenant in a Pulsar instance you can assign: + +* An [authorization](security-authorization.md) scheme +* The set of [clusters](reference-terminology.md#cluster) to which the tenant's configuration applies + +## Namespaces + +Tenants and namespaces are two key concepts of Pulsar to support multi-tenancy. + +* Pulsar is provisioned for specified tenants with appropriate capacity allocated to the tenant. +* A namespace is the administrative unit nomenclature within a tenant. The configuration policies set on a namespace apply to all the topics created in that namespace. A tenant may create multiple namespaces via self-administration using the REST API and the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. For instance, a tenant with different applications can create a separate namespace for each application. + +Names for topics in the same namespace will look like this: + +```http + +persistent://tenant/app1/topic-1 + +persistent://tenant/app1/topic-2 + +persistent://tenant/app1/topic-3 + +``` + +### Namespace change events and topic-level policies + +Pulsar is a multi-tenant event streaming system. Administrators can manage the tenants and namespaces by setting policies at different levels. However, the policies, such as retention policy and storage quota policy, are only available at a namespace level.
In many use cases, users need to set a policy at the topic level. The namespace change events approach is proposed for supporting topic-level policies in an efficient way. In this approach, Pulsar is used as an event log to store namespace change events (such as topic policy changes). This approach has a few benefits: +- Avoid using ZooKeeper and introducing more loads to ZooKeeper. +- Use Pulsar as an event log for propagating the policy cache. It can scale efficiently. +- Use Pulsar SQL to query the namespace changes and audit the system. + +Each namespace has a [system topic](concepts-messaging.md#system-topic) named `__change_events`. This system topic stores change events for a given namespace. The following figure illustrates how to leverage it to update topic-level policies. + +![Leverage the system topic to update topic-level policies](/assets/system-topic-for-topic-level-policies.svg) + +1. Pulsar Admin clients communicate with the Admin Restful API to update topic-level policies. +2. Any broker that receives the Admin HTTP request publishes a topic policy change event to the corresponding system topic (`__change_events`) of the namespace. +3. Each broker that owns a namespace bundle(s) subscribes to the system topic (`__change_events`) to receive the change events of the namespace. +4. Each broker applies the change events to its policy cache. +5. Once the policy cache is updated, the broker sends the response back to the Pulsar Admin clients. + +:::note + +By default, the system topic is disabled. To enable topic-level policy (`topicLevelPoliciesEnabled`=`true`), you need to enable the system topic by setting `systemTopicEnabled` to `true` in the `conf/broker.conf` or `conf/standalone.conf` file.
+ +::: \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-multiple-advertised-listeners.md b/site2/website/versioned_docs/version-2.10.x/concepts-multiple-advertised-listeners.md new file mode 100644 index 0000000000000..f2e1ae0aadc7c --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-multiple-advertised-listeners.md @@ -0,0 +1,44 @@ +--- +id: concepts-multiple-advertised-listeners +title: Multiple advertised listeners +sidebar_label: "Multiple advertised listeners" +original_id: concepts-multiple-advertised-listeners +--- + +When a Pulsar cluster is deployed in the production environment, you may need to expose multiple advertised addresses for the broker. For example, when you deploy a Pulsar cluster in Kubernetes and want other clients, which are not in the same Kubernetes cluster, to connect to the Pulsar cluster, you need to assign a broker URL to external clients. But clients in the same Kubernetes cluster can still connect to the Pulsar cluster through the internal network of Kubernetes. + +## Advertised listeners + +To ensure clients in both internal and external networks can connect to a Pulsar cluster, Pulsar introduces `advertisedListeners` and `internalListenerName` configuration options into the [broker configuration file](reference-configuration.md#broker) to ensure that the broker supports exposing multiple advertised listeners and supports the separation of internal and external network traffic. + +- The `advertisedListeners` is used to specify multiple advertised listeners. The broker uses the listener as the broker identifier in the load manager and the bundle owner data. The `advertisedListeners` is formatted as `<listener_name>:pulsar://<host>:<port>, <listener_name>:pulsar+ssl://<host>:<port>`. You can set up the `advertisedListeners` like +`advertisedListeners=internal:pulsar://192.168.1.11:6660,internal:pulsar+ssl://192.168.1.11:6651`.
+ +- The `internalListenerName` is used to specify the internal service URL that the broker uses. You can specify the `internalListenerName` by choosing one of the `advertisedListeners`. The broker uses the listener name of the first advertised listener as the `internalListenerName` if the `internalListenerName` is absent. + +After setting up the `advertisedListeners`, clients can choose one of the listeners as the service URL to create a connection to the broker as long as the network is accessible. However, if the client creates producers or consumers on a topic, the client must send a lookup request to the broker for getting the owner broker, then connect to the owner broker to publish messages or consume messages. Therefore, you must allow the client to get the corresponding service URL with the same advertised listener name as the one used by the client. This helps keep client-side simple and secure. + +## Use multiple advertised listeners + +This example shows how a Pulsar client uses multiple advertised listeners. + +1. Configure multiple advertised listeners in the broker configuration file. + +```shell + +advertisedListeners={listenerName}:pulsar://xxxx:6650, +{listenerName}:pulsar+ssl://xxxx:6651 + +``` + +2. Specify the listener name for the client. + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://xxxx:6650") + .listenerName("external") + .build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-overview.md b/site2/website/versioned_docs/version-2.10.x/concepts-overview.md new file mode 100644 index 0000000000000..e8a2f4b9d321a --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-overview.md @@ -0,0 +1,31 @@ +--- +id: concepts-overview +title: Pulsar Overview +sidebar_label: "Overview" +original_id: concepts-overview +--- + +Pulsar is a multi-tenant, high-performance solution for server-to-server messaging.
Originally developed by Yahoo, Pulsar is under the stewardship of the [Apache Software Foundation](https://www.apache.org/). + +Key features of Pulsar are listed below: + +* Native support for multiple clusters in a Pulsar instance, with seamless [geo-replication](administration-geo.md) of messages across clusters. +* Very low publish and end-to-end latency. +* Seamless scalability to over a million topics. +* A simple [client API](concepts-clients.md) with bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). +* Multiple [subscription types](concepts-messaging.md#subscription-types) ([exclusive](concepts-messaging.md#exclusive), [shared](concepts-messaging.md#shared), and [failover](concepts-messaging.md#failover)) for topics. +* Guaranteed message delivery with [persistent message storage](concepts-architecture-overview.md#persistent-storage) provided by [Apache BookKeeper](http://bookkeeper.apache.org/). +* A serverless light-weight computing framework [Pulsar Functions](functions-overview.md) offers the capability for stream-native data processing. +* A serverless connector framework [Pulsar IO](io-overview.md), which is built on Pulsar Functions, makes it easier to move data in and out of Apache Pulsar. +* [Tiered Storage](concepts-tiered-storage.md) offloads data from hot/warm storage to cold/long-term storage (such as S3 and GCS) when the data is aging out. 
+ +## Contents + +- [Messaging Concepts](concepts-messaging.md) +- [Architecture Overview](concepts-architecture-overview.md) +- [Pulsar Clients](concepts-clients.md) +- [Geo Replication](concepts-replication.md) +- [Multi Tenancy](concepts-multi-tenancy.md) +- [Authentication and Authorization](concepts-authentication.md) +- [Topic Compaction](concepts-topic-compaction.md) +- [Tiered Storage](concepts-tiered-storage.md) diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-proxy-sni-routing.md b/site2/website/versioned_docs/version-2.10.x/concepts-proxy-sni-routing.md new file mode 100644 index 0000000000000..7eee6df5512a2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-proxy-sni-routing.md @@ -0,0 +1,180 @@ +--- +id: concepts-proxy-sni-routing +title: Proxy support with SNI routing +sidebar_label: "Proxy support with SNI routing" +original_id: concepts-proxy-sni-routing +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +A proxy server is an intermediary server that forwards requests from multiple clients to different servers across the Internet. The proxy server acts as a "traffic cop" in both forward and reverse proxy scenarios, and benefits your system such as load balancing, performance, security, auto-scaling, and so on. + +The proxy in Pulsar acts as a reverse proxy, and creates a gateway in front of brokers. Proxies such as Apache Traffic Server (ATS), HAProxy, Nginx, and Envoy are not supported in Pulsar. These proxy-servers support **SNI routing**. SNI routing is used to route traffic to a destination without terminating the SSL connection. Layer 4 routing provides greater transparency because the outbound connection is determined by examining the destination address in the client TCP packets. 
+ +Pulsar clients (Java, C++, Python) support [SNI routing protocol](https://github.com/apache/pulsar/wiki/PIP-60:-Support-Proxy-server-with-SNI-routing), so you can connect to brokers through the proxy. This document walks you through how to set up the ATS proxy, enable SNI routing, and connect Pulsar client to the broker through the ATS proxy. + +## ATS-SNI Routing in Pulsar +To support [layer-4 SNI routing](https://docs.trafficserver.apache.org/en/latest/admin-guide/layer-4-routing.en.html) with ATS, the inbound connection must be a TLS connection. Pulsar client supports SNI routing protocol on TLS connection, so when Pulsar clients connect to broker through ATS proxy, Pulsar uses ATS as a reverse proxy. + +Pulsar supports SNI routing for geo-replication, so brokers can connect to brokers in other clusters through the ATS proxy. + +This section explains how to set up and use ATS as a reverse proxy, so Pulsar clients can connect to brokers through the ATS proxy using the SNI routing protocol on TLS connection. + +### Set up ATS Proxy for layer-4 SNI routing +To support layer 4 SNI routing, you need to configure the `records.config` and `ssl_server_name.config` files. + +![Pulsar client SNI](/assets/pulsar-sni-client.png) + +The [records.config](https://docs.trafficserver.apache.org/en/latest/admin-guide/files/records.config.en.html) file is located in the `/usr/local/etc/trafficserver/` directory by default. The file lists configurable variables used by the ATS. + +To configure the `records.config` file, complete the following steps. +1. Update TLS port (`http.server_ports`) on which proxy listens, and update proxy certs (`ssl.client.cert.path` and `ssl.client.cert.filename`) to secure TLS tunneling. +2. Configure server ports (`http.connect_ports`) used for tunneling to the broker. If Pulsar brokers are listening on `4443` and `6651` ports, add the brokers service port in the `http.connect_ports` configuration. + +The following is an example.
+ +``` + +# PROXY TLS PORT +CONFIG proxy.config.http.server_ports STRING 4443:ssl 4080 +# PROXY CERTS FILE PATH +CONFIG proxy.config.ssl.client.cert.path STRING /proxy-cert.pem +# PROXY KEY FILE PATH +CONFIG proxy.config.ssl.client.cert.filename STRING /proxy-key.pem + + +# The range of origin server ports that can be used for tunneling via CONNECT. # Traffic Server allows tunnels only to the specified ports. Supports both wildcards (*) and ranges (e.g. 0-1023). +CONFIG proxy.config.http.connect_ports STRING 4443 6651 + +``` + +The `ssl_server_name` file is used to configure TLS connection handling for inbound and outbound connections. The configuration is determined by the SNI values provided by the inbound connection. The file consists of a set of configuration items, and each is identified by an SNI value (`fqdn`). When an inbound TLS connection is made, the SNI value from the TLS negotiation is matched with the items specified in this file. If the values match, the values specified in that item override the default values. + +The following example shows mapping of the inbound SNI hostname coming from the client, and the actual broker service URL where request should be redirected. For example, if the client sends the SNI header `pulsar-broker1`, the proxy creates a TLS tunnel by redirecting request to the `pulsar-broker1:6651` service URL. + +``` + +server_config = { + { + fqdn = 'pulsar-broker-vip', + # Forward to Pulsar broker which is listening on 6651 + tunnel_route = 'pulsar-broker-vip:6651' + }, + { + fqdn = 'pulsar-broker1', + # Forward to Pulsar broker-1 which is listening on 6651 + tunnel_route = 'pulsar-broker1:6651' + }, + { + fqdn = 'pulsar-broker2', + # Forward to Pulsar broker-2 which is listening on 6651 + tunnel_route = 'pulsar-broker2:6651' + }, +} + +``` + +After you configure the `ssl_server_name.config` and `records.config` files, the ATS-proxy server handles SNI routing and creates TCP tunnel between the client and the broker. 
+ +### Configure Pulsar-client with SNI routing +ATS SNI-routing works only with TLS. You need to enable TLS for the ATS proxy and brokers first, configure the SNI routing protocol, and then connect Pulsar clients to brokers through ATS proxy. Pulsar clients support SNI routing by connecting to the proxy, and sending the target broker URL in the SNI header. This process is handled internally. You only need to configure the following proxy configuration initially when you create a Pulsar client to use the SNI routing protocol. + +````mdx-code-block + + + + +```java + +String brokerServiceUrl = "pulsar+ssl://pulsar-broker-vip:6651/"; +String proxyUrl = "pulsar+ssl://ats-proxy:443"; +ClientBuilder clientBuilder = PulsarClient.builder() + .serviceUrl(brokerServiceUrl) + .tlsTrustCertsFilePath(TLS_TRUST_CERT_FILE_PATH) + .enableTls(true) + .allowTlsInsecureConnection(false) + .proxyServiceUrl(proxyUrl, ProxyProtocol.SNI) + .operationTimeout(1000, TimeUnit.MILLISECONDS); + +Map authParams = new HashMap(); +authParams.put("tlsCertFile", TLS_CLIENT_CERT_FILE_PATH); +authParams.put("tlsKeyFile", TLS_CLIENT_KEY_FILE_PATH); +clientBuilder.authentication(AuthenticationTls.class.getName(), authParams); + +PulsarClient pulsarClient = clientBuilder.build(); + +``` + + + + +```c++ + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/cacert.pem"); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create( + "/path/to/client-cert.pem", "/path/to/client-key.pem")); + +Client client("pulsar+ssl://ats-proxy:443", config); + +``` + + + + +```python + +from pulsar import Client, AuthenticationTLS + +auth = AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://ats-proxy:443", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) + +``` + + + + +```` + +### Pulsar
geo-replication with SNI routing +You can use the ATS proxy for geo-replication. Pulsar brokers can connect to brokers in geo-replication by using SNI routing. To enable SNI routing for broker connection cross clusters, you need to configure SNI proxy URL to the cluster metadata. If you have configured SNI proxy URL in the cluster metadata, you can connect to broker cross clusters through the proxy over SNI routing. + +![Pulsar client SNI](/assets/pulsar-sni-geo.png) + +In this example, a Pulsar cluster is deployed into two separate regions, `us-west` and `us-east`. Both regions are configured with ATS proxy, and brokers in each region run behind the ATS proxy. We configure the cluster metadata for both clusters, so brokers in one cluster can use SNI routing and connect to brokers in other clusters through the ATS proxy. + +(a) Configure the cluster metadata for `us-east` with `us-east` broker service URL and `us-east` ATS proxy URL with SNI proxy-protocol. + +``` + +./pulsar-admin clusters update \ +--broker-url-secure pulsar+ssl://east-broker-vip:6651 \ +--url http://east-broker-vip:8080 \ +--proxy-protocol SNI \ +--proxy-url pulsar+ssl://east-ats-proxy:443 + +``` + +(b) Configure the cluster metadata for `us-west` with `us-west` broker service URL and `us-west` ATS proxy URL with SNI proxy-protocol. 
+ +``` + +./pulsar-admin clusters update \ +--broker-url-secure pulsar+ssl://west-broker-vip:6651 \ +--url http://west-broker-vip:8080 \ +--proxy-protocol SNI \ +--proxy-url pulsar+ssl://west-ats-proxy:443 + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-replication.md b/site2/website/versioned_docs/version-2.10.x/concepts-replication.md new file mode 100644 index 0000000000000..1ac455c702832 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-replication.md @@ -0,0 +1,69 @@ +--- +id: concepts-replication +title: Geo Replication +sidebar_label: "Geo Replication" +original_id: concepts-replication +--- + +Regardless of industries, when an unforeseen event occurs and brings day-to-day operations to a halt, an organization needs a well-prepared disaster recovery plan to quickly restore service to clients. However, a disaster recovery plan usually requires a multi-datacenter deployment with geographically dispersed data centers. Such a multi-datacenter deployment requires a geo-replication mechanism to provide additional redundancy in case a data center fails. + +Pulsar's geo-replication mechanism is typically used for disaster recovery, enabling the replication of persistently stored message data across multiple data centers. For instance, your application is publishing data in one region and you would like to process it for consumption in other regions. With Pulsar’s geo-replication mechanism, messages can be produced and consumed in different geo-locations. + +The diagram below illustrates the process of [geo-replication](administration-geo.md). Whenever three producers (P1, P2 and P3) respectively publish messages to the T1 topic in three clusters, those messages are instantly replicated across clusters. Once the messages are replicated, two consumers (C1 and C2) can consume those messages from their clusters. 
+ +![A typical geo-replication example with full-mesh pattern](/assets/full-mesh-replication.svg) + +## Replication mechanisms + +The geo-replication mechanism can be categorized into synchronous geo-replication and asynchronous geo-replication strategies. Pulsar supports both replication mechanisms. + +### Asynchronous geo-replication in Pulsar + +An asynchronous geo-replicated cluster is composed of multiple physical clusters set up in different datacenters. Messages produced on a Pulsar topic are first persisted to the local cluster and then replicated asynchronously to the remote clusters by brokers. + +![An example of asynchronous geo-replication mechanism](/assets/geo-replication-async.svg) + +In normal cases, when there are no connectivity issues, messages are replicated immediately, at the same time as they are dispatched to local consumers. Typically, end-to-end delivery latency is defined by the network round-trip time (RTT) between the data centers. Applications can create producers and consumers in any of the clusters, even when the remote clusters are not reachable (for example, during a network partition). + +Asynchronous geo-replication provides lower latency but may result in weaker consistency guarantees due to the potential replication lag that some data hasn’t been replicated. + +### Synchronous geo-replication via BookKeeper + +In synchronous geo-replication, data is synchronously replicated to multiple data centers and the client has to wait for an acknowledgment from the other data centers. As illustrated below, when the client issues a write request to one cluster, the written data will be replicated to the other two data centers. The write request is only acknowledged to the client when the majority of data centers (in this example, at least 2 data centers) have acknowledged that the write has been persisted. 
+ +![An example of synchronous geo-replication mechanism](/assets/geo-replication-sync.svg) + +Synchronous geo-replication in Pulsar is achieved by BookKeeper. A synchronous geo-replicated cluster consists of a cluster of bookies and a cluster of brokers that run in multiple data centers, and a global Zookeeper installation (a ZooKeeper ensemble is running across multiple data centers). You need to configure a BookKeeper region-aware placement policy to store data across multiple data centers and guarantee availability constraints on writes. + +Synchronous geo-replication provides the highest availability and also guarantees stronger data consistency between different data centers. However, your applications have to pay an extra latency penalty across data centers. + + +## Replication patterns + +Pulsar provides a great degree of flexibility for customizing your replication strategy. You can set up different replication patterns to serve your replication strategy for an application between multiple data centers. + +### Full-mesh replication + +Using full-mesh replication and applying the [selective message replication](administration-geo.md/#selective-replication), you can customize your replication strategies and topologies between any number of datacenters. + +![An example of full-mesh replication pattern](/assets/full-mesh-replication.svg) + +### Active-active replication + +Active-active replication is a variation of full-mesh replication, with only two data centers. Producers are able to run at any data center to produce messages, and consumers can consume all messages from all data centers. + +![An example of active-active replication pattern](/assets/active-active-replication.svg) + +For how to use active-active replication to migrate data between clusters, refer to [here](administration-geo.md/#migrate-data-between-clusters-using-geo-replication). + +### Active-standby replication + +Active-standby replication is a variation of active-active replication. 
Producers send messages to the active data center while messages are replicated to the standby data center for backup. If the active data center goes down, the standby data center takes over and becomes the active one. + +![An example of active-standby replication pattern](/assets/active-standby-replication.svg) + +### Aggregation replication + +The aggregation replication pattern is typically used when replicating messages from the edge to the cloud. For example, assume you have 3 clusters in 3 fronting datacenters and one aggregated cluster in a central data center, and you want to replicate messages from multiple fronting datacenters to the central data center for aggregation purposes. You can then create an individual namespace for the topics used by each fronting data center and assign the aggregated data center to those namespaces. + +![An example of aggregation replication pattern](/assets/aggregation-replication.svg) diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-tiered-storage.md b/site2/website/versioned_docs/version-2.10.x/concepts-tiered-storage.md new file mode 100644 index 0000000000000..b45ccea5888bf --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-tiered-storage.md @@ -0,0 +1,18 @@ +--- +id: concepts-tiered-storage +title: Tiered Storage +sidebar_label: "Tiered Storage" +original_id: concepts-tiered-storage +--- + +Pulsar's segment oriented architecture allows for topic backlogs to grow very large, effectively without limit. However, this can become expensive over time. + +One way to alleviate this cost is to use Tiered Storage. With tiered storage, older messages in the backlog can be moved from BookKeeper to a cheaper storage mechanism, while still allowing clients to access the backlog as if nothing had changed. + +![Tiered Storage](/assets/pulsar-tiered-storage.png) + +> Data written to BookKeeper is replicated to 3 physical machines by default. 
However, once a segment is sealed in BookKeeper it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. + +Pulsar currently supports S3, Google Cloud Storage (GCS), and filesystem for [long term store](cookbooks-tiered-storage.md). Offloading to long term storage is triggered via a REST API or command line interface. The user passes in the amount of topic data they wish to retain on BookKeeper, and the broker will copy the backlog data to long term storage. The original data will then be deleted from BookKeeper after a configured delay (4 hours by default). + +> For a guide to setting up tiered storage, see the [Tiered storage cookbook](cookbooks-tiered-storage.md). diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-topic-compaction.md b/site2/website/versioned_docs/version-2.10.x/concepts-topic-compaction.md new file mode 100644 index 0000000000000..34b7ed7fbbd31 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-topic-compaction.md @@ -0,0 +1,37 @@ +--- +id: concepts-topic-compaction +title: Topic Compaction +sidebar_label: "Topic Compaction" +original_id: concepts-topic-compaction +--- + +Pulsar was built with highly scalable [persistent storage](concepts-architecture-overview.md#persistent-storage) of message data as a primary objective. Pulsar topics enable you to persistently store as many unacknowledged messages as you need while preserving message ordering. By default, Pulsar stores *all* unacknowledged/unprocessed messages produced on a topic. Accumulating many unacknowledged messages on a topic is necessary for many Pulsar use cases but it can also be very time intensive for Pulsar consumers to "rewind" through the entire log of messages. 
+ +> For a more practical guide to topic compaction, see the [Topic compaction cookbook](cookbooks-compaction.md). + +For some use cases consumers don't need a complete "image" of the topic log. They may only need a few values to construct a more "shallow" image of the log, perhaps even just the most recent value. For these kinds of use cases Pulsar offers **topic compaction**. When you run compaction on a topic, Pulsar goes through a topic's backlog and removes messages that are *obscured* by later messages, i.e. it goes through the topic on a per-key basis and leaves only the most recent message associated with that key. + +Pulsar's topic compaction feature: + +* Allows for faster "rewind" through topic logs +* Applies only to [persistent topics](concepts-architecture-overview.md#persistent-storage) +* Triggered automatically when the backlog reaches a certain size or can be triggered manually via the command line. See the [Topic compaction cookbook](cookbooks-compaction.md) +* Is conceptually and operationally distinct from [retention and expiry](concepts-messaging.md#message-retention-and-expiry). Topic compaction *does*, however, respect retention. If retention has removed a message from the message backlog of a topic, the message will also not be readable from the compacted topic ledger. + +> #### Topic compaction example: the stock ticker +> An example use case for a compacted Pulsar topic would be a stock ticker topic. On a stock ticker topic, each message bears a timestamped dollar value for stocks for purchase (with the message key holding the stock symbol, e.g. `AAPL` or `GOOG`). With a stock ticker you may care only about the most recent value(s) of the stock and have no interest in historical data (i.e. you don't need to construct a complete image of the topic's sequence of messages per key). Compaction would be highly beneficial in this case because it would keep consumers from needing to rewind through obscured messages. 
+ + +## How topic compaction works + +When topic compaction is triggered [via the CLI](cookbooks-compaction.md), Pulsar will iterate over the entire topic from beginning to end. For each key that it encounters the compaction routine will keep a record of the latest occurrence of that key. + +After that, the broker will create a new [BookKeeper ledger](concepts-architecture-overview.md#ledgers) and make a second iteration through each message on the topic. For each message, if the key matches the latest occurrence of that key, then the key's data payload, message ID, and metadata will be written to the newly created ledger. If the key doesn't match the latest then the message will be skipped and left alone. If any given message has an empty payload, it will be skipped and considered deleted (akin to the concept of [tombstones](https://en.wikipedia.org/wiki/Tombstone_(data_store)) in key-value databases). At the end of this second iteration through the topic, the newly created BookKeeper ledger is closed and two things are written to the topic's metadata: the ID of the BookKeeper ledger and the message ID of the last compacted message (this is known as the **compaction horizon** of the topic). Once this metadata is written compaction is complete. + +After the initial compaction operation, the Pulsar [broker](reference-terminology.md#broker) that owns the topic is notified whenever any future changes are made to the compaction horizon and compacted backlog. 
When such changes occur: + +* Clients (consumers and readers) that have read compacted enabled will attempt to read messages from a topic and either: + * Read from the topic like normal (if the message ID is greater than or equal to the compaction horizon) or + * Read beginning at the compaction horizon (if the message ID is lower than the compaction horizon) + + diff --git a/site2/website/versioned_docs/version-2.10.x/concepts-transactions.md b/site2/website/versioned_docs/version-2.10.x/concepts-transactions.md new file mode 100644 index 0000000000000..08490ba06b5d7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/concepts-transactions.md @@ -0,0 +1,30 @@ +--- +id: transactions +title: Transactions +sidebar_label: "Overview" +original_id: transactions +--- + +Transactional semantics enable event streaming applications to consume, process, and produce messages in one atomic operation. In Pulsar, a producer or consumer can work with messages across multiple topics and partitions and ensure those messages are processed as a single unit. + +The following concepts help you understand Pulsar transactions. + +## Transaction coordinator and transaction log +The transaction coordinator maintains the topics and subscriptions that interact in a transaction. When a transaction is committed, the transaction coordinator interacts with the topic owner broker to complete the transaction. + +The transaction coordinator maintains the entire life cycle of transactions, and prevents a transaction from incorrect status. + +The transaction coordinator handles transaction timeout, and ensures that the transaction is aborted after a transaction timeout. + +All the transaction metadata is persisted in the transaction log. The transaction log is backed by a Pulsar topic. After the transaction coordinator crashes, it can restore the transaction metadata from the transaction log. + +## Transaction ID +The transaction ID (TxnID) identifies a unique transaction in Pulsar. 
The transaction ID is 128-bit. The highest 16 bits are reserved for the ID of the transaction coordinator, and the remaining bits are used for monotonically increasing numbers in each transaction coordinator. It is easy to locate the transaction crash with the TxnID. + +## Transaction buffer +Messages produced within a transaction are stored in the transaction buffer. The messages in the transaction buffer are not materialized (visible) to consumers until the transactions are committed. The messages in the transaction buffer are discarded when the transactions are aborted. + +## Pending acknowledge state +Message acknowledgements within a transaction are maintained by the pending acknowledge state before the transaction completes. If a message is in the pending acknowledge state, the message cannot be acknowledged by other transactions until the message is removed from the pending acknowledge state. + +The pending acknowledge state is persisted to the pending acknowledge log. The pending acknowledge log is backed by a Pulsar topic. A new broker can restore the state from the pending acknowledge log to ensure the acknowledgement is not lost. diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-bookkeepermetadata.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-bookkeepermetadata.md new file mode 100644 index 0000000000000..b0fa98dc3b65d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-bookkeepermetadata.md @@ -0,0 +1,21 @@ +--- +id: cookbooks-bookkeepermetadata +title: BookKeeper Ledger Metadata +original_id: cookbooks-bookkeepermetadata +--- + +Pulsar stores data on BookKeeper ledgers. You can understand the contents of a ledger by inspecting the metadata attached to the ledger. +Such metadata are stored on ZooKeeper and they are readable using BookKeeper APIs. 
+ +Description of current metadata: + +| Scope | Metadata name | Metadata value | +| ------------- | ------------- | ------------- | +| All ledgers | application | 'pulsar' | +| All ledgers | component | 'managed-ledger', 'schema', 'compacted-topic' | +| Managed ledgers | pulsar/managed-ledger | name of the ledger | +| Cursor | pulsar/cursor | name of the cursor | +| Compacted topic | pulsar/compactedTopic | name of the original topic | +| Compacted topic | pulsar/compactedTo | id of the last compacted message | + + diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-compaction.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-compaction.md new file mode 100644 index 0000000000000..dfa314727241a --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-compaction.md @@ -0,0 +1,142 @@ +--- +id: cookbooks-compaction +title: Topic compaction +sidebar_label: "Topic compaction" +original_id: cookbooks-compaction +--- + +Pulsar's [topic compaction](concepts-topic-compaction.md#compaction) feature enables you to create **compacted** topics in which older, "obscured" entries are pruned from the topic, allowing for faster reads through the topic's history (which messages are deemed obscured/outdated/irrelevant will depend on your use case). + +To use compaction: + +* You need to give messages keys, as topic compaction in Pulsar takes place on a *per-key basis* (i.e. messages are compacted based on their key). For a stock ticker use case, the stock symbol---e.g. `AAPL` or `GOOG`---could serve as the key (more on this [below](#when-should-i-use-compacted-topics)). Messages without keys will be left alone by the compaction process. +* Compaction can be configured to run [automatically](#configuring-compaction-to-run-automatically), or you can manually [trigger](#triggering-compaction-manually) compaction using the Pulsar administrative API. 
+* Your consumers must be [configured](#consumer-configuration) to read from compacted topics ([Java consumers](#java), for example, have a `readCompacted` setting that must be set to `true`). If this configuration is not set, consumers will still be able to read from the non-compacted topic. + + +> Compaction only works on messages that have keys (as in the stock ticker example the stock symbol serves as the key for each message). Keys can thus be thought of as the axis along which compaction is applied. Messages that don't have keys are simply ignored by compaction. + +## When should I use compacted topics? + +The classic example of a topic that could benefit from compaction would be a stock ticker topic through which consumers can access up-to-date values for specific stocks. Imagine a scenario in which messages carrying stock value data use the stock symbol as the key (`GOOG`, `AAPL`, `TWTR`, etc.). Compacting this topic would give consumers on the topic two options: + +* They can read from the "original," non-compacted topic in case they need access to "historical" values, i.e. the entirety of the topic's messages. +* They can read from the compacted topic if they only want to see the most up-to-date messages. + +Thus, if you're using a Pulsar topic called `stock-values`, some consumers could have access to all messages in the topic (perhaps because they're performing some kind of number crunching of all values in the last hour) while the consumers used to power the real-time stock ticker only see the compacted topic (and thus aren't forced to process outdated messages). Which variant of the topic any given consumer pulls messages from is determined by the consumer's [configuration](#consumer-configuration). + +> One of the benefits of compaction in Pulsar is that you aren't forced to choose between compacted and non-compacted topics, as the compaction process leaves the original topic as-is and essentially adds an alternate topic. 
In other words, you can run compaction on a topic and consumers that need access to the non-compacted version of the topic will not be adversely affected. + + +## Configuring compaction to run automatically + +Tenant administrators can configure a policy for compaction at the namespace level. The policy specifies how large the topic backlog can grow before compaction is triggered. + +For example, to trigger compaction when the backlog reaches 100MB: + +```bash + +$ bin/pulsar-admin namespaces set-compaction-threshold \ + --threshold 100M my-tenant/my-namespace + +``` + +Configuring the compaction threshold on a namespace will apply to all topics within that namespace. + +## Triggering compaction manually + +In order to run compaction on a topic, you need to use the [`topics compact`](reference-pulsar-admin.md#topics-compact) command for the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. Here's an example: + +```bash + +$ bin/pulsar-admin topics compact \ + persistent://my-tenant/my-namespace/my-topic + +``` + +The `pulsar-admin` tool runs compaction via the Pulsar {@inject: rest:REST:/} API. To run compaction in its own dedicated process, i.e. *not* through the REST API, you can use the [`pulsar compact-topic`](reference-cli-tools.md#pulsar-compact-topic) command. Here's an example: + +```bash + +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic + +``` + +> Running compaction in its own process is recommended when you want to avoid interfering with the broker's performance. Broker performance should only be affected, however, when running compaction on topics with a large keyspace (i.e. when there are many keys on the topic). The first phase of the compaction process keeps a copy of each key in the topic, which can create memory pressure as the number of keys grows. 
Using the `pulsar-admin topics compact` command to run compaction through the REST API should present no issues in the overwhelming majority of cases; using `pulsar compact-topic` should correspondingly be considered an edge case. + +The `pulsar compact-topic` command communicates with [ZooKeeper](https://zookeeper.apache.org) directly. In order to establish communication with ZooKeeper, though, the `pulsar` CLI tool will need to have a valid [broker configuration](reference-configuration.md#broker). You can either supply a proper configuration in `conf/broker.conf` or specify a non-default location for the configuration: + +```bash + +$ bin/pulsar compact-topic \ + --broker-conf /path/to/broker.conf \ + --topic persistent://my-tenant/my-namespace/my-topic + +# If the configuration is in conf/broker.conf +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic + +``` + +#### When should I trigger compaction? + +How often you [trigger compaction](#triggering-compaction-manually) will vary widely based on the use case. If you want a compacted topic to be extremely speedy on read, then you should run compaction fairly frequently. + +## Consumer configuration + +Pulsar consumers and readers need to be configured to read from compacted topics. The sections below show you how to enable compacted topic reads for Pulsar's language clients. + +### Java + +In order to read from a compacted topic using a Java consumer, the `readCompacted` parameter must be set to `true`. Here's an example consumer for a compacted topic: + +```java + +Consumer compactedTopicConsumer = client.newConsumer() + .topic("some-compacted-topic") + .readCompacted(true) + .subscribe(); + +``` + +As mentioned above, topic compaction in Pulsar works on a *per-key basis*. That means that messages that you produce on compacted topics need to have keys (the content of the key will depend on your use case). Messages that don't have keys will be ignored by the compaction process. 
Here's an example Pulsar message with a key: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); + +``` + +The example below shows a message with a key being produced on a compacted Pulsar topic: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer compactedTopicProducer = client.newProducer() + .topic("some-compacted-topic") + .create(); + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); + +compactedTopicProducer.send(msg); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-deduplication.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-deduplication.md new file mode 100644 index 0000000000000..f7f9e3d7bb425 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-deduplication.md @@ -0,0 +1,151 @@ +--- +id: cookbooks-deduplication +title: Message deduplication +sidebar_label: "Message deduplication" +original_id: cookbooks-deduplication +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +When **Message deduplication** is enabled, it ensures that each message produced on Pulsar topics is persisted to disk *only once*, even if the message is produced more than once. Message deduplication is handled automatically on the server side. + +To use message deduplication in Pulsar, you need to configure your Pulsar brokers and clients. + +## How it works + +You can enable or disable message deduplication at the namespace level or the topic level. 
By default, it is disabled on all namespaces or topics. You can enable it in the following ways: + +* Enable deduplication for all namespaces/topics at the broker-level. +* Enable deduplication for a specific namespace with the `pulsar-admin namespaces` interface. +* Enable deduplication for a specific topic with the `pulsar-admin topics` interface. + +## Configure message deduplication + +You can configure message deduplication in Pulsar using the [`broker.conf`](reference-configuration.md#broker) configuration file. The following deduplication-related parameters are available. + +Parameter | Description | Default +:---------|:------------|:------- +`brokerDeduplicationEnabled` | Sets the default behavior for message deduplication in the Pulsar broker. If it is set to `true`, message deduplication is enabled on all namespaces/topics. If it is set to `false`, you have to enable or disable deduplication at the namespace level or the topic level. | `false` +`brokerDeduplicationMaxNumberOfProducers` | The maximum number of producers for which information is stored for deduplication purposes. | `10000` +`brokerDeduplicationEntriesInterval` | The number of entries after which a deduplication informational snapshot is taken. A larger interval leads to fewer snapshots being taken, though this lengthens the topic recovery time (the time required for entries published after the snapshot to be replayed). | `1000` +`brokerDeduplicationSnapshotIntervalSeconds`| The time period after which a deduplication informational snapshot is taken. It runs simultaneously with `brokerDeduplicationEntriesInterval`. |`120` +`brokerDeduplicationProducerInactivityTimeoutMinutes` | The time of inactivity (in minutes) after which the broker discards deduplication information related to a disconnected producer. | `360` (6 hours) + +### Set default value at the broker-level + +By default, message deduplication is *disabled* on all Pulsar namespaces/topics. 
To enable it on all namespaces/topics, set the `brokerDeduplicationEnabled` parameter to `true` and restart the broker. + +Even if you set the value for `brokerDeduplicationEnabled`, enabling or disabling via Pulsar admin CLI overrides the default settings at the broker-level. + +### Enable message deduplication + +Though message deduplication is disabled by default at the broker level, you can enable message deduplication for a specific namespace or topic using the [`pulsar-admin namespaces set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) or the [`pulsar-admin topics set-deduplication`](reference-pulsar-admin.md#topic-set-deduplication) command. You can use the `--enable`/`-e` flag and specify the namespace/topic. + +The following example shows how to enable message deduplication at the namespace level. + +```bash + +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --enable # or just -e + +``` + +### Disable message deduplication + +Even if you enable message deduplication at the broker level, you can disable message deduplication for a specific namespace or topic using the [`pulsar-admin namespaces set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) or the [`pulsar-admin topics set-deduplication`](reference-pulsar-admin.md#topic-set-deduplication) command. Use the `--disable`/`-d` flag and specify the namespace/topic. + +The following example shows how to disable message deduplication at the namespace level. + +```bash + +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --disable # or just -d + +``` + +## Pulsar clients + +If you enable message deduplication in Pulsar brokers, you need to complete the following tasks for your client producers: + +1. Specify a name for the producer. +1. Set the message timeout to `0` (namely, no timeout). + +The instructions for Java, Python, and C++ clients are different. 
+ +````mdx-code-block + + + +To enable message deduplication on a [Java producer](client-libraries-java.md#producers), set the producer name using the `producerName` setter, and set the timeout to `0` using the `sendTimeout` setter. + +```java + +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; +import java.util.concurrent.TimeUnit; + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +Producer producer = pulsarClient.newProducer() + .producerName("producer-1") + .topic("persistent://public/default/topic-1") + .sendTimeout(0, TimeUnit.SECONDS) + .create(); + +``` + + + + +To enable message deduplication on a [Python producer](client-libraries-python.md#producers), set the producer name using `producer_name`, and set the timeout to `0` using `send_timeout_millis`. + +```python + +import pulsar + +client = pulsar.Client("pulsar://localhost:6650") +producer = client.create_producer( + "persistent://public/default/topic-1", + producer_name="producer-1", + send_timeout_millis=0) + +``` + + + + +To enable message deduplication on a [C++ producer](client-libraries-cpp.md#producer), set the producer name using `setProducerName`, and set the timeout to `0` using `setSendTimeout`. 
+ +```cpp + +#include + +std::string serviceUrl = "pulsar://localhost:6650"; +std::string topic = "persistent://some-tenant/ns1/topic-1"; +std::string producerName = "producer-1"; + +Client client(serviceUrl); + +ProducerConfiguration producerConfig; +producerConfig.setSendTimeout(0); +producerConfig.setProducerName(producerName); + +Producer producer; + +Result result = client.createProducer(topic, producerConfig, producer); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-encryption.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-encryption.md new file mode 100644 index 0000000000000..f0d8fb8735eb6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-encryption.md @@ -0,0 +1,184 @@ +--- +id: cookbooks-encryption +title: Pulsar Encryption +sidebar_label: "Encryption" +original_id: cookbooks-encryption +--- + +Pulsar encryption allows applications to encrypt messages at the producer and decrypt at the consumer. Encryption is performed using the public/private key pair configured by the application. Encrypted messages can only be decrypted by consumers with a valid key. + +## Asymmetric and symmetric encryption + +Pulsar uses dynamically generated symmetric AES key to encrypt messages(data). The AES key(data key) is encrypted using application provided ECDSA/RSA key pair, as a result there is no need to share the secret with everyone. + +Key is a public/private key pair used for encryption/decryption. The producer key is the public key, and the consumer key is the private key of the key pair. + +The application configures the producer with the public key. This key is used to encrypt the AES data key. The encrypted data key is sent as part of message header. Only entities with the private key(in this case the consumer) will be able to decrypt the data key which is used to decrypt the message. + +A message can be encrypted with more than one key. 
Any one of the keys used for encrypting the message is sufficient to decrypt the message. + +Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose/delete the private key, your message is irretrievably lost and unrecoverable. + +## Producer +![alt text](/assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer") + +## Consumer +![alt text](/assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer") + +## Here are the steps to get started: + +1. Create your ECDSA or RSA public/private key pair. + +```shell + +openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem +openssl ec -in test_ecdsa_privkey.pem -pubout -outform pkcs8 -out test_ecdsa_pubkey.pem + +``` + +2. Add the public and private key to the key management and configure your producers to retrieve public keys and consumer clients to retrieve private keys. +3. Implement CryptoKeyReader::getPublicKey() interface from producer and CryptoKeyReader::getPrivateKey() interface from consumer, which will be invoked by Pulsar client to load the key. +4. Add encryption key to producer configuration: conf.addEncryptionKey("myapp.key") +5. Add CryptoKeyReader implementation to producer/consumer config: conf.setCryptoKeyReader(keyReader) +6. 
Sample producer application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); + +ProducerConfiguration prodConf = new ProducerConfiguration(); +prodConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +prodConf.addEncryptionKey("myappkey"); + +Producer producer = pulsarClient.createProducer("persistent://my-tenant/my-ns/my-topic", prodConf); + +for (int i = 0; i < 10; i++) { + producer.send("my-message".getBytes()); +} + +pulsarClient.close(); + +``` + +7. 
Sample Consumer Application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +ConsumerConfiguration consConf = new ConsumerConfiguration(); +consConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); +Consumer consumer = pulsarClient.subscribe("persistent://my-tenant//my-ns/my-topic", "my-subscriber-name", consConf); +Message msg = null; + +for (int i = 0; i < 10; i++) { + msg = consumer.receive(); + // do something + System.out.println("Received: " + new String(msg.getData())); +} + +// Acknowledge the consumption of all messages at once +consumer.acknowledgeCumulative(msg); +pulsarClient.close(); + +``` + +## Key rotation +Pulsar generates new AES data key every 4 hours or after a certain number of messages are published. The asymmetric public key is automatically fetched by producer every 4 hours by calling CryptoKeyReader::getPublicKey() to retrieve the latest version. 
+ +## Enabling encryption at the producer application: +If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. This can be done in two ways: +1. The consumer application provides you access to their public key, which you add to your producer keys +1. You grant access to one of the private keys from the pairs used by producer + +In some cases, the producer may want to encrypt the messages with multiple keys. For this, add all such keys to the config. Consumer will be able to decrypt the message, as long as it has access to at least one of the keys. + +E.g: If messages need to be encrypted using 2 keys myapp.messagekey1 and myapp.messagekey2, + +```java + +conf.addEncryptionKey("myapp.messagekey1"); +conf.addEncryptionKey("myapp.messagekey2"); + +``` + +## Decrypting encrypted messages at the consumer application: +Consumers require access to one of the private keys to decrypt messages produced by the producer. If you would like to receive encrypted messages, create a public/private key pair and give your public key to the producer application to encrypt messages using your public key. + +## Handling Failures: +* Producer/ Consumer loses access to the key + * Producer action will fail indicating the cause of the failure. Application has the option to proceed with sending unencrypted message in such cases. Call conf.setCryptoFailureAction(ProducerCryptoFailureAction) to control the producer behavior. The default behavior is to fail the request. + * If consumption failed due to decryption failure or missing keys in consumer, application has the option to consume the encrypted message or discard it. Call conf.setCryptoFailureAction(ConsumerCryptoFailureAction) to control the consumer behavior. The default behavior is to fail the request. +Application will never be able to decrypt the messages if the private key is permanently lost. 
+* Batch messaging + * If decryption fails and the message contains batch messages, the client will not be able to retrieve individual messages in the batch, hence message consumption fails even if conf.setCryptoFailureAction() is set to CONSUME. +* If decryption fails, the message consumption stops and the application will notice backlog growth in addition to decryption failure messages in the client log. If the application does not have access to the private key to decrypt the message, the only option is to skip/discard backlogged messages. + diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-message-queue.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-message-queue.md new file mode 100644 index 0000000000000..eb43cbde5fb81 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-message-queue.md @@ -0,0 +1,127 @@ +--- +id: cookbooks-message-queue +title: Using Pulsar as a message queue +sidebar_label: "Message queue" +original_id: cookbooks-message-queue +--- + +Message queues are essential components of many large-scale data architectures. If every single work object that passes through your system absolutely *must* be processed in spite of the slowness or downright failure of this or that system component, there's a good chance that you'll need a message queue to step in and ensure that unprocessed data is retained---with correct ordering---until the required actions are taken. + +Pulsar is a great choice for a message queue because: + +* it was built with [persistent message storage](concepts-architecture-overview.md#persistent-storage) in mind +* it offers automatic load balancing across [consumers](reference-terminology.md#consumer) for messages on a topic (or custom load balancing if you wish) + +> You can use the same Pulsar installation to act as a real-time message bus and as a message queue if you wish (or just one or the other). 
You can set aside some topics for real-time purposes and other topics for message queue purposes (or use specific namespaces for either purpose if you wish). + + +# Client configuration changes + +To use a Pulsar [topic](reference-terminology.md#topic) as a message queue, you should distribute the receiver load on that topic across several consumers (the optimal number of consumers will depend on the load). Each consumer must: + +* Establish a [shared subscription](concepts-messaging.md#shared) and use the same subscription name as the other consumers (otherwise the subscription is not shared and the consumers can't act as a processing ensemble) +* If you'd like to have tight control over message dispatching across consumers, set the consumers' **receiver queue** size very low (potentially even to 0 if necessary). Each Pulsar [consumer](reference-terminology.md#consumer) has a receiver queue that determines how many messages the consumer will attempt to fetch at a time. A receiver queue of 1000 (the default), for example, means that the consumer will attempt to process 1000 messages from the topic's backlog upon connection. Setting the receiver queue to zero essentially means ensuring that each consumer is only doing one thing at a time. + + The downside to restricting the receiver queue size of consumers is that it limits the potential throughput of those consumers and cannot be used with [partitioned topics](reference-terminology.md#partitioned-topic). Whether the performance/control trade-off is worthwhile will depend on your use case. 
+ +## Java clients + +Here's an example Java consumer configuration that uses a shared subscription: + +```java + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; +import org.apache.pulsar.client.api.SubscriptionType; + +String SERVICE_URL = "pulsar://localhost:6650"; +String TOPIC = "persistent://public/default/mq-topic-1"; +String subscription = "sub-1"; + +PulsarClient client = PulsarClient.builder() + .serviceUrl(SERVICE_URL) + .build(); + +Consumer consumer = client.newConsumer() + .topic(TOPIC) + .subscriptionName(subscription) + .subscriptionType(SubscriptionType.Shared) + // If you'd like to restrict the receiver queue size + .receiverQueueSize(10) + .subscribe(); + +``` + +## Python clients + +Here's an example Python consumer configuration that uses a shared subscription: + +```python + +from pulsar import Client, ConsumerType + +SERVICE_URL = "pulsar://localhost:6650" +TOPIC = "persistent://public/default/mq-topic-1" +SUBSCRIPTION = "sub-1" + +client = Client(SERVICE_URL) +consumer = client.subscribe( + TOPIC, + SUBSCRIPTION, + # If you'd like to restrict the receiver queue size + receiver_queue_size=10, + consumer_type=ConsumerType.Shared) + +``` + +## C++ clients + +Here's an example C++ consumer configuration that uses a shared subscription: + +```cpp + +#include + +std::string serviceUrl = "pulsar://localhost:6650"; +std::string topic = "persistent://public/defaultmq-topic-1"; +std::string subscription = "sub-1"; + +Client client(serviceUrl); + +ConsumerConfiguration consumerConfig; +consumerConfig.setConsumerType(ConsumerType.ConsumerShared); +// If you'd like to restrict the receiver queue size +consumerConfig.setReceiverQueueSize(10); + +Consumer consumer; + +Result result = client.subscribe(topic, subscription, consumerConfig, consumer); + +``` + +## Go clients + +Here is an example of a Go consumer configuration that uses a shared subscription: + +```go + +import 
"github.com/apache/pulsar-client-go/pulsar" + +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "persistent://public/default/mq-topic-1", + SubscriptionName: "sub-1", + Type: pulsar.Shared, + ReceiverQueueSize: 10, // If you'd like to restrict the receiver queue size +}) +if err != nil { + log.Fatal(err) +} + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-non-persistent.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-non-persistent.md new file mode 100644 index 0000000000000..178301e86eb8d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-non-persistent.md @@ -0,0 +1,63 @@ +--- +id: cookbooks-non-persistent +title: Non-persistent messaging +sidebar_label: "Non-persistent messaging" +original_id: cookbooks-non-persistent +--- + +**Non-persistent topics** are Pulsar topics in which message data is *never* [persistently stored](concepts-architecture-overview.md#persistent-storage) and kept only in memory. This cookbook provides: + +* A basic [conceptual overview](#overview) of non-persistent topics +* Information about [configurable parameters](#configuration) related to non-persistent topics +* A guide to the [CLI interface](#cli) for managing non-persistent topics + +## Overview + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. 
When using non-persistent delivery, killing a Pulsar [broker](reference-terminology.md#broker) or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http + +non-persistent://tenant/namespace/topic + +``` + +> For more high-level information about non-persistent topics, see the [Concepts and Architecture](concepts-messaging.md#non-persistent-topics) documentation. + +## Using + +> In order to use non-persistent topics, they must be [enabled](#enabling) in your Pulsar broker configuration. + +In order to use non-persistent topics, you only need to differentiate them by name when interacting with them. This [`pulsar-client produce`](reference-cli-tools.md#pulsar-client-produce) command, for example, would produce one message on a non-persistent topic in a standalone cluster: + +```bash + +$ bin/pulsar-client produce non-persistent://public/default/example-np-topic \ + --num-produce 1 \ + --messages "This message will be stored only in memory" + +``` + +> For a more thorough guide to non-persistent topics from an administrative perspective, see the [Non-persistent topics](admin-api-topics.md) guide. + +## Enabling + +In order to enable non-persistent topics in a Pulsar broker, the [`enableNonPersistentTopics`](reference-configuration.md#broker-enableNonPersistentTopics) must be set to `true`. This is the default, and so you won't need to take any action to enable non-persistent messaging. + + +> #### Configuration for standalone mode +> If you're running Pulsar in standalone mode, the same configurable parameters are available but in the [`standalone.conf`](reference-configuration.md#standalone) configuration file. 
+ +If you'd like to enable *only* non-persistent topics in a broker, you can set the [`enablePersistentTopics`](reference-configuration.md#broker-enablePersistentTopics) parameter to `false` and the `enableNonPersistentTopics` parameter to `true`. + +## Managing with cli + +Non-persistent topics can be managed using the [`pulsar-admin non-persistent`](reference-pulsar-admin.md#non-persistent) command-line interface. With that interface you can perform actions like [create a partitioned non-persistent topic](reference-pulsar-admin.md#non-persistent-create-partitioned-topic), get [stats](reference-pulsar-admin.md#non-persistent-stats) for a non-persistent topic, [list](reference-pulsar-admin.md) non-persistent topics under a namespace, and more. + +## Using with Pulsar clients + +You shouldn't need to make any changes to your Pulsar clients to use non-persistent messaging beyond making sure that you use proper [topic names](#using) with `non-persistent` as the topic type. + diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-partitioned.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-partitioned.md new file mode 100644 index 0000000000000..fb9ac354cc6d6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-partitioned.md @@ -0,0 +1,7 @@ +--- +id: cookbooks-partitioned +title: Partitioned topics +sidebar_label: "Partitioned Topics" +original_id: cookbooks-partitioned +--- +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-retention-expiry.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-retention-expiry.md new file mode 100644 index 0000000000000..bb268ecf67166 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-retention-expiry.md @@ -0,0 +1,520 @@ +--- +id: cookbooks-retention-expiry +title: Message retention and expiry +sidebar_label: "Message retention and expiry" +original_id: cookbooks-retention-expiry +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar brokers are responsible for handling messages that pass through Pulsar, including [persistent storage](concepts-architecture-overview.md#persistent-storage) of messages. By default, for each topic, brokers only retain messages that are in at least one backlog. A backlog is the set of unacknowledged messages for a particular subscription. As a topic can have multiple subscriptions, a topic can have multiple backlogs. + +As a consequence, no messages are retained (by default) on a topic that has not had any subscriptions created for it. + +(Note that messages that are no longer being stored are not necessarily immediately deleted, and may in fact still be accessible until the next ledger rollover. Because clients cannot predict when rollovers may happen, it is not wise to rely on a rollover not happening at an inconvenient point in time.) + +In Pulsar, you can modify this behavior, with namespace granularity, in two ways: + +* You can persistently store messages that are not within a backlog (because they've been acknowledged on every existing subscription, or because there are no subscriptions) by setting [retention policies](#retention-policies). +* Messages that are not acknowledged within a specified timeframe can be automatically acknowledged, by specifying the [time to live](#time-to-live-ttl) (TTL). 
+ +Pulsar's [admin interface](admin-api-overview.md) enables you to manage both retention policies and TTL with namespace granularity (and thus within a specific tenant and either on a specific cluster or in the [`global`](concepts-architecture-overview.md#global-cluster) cluster). + + +> #### Retention and TTL solve two different problems +> * Message retention: Keep the data for at least X hours (even if acknowledged) +> * Time-to-live: Discard data after some time (by automatically acknowledging) +> +> Most applications will want to use at most one of these. + + +## Retention policies + +By default, when a Pulsar message arrives at a broker, the message is stored until it has been acknowledged on all subscriptions, at which point it is marked for deletion. You can override this behavior and retain messages that have already been acknowledged on all subscriptions by setting a *retention policy* for all topics in a given namespace. Retention is based on both a *size limit* and a *time limit*. + +The diagram below illustrates the concept of message retention. +![](/assets/retention.svg) + +Retention policies are useful when you use the Reader interface. The Reader interface does not use acknowledgements, and messages do not exist within backlogs. It is required to configure retention for Reader-only use cases. + +When you set a retention policy on topics in a namespace, you must set **both** a *size limit* (via `defaultRetentionSizeInMB`) and a *time limit* (via `defaultRetentionTimeInMinutes`) . You can refer to the following table to set retention policies in `pulsar-admin` and Java. 
+ +|Time limit|Size limit| Message retention | +|----------|----------|------------------------| +| -1 | -1 | Infinite retention | +| -1 | >0 | Based on the size limit | +| >0 | -1 | Based on the time limit | +| 0 | 0 | Disable message retention (by default) | +| 0 | >0 | Invalid | +| >0 | 0 | Invalid | +| >0 | >0 | Acknowledged messages or messages with no active subscription will not be retained when either time or size reaches the limit. | + +The retention settings apply to all messages on topics that do not have any subscriptions, or to messages that have been acknowledged by all subscriptions. The retention policy settings do not affect unacknowledged messages on topics with subscriptions. The unacknowledged messages are controlled by the backlog quota. + +When a retention limit on a topic is exceeded, the oldest message is marked for deletion until the set of retained messages falls within the specified limits again. + +### Defaults + +You can set message retention at instance level with the following two parameters: `defaultRetentionTimeInMinutes` and `defaultRetentionSizeInMB`. Both parameters are set to `0` by default. + +For more information of the two parameters, refer to the [`broker.conf`](reference-configuration.md#broker) configuration file. + +### Set retention policy + +You can set a retention policy for a namespace by specifying the namespace, a size limit and a time limit in `pulsar-admin`, REST API and Java. + +````mdx-code-block + + + +You can use the [`set-retention`](reference-pulsar-admin.md#namespaces-set-retention) subcommand and specify a namespace, a size limit using the `-s`/`--size` flag, and a time limit using the `-t`/`--time` flag. + +In the following example, the size limit is set to 10 GB and the time limit is set to 3 hours for each topic within the `my-tenant/my-ns` namespace. +- When the size of messages reaches 10 GB on a topic within 3 hours, the acknowledged messages will not be retained. 
+- After 3 hours, even if the message size is less than 10 GB, the acknowledged messages will not be retained. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 10G \ + --time 3h + +``` + +In the following example, the time is not limited and the size limit is set to 1 TB. The size limit determines the retention. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 1T \ + --time -1 + +``` + +In the following example, the size is not limited and the time limit is set to 3 hours. The time limit determines the retention. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size -1 \ + --time 3h + +``` + +To achieve infinite retention, set both values to `-1`. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size -1 \ + --time -1 + +``` + +To disable the retention policy, set both values to `0`. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 0 \ + --time 0 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/retention|operation/setRetention?version=@pulsar:version_number@} + +:::note + +To disable the retention policy, you need to set both the size and time limit to `0`. Set either size or time limit to `0` is invalid. + +::: + + + + +```java + +int retentionTime = 10; // 10 minutes +int retentionSize = 500; // 500 megabytes +RetentionPolicies policies = new RetentionPolicies(retentionTime, retentionSize); +admin.namespaces().setRetention(namespace, policies); + +``` + + + + +```` + +### Get retention policy + +You can fetch the retention policy for a namespace by specifying the namespace. The output will be a JSON object with two keys: `retentionTimeInMinutes` and `retentionSizeInMB`. + +````mdx-code-block + + + +Use the [`get-retention`](reference-pulsar-admin.md#namespaces) subcommand and specify the namespace. 
+ +##### Example + +```shell + +$ pulsar-admin namespaces get-retention my-tenant/my-ns +{ + "retentionTimeInMinutes": 10, + "retentionSizeInMB": 500 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/retention|operation/getRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getRetention(namespace); + +``` + + + + +```` + +## Backlog quotas + +*Backlogs* are sets of unacknowledged messages for a topic that have been stored by bookies. Pulsar stores all unacknowledged messages in backlogs until they are processed and acknowledged. + +You can control the allowable size and/or time of backlogs, at the namespace level, using *backlog quotas*. Pulsar uses a quota to enforce a hard limit on the logical size of the backlogs in a topic. Backlog quota triggers an alert policy (for example, producer exception) once the quota limit is reached. + +The diagram below illustrates the concept of backlog quota. +![](/assets/backlog-quota.svg) + +Setting a backlog quota involves setting: + +* an allowable *size and/or time threshold* for each topic in the namespace +* a *retention policy* that determines which action the [broker](reference-terminology.md#broker) takes if the threshold is exceeded. + +The following retention policies are available: + +Policy | Action +:------|:------ +`producer_request_hold` | The broker will hold and not persist produce request payload +`producer_exception` | The broker will disconnect from the client by throwing an exception +`consumer_backlog_eviction` | The broker will begin discarding backlog messages + + +> #### Beware the distinction between retention policy types +> As you may have noticed, there are two definitions of the term "retention policy" in Pulsar, one that applies to persistent storage of messages not in backlogs, and one that applies to messages within backlogs. + + +Backlog quotas are handled at the namespace level. 
They can be managed via: + +### Set size/time thresholds and backlog retention policies + +You can set a size and/or time threshold and backlog retention policy for all of the topics in a [namespace](reference-terminology.md#namespace) by specifying the namespace, a size limit and/or a time limit in second, and a policy by name. + +````mdx-code-block + + + +Use the [`set-backlog-quota`](reference-pulsar-admin.md#namespaces) subcommand and specify a namespace, a size limit using the `-l`/`--limit` , `-lt`/`--limitTime` flag to limit backlog, a retention policy using the `-p`/`--policy` flag and a policy type using `-t`/`--type` (default is destination_storage). + +##### Example + +```shell + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ + --limit 2G \ + --policy producer_request_hold + +``` + +```shell + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns/my-topic \ +--limitTime 3600 \ +--policy producer_request_hold \ +--type message_age + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + + + + +```java + +long sizeLimit = 2147483648L; +BacklogQuota.RetentionPolicy policy = BacklogQuota.RetentionPolicy.producer_request_hold; +BacklogQuota quota = new BacklogQuota(sizeLimit, policy); +admin.namespaces().setBacklogQuota(namespace, quota); + +``` + + + + +```` + +### Get backlog threshold and backlog retention policy + +You can see which size threshold and backlog retention policy has been applied to a namespace. + +````mdx-code-block + + + +Use the [`get-backlog-quotas`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-backlog-quotas) subcommand and specify a namespace. 
Here's an example: + +```shell + +$ pulsar-admin namespaces get-backlog-quotas my-tenant/my-ns +{ + "destination_storage": { + "limit" : 2147483648, + "policy" : "producer_request_hold" + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + + + + +```java + +Map quotas = + admin.namespaces().getBacklogQuotas(namespace); + +``` + + + + +```` + +### Remove backlog quotas + +````mdx-code-block + + + +Use the [`remove-backlog-quota`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-backlog-quota) subcommand and specify a namespace, use `t`/`--type` to specify backlog type to remove(default is destination_storage). Here's an example: + +```shell + +$ pulsar-admin namespaces remove-backlog-quota my-tenant/my-ns + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/removeBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeBacklogQuota(namespace); + +``` + + + + +```` + +### Clear backlog + +#### pulsar-admin + +Use the [`clear-backlog`](reference-pulsar-admin.md#pulsar-admin-namespaces-clear-backlog) subcommand. + +##### Example + +```shell + +$ pulsar-admin namespaces clear-backlog my-tenant/my-ns + +``` + +By default, you will be prompted to ensure that you really want to clear the backlog for the namespace. You can override the prompt using the `-f`/`--force` flag. + +## Time to live (TTL) + +By default, Pulsar stores all unacknowledged messages forever. This can lead to heavy disk space usage in cases where a lot of messages are going unacknowledged. If disk space is a concern, you can set a time to live (TTL) that determines how long unacknowledged messages will be retained. + +The TTL parameter is like a stopwatch attached to each message that defines the amount of time a message is allowed to stay in the unacknowledged state. 
When the TTL expires, Pulsar automatically moves the message to the acknowledged state (and thus makes it ready for deletion). + +The diagram below illustrates the concept of TTL. +![](/assets/ttl.svg) + +### Set the TTL for a namespace + +````mdx-code-block + + + +Use the [`set-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-set-message-ttl) subcommand and specify a namespace and a TTL (in seconds) using the `-ttl`/`--messageTTL` flag. + +##### Example + +```shell + +$ pulsar-admin namespaces set-message-ttl my-tenant/my-ns \ + --messageTTL 120 # TTL of 2 minutes + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/setNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceMessageTTL(namespace, ttlInSeconds); + +``` + + + + +```` + +### Get the TTL configuration for a namespace + +````mdx-code-block + + + +Use the [`get-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-message-ttl) subcommand and specify a namespace. + +##### Example + +```shell + +$ pulsar-admin namespaces get-message-ttl my-tenant/my-ns +60 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/getNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceMessageTTL(namespace) + +``` + + + + +```` + +### Remove the TTL configuration for a namespace + +````mdx-code-block + + + +Use the [`remove-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-message-ttl) subcommand and specify a namespace. 
+ +##### Example + +```shell + +$ pulsar-admin namespaces remove-message-ttl my-tenant/my-ns + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/removeNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeNamespaceMessageTTL(namespace) + +``` + + + + +```` + +## Delete messages from namespaces + +When it comes to the physical storage size, message expiry and retention are just like two sides of the same coin. +* The backlog quota and TTL parameters prevent disk size from growing indefinitely, as Pulsar’s default behaviour is to persist unacknowledged messages. +* The retention policy allocates storage space to accommodate the messages that are supposed to be deleted by Pulsar by default. + +As a conclusion, the size of your physical storage should accommodate the sum of the backlog quota and the retention size. + +The message deletion rate (releasing rate of disk space) can be determined by multiple factors. + +- **Segment rollover period**: basically, the segment rollover period is how often a new segment is created. Once a new segment is created, the old segment will be deleted. By default, this happens either when you have written 50,000 entries (messages) or have waited 240 minutes. You can tune this in your broker. + +- **Entry log rollover period**: multiple ledgers in BookKeeper are interleaved into an [entry log](https://bookkeeper.apache.org/docs/4.11.1/getting-started/concepts/#entry-logs). In order for a ledger that has been deleted, the entry log must all be rolled over. +The entry log rollover period is configurable, but is purely based on the entry log size. For details, see [here](https://bookkeeper.apache.org/docs/4.11.1/reference/config/#entry-log-settings). Once the entry log is rolled over, the entry log can be garbage collected. 
+ +- **Garbage collection interval**: because entry logs have interleaved ledgers, to free up space, the entry logs need to be rewritten. The garbage collection interval is how often BookKeeper performs garbage collection, which is related to minor compaction and major compaction of entry logs. For details, see [here](https://bookkeeper.apache.org/docs/4.11.1/reference/config/#entry-log-compaction-settings). + +The diagram below illustrates one of the cases in which the consumed storage size is larger than the given limits for backlog and retention. Messages over the retention limit are kept because other messages in the same segment are still within the retention period. +![](/assets/retention-storage-size.svg) + +If you do not have any retention period and you never have much of a backlog, the upper limit for retained messages, which are acknowledged, equals the Pulsar segment rollover period + entry log rollover period + (garbage collection interval * garbage collection ratios). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/cookbooks-tiered-storage.md b/site2/website/versioned_docs/version-2.10.x/cookbooks-tiered-storage.md new file mode 100644 index 0000000000000..b1deb135209a9 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/cookbooks-tiered-storage.md @@ -0,0 +1,344 @@ +--- +id: cookbooks-tiered-storage +title: Tiered Storage +sidebar_label: "Tiered Storage" +original_id: cookbooks-tiered-storage +--- + +Pulsar's **Tiered Storage** feature allows older backlog data to be offloaded to long term storage, thereby freeing up space in BookKeeper and reducing storage costs. This cookbook walks you through using tiered storage in your Pulsar cluster. + +* Tiered storage uses [Apache jclouds](https://jclouds.apache.org) to support [Amazon S3](https://aws.amazon.com/s3/) and [Google Cloud Storage](https://cloud.google.com/storage/)(GCS for short) +for long term storage. 
With Jclouds, it is easy to add support for more [cloud storage providers](https://jclouds.apache.org/reference/providers/#blobstore-providers) in the future. + +* Tiered storage uses [Apache Hadoop](http://hadoop.apache.org/) to support filesystem for long term storage. +With Hadoop, it is easy to add support for more filesystems in the future. + +## When should I use Tiered Storage? + +Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm you can rerun it against your full user history. + +## The offloading mechanism + +A topic in Pulsar is backed by a log, known as a managed ledger. This log is composed of an ordered list of segments. Pulsar only ever writes to the final segment of the log. All previous segments are sealed. The data within the segment is immutable. This is known as a segment oriented architecture. + +![Tiered storage](/assets/pulsar-tiered-storage.png "Tiered Storage") + +The Tiered Storage offloading mechanism takes advantage of this segment oriented architecture. When offloading is requested, the segments of the log are copied, one-by-one, to tiered storage. All segments of the log, apart from the segment currently being written to, can be offloaded. + +On the broker, the administrator must configure the bucket and credentials for the cloud storage service. +The configured bucket must exist before attempting to offload. If it does not exist, the offload operation will fail. + +Pulsar uses multi-part objects to upload the segment data. It is possible that a broker could crash while uploading the data. +We recommend you add a life cycle rule to your bucket to expire incomplete multi-part uploads after a day or two to avoid +getting charged for incomplete uploads. 
+ +When ledgers are offloaded to long term storage, you can still query data in the offloaded ledgers with Pulsar SQL. + +## Configuring the offload driver + +Offloading is configured in ```broker.conf```. + +At a minimum, the administrator must configure the driver, the bucket and the authenticating credentials. +There is also some other knobs to configure, like the bucket region, the max block size in backed storage, etc. + +Currently we support driver of types: + +- `aws-s3`: [Simple Cloud Storage Service](https://aws.amazon.com/s3/) +- `google-cloud-storage`: [Google Cloud Storage](https://cloud.google.com/storage/) +- `filesystem`: [Filesystem Storage](http://hadoop.apache.org/) + +> Driver names are case-insensitive for driver's name. There is a third driver type, `s3`, which is identical to `aws-s3`, +> though it requires that you specify an endpoint url using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if +> using a S3 compatible data store, other than AWS. + +```conf + +managedLedgerOffloadDriver=aws-s3 + +``` + +### "aws-s3" Driver configuration + +#### Bucket and Region + +Buckets are the basic containers that hold your data. +Everything that you store in Cloud Storage must be contained in a bucket. +You can use buckets to organize your data and control access to your data, +but unlike directories and folders, you cannot nest buckets. + +```conf + +s3ManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +Bucket Region is the region where bucket located. Bucket Region is not a required +but a recommended configuration. If it is not configured, It will use the default region. + +With AWS S3, the default region is `US East (N. Virginia)`. Page [AWS Regions and Endpoints](https://docs.aws.amazon.com/general/latest/gr/rande.html) contains more information. + +```conf + +s3ManagedLedgerOffloadRegion=eu-west-3 + +``` + +#### Authentication with AWS + +To be able to access AWS S3, you need to authenticate with AWS S3. 
+Pulsar does not provide any direct means of configuring authentication for AWS S3, +but relies on the mechanisms supported by the [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). + +Once you have created a set of credentials in the AWS IAM console, they can be configured in a number of ways. + +1. Using ec2 instance metadata credentials + +If you are on AWS instance with an instance profile that provides credentials, Pulsar will use these credentials +if no other mechanism is provided + +2. Set the environment variables **AWS_ACCESS_KEY_ID** and **AWS_SECRET_ACCESS_KEY** in ```conf/pulsar_env.sh```. + +```bash + +export AWS_ACCESS_KEY_ID=ABC123456789 +export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +> \"export\" is important so that the variables are made available in the environment of spawned processes. + + +3. Add the Java system properties *aws.accessKeyId* and *aws.secretKey* to **PULSAR_EXTRA_OPTS** in `conf/pulsar_env.sh`. + +```bash + +PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacity.default=1000 -Dio.netty.recycler.linkCapacity=1024" + +``` + +4. Set the access credentials in ```~/.aws/credentials```. + +```conf + +[default] +aws_access_key_id=ABC123456789 +aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +5. Assuming an IAM role + +If you want to assume an IAM role, this can be done via specifying the following: + +```conf + +s3ManagedLedgerOffloadRole= +s3ManagedLedgerOffloadRoleSessionName=pulsar-s3-offload + +``` + +This will use the `DefaultAWSCredentialsProviderChain` for assuming this role. + +> The broker must be rebooted for credentials specified in pulsar_env to take effect. 
+ +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to AWS S3. + +- ```s3ManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of + a "part" sent during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```s3ManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for + each individual read when reading back data from AWS S3. Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. + +### "google-cloud-storage" Driver configuration + +Buckets are the basic containers that hold your data. Everything that you store in +Cloud Storage must be contained in a bucket. You can use buckets to organize your data and +control access to your data, but unlike directories and folders, you cannot nest buckets. + +```conf + +gcsManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +Bucket Region is the region where bucket located. Bucket Region is not a required but +a recommended configuration. If it is not configured, It will use the default region. + +Regarding GCS, buckets are default created in the `us multi-regional location`, +page [Bucket Locations](https://cloud.google.com/storage/docs/bucket-locations) contains more information. + +```conf + +gcsManagedLedgerOffloadRegion=europe-west3 + +``` + +#### Authentication with GCS + +The administrator needs to configure `gcsManagedLedgerOffloadServiceAccountKeyFile` in `broker.conf` +for the broker to be able to access the GCS service. `gcsManagedLedgerOffloadServiceAccountKeyFile` is +a Json file, containing the GCS credentials of a service account. +[Service Accounts section of this page](https://support.google.com/googleapi/answer/6158849) contains +more information of how to create this key file for authentication. More information about google cloud IAM +is available [here](https://cloud.google.com/storage/docs/access-control/iam). 
+ +To generate service account credentials or view the public credentials that you've already generated, follow the following steps: + +1. Open the [Service accounts page](https://console.developers.google.com/iam-admin/serviceaccounts). +2. Select a project or create a new one. +3. Click **Create service account**. +4. In the **Create service account** window, type a name for the service account, and select **Furnish a new private key**. If you want to [grant G Suite domain-wide authority](https://developers.google.com/identity/protocols/OAuth2ServiceAccount#delegatingauthority) to the service account, also select **Enable G Suite Domain-wide Delegation**. +5. Click **Create**. + +> Note: Make sure that the service account you create has permission to operate GCS. To do so, assign the **Storage Admin** permission to your service account as described [here](https://cloud.google.com/storage/docs/access-control/iam). + +```conf + +gcsManagedLedgerOffloadServiceAccountKeyFile="/Users/hello/Downloads/project-804d5e6a6f33.json" + +``` + +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to GCS. + +- ```gcsManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of a "part" sent + during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```gcsManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for each individual + read when reading back data from GCS. Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. + +### "filesystem" Driver configuration + + +#### Configure connection address + +You can configure the connection address in the `broker.conf` file. + +```conf + +fileSystemURI="hdfs://127.0.0.1:9000" + +``` + +#### Configure Hadoop profile path + +The configuration file is stored in the Hadoop profile path. It contains various settings, such as base path, authentication, and so on. 
+ +```conf + +fileSystemProfilePath="../conf/filesystem_offload_core_site.xml" + +``` + +The model for storing topic data uses `org.apache.hadoop.io.MapFile`. You can use all of the configurations in `org.apache.hadoop.io.MapFile` for Hadoop. + +**Example** + +```conf + + + fs.defaultFS + + + + + hadoop.tmp.dir + pulsar + + + + io.file.buffer.size + 4096 + + + + io.seqfile.compress.blocksize + 1000000 + + + + io.seqfile.compression.type + BLOCK + + + + io.map.index.interval + 128 + + +``` + +For more information about the configurations in `org.apache.hadoop.io.MapFile`, see [Filesystem Storage](http://hadoop.apache.org/). +## Configuring offload to run automatically + +Namespace policies can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that the topic has stored on the pulsar cluster. Once the topic reaches the threshold, an offload operation will be triggered. Setting a negative value to the threshold will disable automatic offloading. Setting the threshold to 0 will cause the broker to offload data as soon as it possibly can. + +```bash + +$ bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +> Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offload will not run until the current segment is full. + +## Configuring read priority for offloaded messages + +By default, once messages are offloaded to long term storage, brokers read them from long term storage, but the messages still exist in BookKeeper for a period that depends on the administrator's configuration. For +messages that exist in both BookKeeper and long term storage, if you prefer to read them from BookKeeper, you can use the following command to change this configuration. 
+ +```bash + +# default value for -orp is tiered-storage-first +$ bin/pulsar-admin namespaces set-offload-policies my-tenant/my-namespace -orp bookkeeper-first +$ bin/pulsar-admin topics set-offload-policies my-tenant/my-namespace/topic1 -orp bookkeeper-first + +``` + +## Triggering offload manually + +Offloading can be manually triggered through a REST endpoint on the Pulsar broker. We provide a CLI which will call this REST endpoint for you. + +When triggering offload, you must specify the maximum size, in bytes, of backlog which will be retained locally on the bookkeeper. The offload mechanism will offload segments from the start of the topic backlog until this condition is met. + +```bash + +$ bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 +Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + +``` + +The command that triggers an offload will not wait until the offload operation has completed. To check the status of the offload, use offload-status. + +```bash + +$ bin/pulsar-admin topics offload-status my-tenant/my-namespace/topic1 +Offload is currently running + +``` + +To wait for offload to complete, add the -w flag. + +```bash + +$ bin/pulsar-admin topics offload-status -w my-tenant/my-namespace/topic1 +Offload was a success + +``` + +If there is an error offloading, the error will be propagated to the offload-status command. + +```bash + +$ bin/pulsar-admin topics offload-status persistent://public/default/topic1 +Error in offload +null + +Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/deploy-aws.md b/site2/website/versioned_docs/version-2.10.x/deploy-aws.md new file mode 100644 index 0000000000000..5497aadd7865f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/deploy-aws.md @@ -0,0 +1,271 @@ +--- +id: deploy-aws +title: Deploying a Pulsar cluster on AWS using Terraform and Ansible +sidebar_label: "Amazon Web Services" +original_id: deploy-aws +--- + +> For instructions on deploying a single Pulsar cluster manually rather than using Terraform and Ansible, see [Deploying a Pulsar cluster on bare metal](deploy-bare-metal.md). For instructions on manually deploying a multi-cluster Pulsar instance, see [Deploying a Pulsar instance on bare metal](deploy-bare-metal-multi-cluster.md). + +One of the easiest ways to get a Pulsar [cluster](reference-terminology.md#cluster) running on [Amazon Web Services](https://aws.amazon.com/) (AWS) is to use the [Terraform](https://terraform.io) infrastructure provisioning tool and the [Ansible](https://www.ansible.com) server automation tool. Terraform can create the resources necessary for running the Pulsar cluster---[EC2](https://aws.amazon.com/ec2/) instances, networking and security infrastructure, etc.---While Ansible can install and run Pulsar on the provisioned resources. 
+ +## Requirements and setup + +In order to install a Pulsar cluster on AWS using Terraform and Ansible, you need to prepare the following things: + +* An [AWS account](https://aws.amazon.com/account/) and the [`aws`](https://aws.amazon.com/cli/) command-line tool +* Python and [pip](https://pip.pypa.io/en/stable/) +* The [`terraform-inventory`](https://github.com/adammck/terraform-inventory) tool, which enables Ansible to use Terraform artifacts + +You also need to make sure that you are currently logged into your AWS account via the `aws` tool: + +```bash + +$ aws configure + +``` + +## Installation + +You can install Ansible on Linux or macOS using pip. + +```bash + +$ pip install ansible + +``` + +You can install Terraform using the instructions [here](https://learn.hashicorp.com/tutorials/terraform/install-cli). + +You also need to have the Terraform and Ansible configuration for Pulsar locally on your machine. You can find them in the [GitHub repository](https://github.com/apache/pulsar) of Pulsar, which you can fetch using Git commands: + +```bash + +$ git clone https://github.com/apache/pulsar +$ cd pulsar/deployment/terraform-ansible/aws + +``` + +## SSH setup + +> If you already have an SSH key and want to use it, you can skip the step of generating an SSH key and update `private_key_file` setting +> in `ansible.cfg` file and `public_key_path` setting in `terraform.tfvars` file. +> +> For example, if you already have a private SSH key in `~/.ssh/pulsar_aws` and a public key in `~/.ssh/pulsar_aws.pub`, +> follow the steps below: +> +> 1. update `ansible.cfg` with following values: +> + +> ```shell +> +> private_key_file=~/.ssh/pulsar_aws +> +> +> ``` + +> +> 2. update `terraform.tfvars` with following values: +> + +> ```shell +> +> public_key_path=~/.ssh/pulsar_aws.pub +> +> +> ``` + + +In order to create the necessary AWS resources using Terraform, you need to create an SSH key. 
Enter the following commands to create a private SSH key in `~/.ssh/id_rsa` and a public key in `~/.ssh/id_rsa.pub`: + +```bash + +$ ssh-keygen -t rsa + +``` + +Do *not* enter a passphrase (hit **Enter** instead when the prompt comes out). Enter the following command to verify that a key has been created: + +```bash + +$ ls ~/.ssh +id_rsa id_rsa.pub + +``` + +## Create AWS resources using Terraform + +To start building AWS resources with Terraform, you need to install all Terraform dependencies. Enter the following command: + +```bash + +$ terraform init +# This will create a .terraform folder + +``` + +After that, you can apply the default Terraform configuration by entering this command: + +```bash + +$ terraform apply + +``` + +Then you see this prompt below: + +```bash + +Do you want to perform these actions? + Terraform will perform the actions described above. + Only 'yes' will be accepted to approve. + + Enter a value: + +``` + +Type `yes` and hit **Enter**. Applying the configuration could take several minutes. When the configuration applying finishes, you can see `Apply complete!` along with some other information, including the number of resources created. + +### Apply a non-default configuration + +You can apply a non-default Terraform configuration by changing the values in the `terraform.tfvars` file. The following variables are available: + +Variable name | Description | Default +:-------------|:------------|:------- +`public_key_path` | The path of the public key that you have generated. 
| `~/.ssh/id_rsa.pub` +`region` | The AWS region in which the Pulsar cluster runs | `us-west-2` +`availability_zone` | The AWS availability zone in which the Pulsar cluster runs | `us-west-2a` +`aws_ami` | The [Amazon Machine Image](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) (AMI) that the cluster uses | `ami-9fa343e7` +`num_zookeeper_nodes` | The number of [ZooKeeper](https://zookeeper.apache.org) nodes in the ZooKeeper cluster | 3 +`num_bookie_nodes` | The number of bookies that runs in the cluster | 3 +`num_broker_nodes` | The number of Pulsar brokers that runs in the cluster | 2 +`num_proxy_nodes` | The number of Pulsar proxies that runs in the cluster | 1 +`base_cidr_block` | The root [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) that network assets uses for the cluster | `10.0.0.0/16` +`instance_types` | The EC2 instance types to be used. This variable is a map with two keys: `zookeeper` for the ZooKeeper instances, `bookie` for the BookKeeper bookies and `broker` and `proxy` for Pulsar brokers and bookies | `t2.small` (ZooKeeper), `i3.xlarge` (BookKeeper) and `c5.2xlarge` (Brokers/Proxies) + +### What is installed + +When you run the Ansible playbook, the following AWS resources are used: + +* 9 total [Elastic Compute Cloud](https://aws.amazon.com/ec2) (EC2) instances running the [ami-9fa343e7](https://access.redhat.com/articles/3135091) Amazon Machine Image (AMI), which runs [Red Hat Enterprise Linux (RHEL) 7.4](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html-single/7.4_release_notes/index). 
By default, that includes: + * 3 small VMs for ZooKeeper ([t3.small](https://www.ec2instances.info/?selected=t3.small) instances) + * 3 larger VMs for BookKeeper [bookies](reference-terminology.md#bookie) ([i3.xlarge](https://www.ec2instances.info/?selected=i3.xlarge) instances) + * 2 larger VMs for Pulsar [brokers](reference-terminology.md#broker) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instances) + * 1 larger VMs for Pulsar [proxy](reference-terminology.md#proxy) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instances) +* An EC2 [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) +* A [virtual private cloud](https://aws.amazon.com/vpc/) (VPC) for security +* An [API Gateway](https://aws.amazon.com/api-gateway/) for connections from the outside world +* A [route table](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Route_Tables.html) for the Pulsar cluster's VPC +* A [subnet](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Subnets.html) for the VPC + +All EC2 instances for the cluster run in the [us-west-2](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html) region. + +### Fetch your Pulsar connection URL + +When you apply the Terraform configuration by entering the command `terraform apply`, Terraform outputs a value for the `pulsar_service_url`. 
The value should look something like this: + +``` + +pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650 + +``` + +You can fetch that value at any time by entering the command `terraform output pulsar_service_url` or parsing the `terraform.tfstate` file (which is JSON, even though the filename does not reflect that): + +```bash + +$ cat terraform.tfstate | jq .modules[0].outputs.pulsar_service_url.value + +``` + +### Destroy your cluster + +At any point, you can destroy all AWS resources associated with your cluster using Terraform's `destroy` command: + +```bash + +$ terraform destroy + +``` + +## Setup Disks + +Before you run the Pulsar playbook, you need to mount the disks to the correct directories on those bookie nodes. Since different types of machines have different disk layouts, you need to update the task defined in the `setup-disk.yaml` file after changing the `instance_types` in your Terraform config. + +To set up disks on bookie nodes, enter this command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + setup-disk.yaml + +``` + +After that, the disks are mounted under `/mnt/journal` as journal disk, and `/mnt/storage` as ledger disk. +Remember to enter this command only once. If you attempt to enter this command again after you have run Pulsar playbook, your disks might potentially be erased again, causing the bookies to fail to start up. + +## Run the Pulsar playbook + +Once you have created the necessary AWS resources using Terraform, you can install and run Pulsar on the Terraform-created EC2 instances using Ansible. + +(Optional) If you want to use any [built-in IO connectors](io-connectors.md), edit the `Download Pulsar IO packages` task in the `deploy-pulsar.yaml` file and uncomment the connectors you want to use. 
+ +To run the playbook, enter this command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + ../deploy-pulsar.yaml + +``` + +If you have created a private SSH key at a location different from `~/.ssh/id_rsa`, you can specify the different location using the `--private-key` flag in the following command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + --private-key="~/.ssh/some-non-default-key" \ + ../deploy-pulsar.yaml + +``` + +## Access the cluster + +You can now access your running Pulsar using the unique Pulsar connection URL for your cluster, which you can obtain following the instructions [above](#fetch-your-pulsar-connection-url). + +For a quick demonstration of accessing the cluster, we can use the Python client for Pulsar and the Python shell. First, install the Pulsar Python module using pip: + +```bash + +$ pip install pulsar-client + +``` + +Now, open up the Python shell using the `python` command: + +```bash + +$ python + +``` + +Once you are in the shell, enter the following command: + +```python + +>>> import pulsar +>>> client = pulsar.Client('pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650') +# Make sure to use your connection URL +>>> producer = client.create_producer('persistent://public/default/test-topic') +>>> producer.send('Hello world') +>>> client.close() + +``` + +If all of these commands are successful, Pulsar clients can now use your cluster! 
diff --git a/site2/website/versioned_docs/version-2.10.x/deploy-bare-metal-multi-cluster.md b/site2/website/versioned_docs/version-2.10.x/deploy-bare-metal-multi-cluster.md new file mode 100644 index 0000000000000..9ac1a85580ffa --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/deploy-bare-metal-multi-cluster.md @@ -0,0 +1,452 @@ +--- +id: deploy-bare-metal-multi-cluster +title: Deploying a multi-cluster on bare metal +sidebar_label: "Bare metal multi-cluster" +original_id: deploy-bare-metal-multi-cluster +--- + +:::tip + +1. You can use single-cluster Pulsar installation in most use cases, such as experimenting with Pulsar or using Pulsar in a startup or in a single team. If you need to run a multi-cluster Pulsar instance, see the [guide](deploy-bare-metal-multi-cluster.md). +2. If you want to use all built-in [Pulsar IO](io-overview.md) connectors, you need to download `apache-pulsar-io-connectors`package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you have run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +3. If you want to use [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download `apache-pulsar-offloaders`package and install `apache-pulsar-offloaders` under `offloaders` directory in the Pulsar directory on every broker node. For more details of how to configure this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +::: + +A Pulsar instance consists of multiple Pulsar clusters working in unison. You can distribute clusters across data centers or geographical regions and replicate the clusters amongst themselves using [geo-replication](administration-geo.md). Deploying a multi-cluster Pulsar instance consists of the following steps: + +1. 
Deploying two separate ZooKeeper quorums: a local quorum for each cluster in the instance and a configuration store quorum for instance-wide tasks +2. Initializing cluster metadata for each cluster +3. Deploying a BookKeeper cluster of bookies in each Pulsar cluster +4. Deploying brokers in each Pulsar cluster + + +> #### Run Pulsar locally or on Kubernetes? +> This guide shows you how to deploy Pulsar in production in a non-Kubernetes environment. If you want to run a standalone Pulsar cluster on a single machine for development purposes, see the [Setting up a local cluster](getting-started-standalone.md) guide. If you want to run Pulsar on [Kubernetes](https://kubernetes.io), see the [Pulsar on Kubernetes](deploy-kubernetes.md) guide, which includes sections on running Pulsar on Kubernetes, on Google Kubernetes Engine and on Amazon Web Services. + +## System requirement + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. You need to install 64-bit JRE/JDK 8 or later versions, JRE/JDK 11 is recommended. + +:::note + +Broker is only supported on 64-bit JVM. 
+ +::: + +## Install Pulsar + +To get started running Pulsar, download a binary tarball release in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar @pulsar:version@ binary release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget 'https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=pulsar/pulsar-@pulsar:version@/apache-pulsar-@pulsar:version@-bin.tar.gz' -O apache-pulsar-@pulsar:version@-bin.tar.gz + + ``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | [Command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](/tools/pulsar-admin/) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`examples` | A Java JAR file containing example [Pulsar Functions](functions-overview.md) +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`licenses` | License files, in `.txt` form, for various components of the Pulsar codebase + +The following directories are created once you begin running Pulsar: + +Directory | Contains +:---------|:-------- +`data` | The data storage directory that ZooKeeper and BookKeeper use +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md) +`logs` | Logs that the installation creates + + +## Deploy ZooKeeper + +Each Pulsar instance relies on two separate 
ZooKeeper quorums. + +* Local ZooKeeper operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs a dedicated ZooKeeper cluster. +* Configuration Store operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +You can use an independent cluster of machines or the same machines used by local ZooKeeper to provide the configuration store quorum. + + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +You need to stand up one local ZooKeeper cluster per Pulsar cluster for deploying a Pulsar instance. + +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +On each host, you need to specify the ID of the node in the `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +:::tip + +See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. 
+ +::: + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this: + +```shell + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com` the command looks like `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start zookeeper + +``` + +### Deploy the configuration store + +The ZooKeeper cluster configured and started up in the section above is a local ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. + +If you deploy a single-cluster instance, you do not need a separate cluster for the configuration store. If, however, you deploy a multi-cluster instance, you should stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but running on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorum uses to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store), using the same method as for [local ZooKeeper](#deploy-local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper).
The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 + +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions, and other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can share the same hosts used for the local ZooKeeper quorum. + +For example, assume a Pulsar instance with the following clusters `us-west`, `us-east`, `us-central`, `eu-central`, `ap-south`. Also assume that each cluster has its own local ZK servers, named as follows: + +``` + +zk[1-3].${CLUSTER}.example.com + +``` + +In this scenario, you want to pick the quorum participants from a few clusters and let all the others be ZK observers. For example, to form a 7-server quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This method guarantees that writes to the configuration store are possible even if one of these regions is unreachable.
+ +The ZK configuration in all the servers looks like: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer + +``` + +Additionally, ZK observers need to have the following parameters: + +```properties + +peerType=observer + +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon): + +```shell + +$ bin/pulsar-daemon start configuration-store + +``` + +## Cluster metadata initialization + +Once you set up the cluster-specific ZooKeeper and configuration store quorums for your instance, you need to write some metadata to ZooKeeper for each cluster in your instance. **You only need to write this metadata once**. + +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool.
The following is an example: + +```shell + +$ bin/pulsar initialize-cluster-metadata \ + --cluster us-west \ + --metadata-store zk:zk1.us-west.example.com:2181,zk2.us-west.example.com:2181/my-chroot-path \ + --configuration-metadata-store zk:zk1.us-west.example.com:2181,zk2.us-west.example.com:2181/my-chroot-path \ + --web-service-url http://pulsar.us-west.example.com:8080/ \ + --web-service-url-tls https://pulsar.us-west.example.com:8443/ \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/ + +``` + +As you can see from the example above, you need to specify the following: + +* The name of the cluster +* The local metadata store connection string for the cluster +* The configuration store connection string for the entire instance +* The web service URL for the cluster +* A broker service URL enabling interaction with the [brokers](reference-terminology.md#broker) in the cluster + +If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. + +Make sure to run `initialize-cluster-metadata` for each cluster in your instance. + +## Deploy BookKeeper + +BookKeeper provides [persistent message storage](concepts-architecture-overview.md#persistent-storage) for Pulsar. + +Each Pulsar broker needs its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster. + +### Configure bookies + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important aspect of configuring each bookie is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for the local ZooKeeper of Pulsar cluster. 
+ +### Start bookies + +You can start a bookie in two ways: in the foreground or as a background daemon. + +To start a bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +You can verify that the bookie works properly using the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell): + +```bash + +$ bin/bookkeeper shell bookiesanity + +``` + +This command creates a new ledger on the local bookie, writes a few entries, reads them back and finally deletes the ledger. + +After you have started all bookies, you can use the `simpletest` command for [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to verify that all bookies in the cluster are running. + +```bash + +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> + +``` + +Bookie hosts are responsible for storing message data on disk. In order for bookies to provide optimal performance, having a suitable hardware configuration is essential for the bookies. The following are key dimensions for bookie hardware capacity. + +* Disk I/O capacity read/write +* Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker. To ensure low write latency, BookKeeper is +designed to use multiple devices: + +* A **journal** to ensure durability. For sequential writes, having fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts is critical. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms.
+* A **ledger storage device** is where data is stored until all consumers acknowledge the message. Writes happen in the background, so write I/O is not a big concern. Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + + + +## Deploy brokers + +Once you set up ZooKeeper, initialize cluster metadata, and spin up BookKeeper bookies, you can deploy brokers. + +### Broker configuration + +You can configure brokers using the [`conf/broker.conf`](reference-configuration.md#broker) configuration file. + +The most important element of broker configuration is ensuring that each broker is aware of its local ZooKeeper quorum as well as the configuration store quorum. Make sure that you set the [`metadataStoreUrl`](reference-configuration.md#broker) parameter to reflect the local quorum and the [`configurationMetadataStoreUrl`](reference-configuration.md#broker) parameter to reflect the configuration store quorum (although you need to specify only those ZooKeeper servers located in the same cluster). + +You also need to specify the name of the [cluster](reference-terminology.md#cluster) to which the broker belongs using the [`clusterName`](reference-configuration.md#broker-clusterName) parameter. In addition, you need to match the broker and web service ports provided when you initialize the metadata (especially when you use a different port from default) of the cluster. + +The following is an example configuration: + +```properties + +# Local ZooKeeper servers +metadataStoreUrl=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +# Configuration store quorum connection string. 
+configurationMetadataStoreUrl=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184 + +clusterName=us-west + +# Broker data port +brokerServicePort=6650 + +# Broker data port for TLS +brokerServicePortTls=6651 + +# Port to use to serve HTTP requests +webServicePort=8080 + +# Port to use to serve HTTPS requests +webServicePortTls=8443 + +``` + +### Broker hardware + +Pulsar brokers do not require any special hardware since they do not use the local disk. It is recommended to choose fast CPUs and a 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) so that the software can take full advantage of them. + +### Start the broker service + +You can start a broker in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start broker + +``` + +You can also start brokers in the foreground by using [`pulsar broker`](reference-cli-tools.md#broker): + +```shell + +$ bin/pulsar broker + +``` + +## Service discovery + +[Clients](getting-started-clients.md) connecting to Pulsar brokers need to communicate with an entire Pulsar instance using a single URL. + +You can use your own service discovery system. If you use your own system, you need to satisfy just one requirement: when a client performs an HTTP request to an [endpoint](reference-configuration.md) for a Pulsar cluster, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to some active brokers in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means. + +> **Service discovery already provided by many scheduling systems** +> Many large-scale deployment systems, such as [Kubernetes](deploy-kubernetes.md), have service discovery systems built in. If you run Pulsar on such a system, you may not need to provide your own service discovery mechanism.
+ +## Admin client and verification + +At this point your Pulsar instance should be ready to use. You can now configure client machines that can serve as [administrative clients](admin-api-overview.md) for each cluster. You can use the [`conf/client.conf`](reference-configuration.md#client) configuration file to configure admin clients. + +The most important thing is that you point the [`serviceUrl`](reference-configuration.md#client-serviceUrl) parameter to the correct service URL for the cluster: + +```properties + +serviceUrl=http://pulsar.us-west.example.com:8080/ + +``` + +## Provision new tenants + +Pulsar is built as a fundamentally multi-tenant system. + + +If a new tenant wants to use the system, you need to create a new one. You can create a new tenant by using the [`pulsar-admin`](reference-pulsar-admin.md#tenants) CLI tool: + +```shell + +$ bin/pulsar-admin tenants create test-tenant \ + --allowed-clusters us-west \ + --admin-roles test-admin-role + +``` + +In this command, users who identify with `test-admin-role` role can administer the configuration for the `test-tenant` tenant. The `test-tenant` tenant can only use the `us-west` cluster. From now on, this tenant can manage its resources. + +Once you create a tenant, you need to create [namespaces](reference-terminology.md#namespace) for topics within that tenant. + + +The first step is to create a namespace. A namespace is an administrative unit that can contain many topics. A common practice is to create a namespace for each different use case from a single tenant. + +```shell + +$ bin/pulsar-admin namespaces create test-tenant/ns1 + +``` + +##### Test producer and consumer + + +Everything is now ready to send and receive messages. The quickest way to test the system is through the [`pulsar-perf`](reference-cli-tools.md#pulsar-perf) client tool. + + +You can use a topic in the namespace that you have just created. 
Topics are automatically created the first time when a producer or a consumer tries to use them. + +The topic name in this case could be: + +```http + +persistent://test-tenant/ns1/my-topic + +``` + +Start a consumer that creates a subscription on the topic and waits for messages: + +```shell + +$ bin/pulsar-perf consume persistent://test-tenant/ns1/my-topic + +``` + +Start a producer that publishes messages at a fixed rate and reports stats every 10 seconds: + +```shell + +$ bin/pulsar-perf produce persistent://test-tenant/ns1/my-topic + +``` + +To report the topic stats: + +```shell + +$ bin/pulsar-admin topics stats persistent://test-tenant/ns1/my-topic + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/deploy-bare-metal.md b/site2/website/versioned_docs/version-2.10.x/deploy-bare-metal.md new file mode 100644 index 0000000000000..25dc04458613b --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/deploy-bare-metal.md @@ -0,0 +1,568 @@ +--- +id: deploy-bare-metal +title: Deploy a cluster on bare metal +sidebar_label: "Bare metal" +original_id: deploy-bare-metal +--- + +:::tip + +1. You can use single-cluster Pulsar installation in most use cases, such as experimenting with Pulsar or using Pulsar in a startup or in a single team. If you need to run a multi-cluster Pulsar instance, see the [guide](deploy-bare-metal-multi-cluster.md). +2. If you want to use all built-in [Pulsar IO](io-overview.md) connectors, you need to download `apache-pulsar-io-connectors`package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you have run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +3. 
If you want to use the [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download the `apache-pulsar-offloaders` package and install `apache-pulsar-offloaders` under the `offloaders` directory in the Pulsar directory on every broker node. For more details of how to configure this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +::: + +Deploying a Pulsar cluster consists of the following steps: + +1. Deploy a [ZooKeeper](#deploy-a-zookeeper-cluster) cluster (optional) +2. Initialize [cluster metadata](#initialize-cluster-metadata) +3. Deploy a [BookKeeper](#deploy-a-bookkeeper-cluster) cluster +4. Deploy one or more Pulsar [brokers](#deploy-pulsar-brokers) + +## Preparation + +### Requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions; JRE/JDK 11 is recommended. + +:::tip + +You can reuse existing ZooKeeper clusters. + +::: + +To run Pulsar on bare metal, the following configuration is recommended: + +* At least 6 Linux machines or VMs + * 3 for running [ZooKeeper](https://zookeeper.apache.org) + * 3 for running a Pulsar broker, and a [BookKeeper](https://bookkeeper.apache.org) bookie +* A single [DNS](https://en.wikipedia.org/wiki/Domain_Name_System) name covering all of the Pulsar broker hosts + +:::note + +* Broker is only supported on 64-bit JVM. +* If you do not have enough machines, or you want to test Pulsar in cluster mode (and expand the cluster later), you can fully deploy Pulsar on a node on which ZooKeeper, bookie and broker run. +* If you do not have a DNS server, you can use the multi-host format in the service URL instead. + +::: + +Each machine in your cluster needs to have [Java 8](https://adoptium.net/?variant=openjdk8) or [Java 11](https://adoptium.net/?variant=openjdk11) installed.
+ +The following is a diagram showing the basic setup: + +![alt-text](/assets/pulsar-basic-setup.png) + +In this diagram, connecting clients need to communicate with the Pulsar cluster using a single URL. In this case, `pulsar-cluster.acme.com` abstracts over all of the message-handling brokers. Pulsar message brokers run on machines alongside BookKeeper bookies; brokers and bookies, in turn, rely on ZooKeeper. + +### Hardware considerations + +If you deploy a Pulsar cluster, keep in mind the following basic recommendations when you do capacity planning. + +#### ZooKeeper + +For machines running ZooKeeper, it is recommended to use less powerful machines or VMs. Pulsar uses ZooKeeper only for periodic coordination-related and configuration-related tasks, not for basic operations. If you run Pulsar on [Amazon Web Services](https://aws.amazon.com/) (AWS), for example, a [t2.small](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/t2-instances.html) instance is likely to suffice. + +#### Bookies and Brokers + +For machines running a bookie and a Pulsar broker, more powerful machines are required. For an AWS deployment, for example, [i3.4xlarge](https://aws.amazon.com/blogs/aws/now-available-i3-instances-for-demanding-io-intensive-applications/) instances may be appropriate. On those machines you can use the following: + +* Fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) (for Pulsar brokers) +* Small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache (for BookKeeper bookies) + +#### Hardware recommendations + +To start a Pulsar instance, below are the minimum and the recommended hardware settings. + +A cluster consists of 3 broker nodes, 3 bookie nodes, and 3 ZooKeeper nodes.
The following recommendation is suitable for one node. + +- The minimum hardware settings (**250 Pulsar topics**) + + Component | CPU|Memory|Storage|Throughput |Rate + |---|---|---|---|---|--- + Broker|0.2|256 MB|/|Write throughput: 3 MB/s

    Read throughput: 6 MB/s

    |Write rate: 350 entries/s

    Read rate: 650 entries/s + Bookie|0.2|256 MB|Journal: 8 GB, PD-SSD

    Ledger: 16 GB, PD-STANDARD|Write throughput: 2 MB/s

    Read throughput: 2 MB/s

    |Write rate: 200 entries/s

    Read rate: 200 entries/s + ZooKeeper|0.05|256 MB|Log: 8 GB, PD-SSD

    Data: 2 GB, PD-STANDARD|/|/ + +- The recommended hardware settings (**1000 Pulsar topics**) + + Component | CPU|Memory|Storage|Throughput |Rate + |---|---|---|---|---|--- + Broker|8|8 GB|/|Write throughput: 100 MB/s

    Read throughput: 200 MB/s

    |Write rate: 10,000 entries/s

    Read rate: 20,000 entries/s + Bookie|4|8 GB|Journal: 256 GB, PD-SSD

    Ledger: 2 TB, PD-STANDARD|Write throughput: 75 MB/s

    Read throughput: 75 MB/s

    |Write rate: 7,500 entries/s

    Read rate: 7,500 entries/s + ZooKeeper|1|2 GB|Log: 64 GB, PD-SSD

    Data: 256 GB, PD-STANDARD|/|/ + +## Install the Pulsar binary package + +> You need to install the Pulsar binary package on each machine in the cluster, including machines running ZooKeeper and BookKeeper. + +To get started deploying a Pulsar cluster on bare metal, you need to download a binary tarball release in one of the following ways: + +* By clicking on the link below directly, which automatically triggers a download: + * Pulsar @pulsar:version@ binary release +* From the Pulsar [downloads page](pulsar:download_page_url) +* From the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) on GitHub +* Using [wget](https://www.gnu.org/software/wget): + +```bash + +$ wget pulsar:binary_release_url + +``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash + +$ tar xvzf apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +The extracted directory contains the following subdirectories: + +Directory | Contains +:---------|:-------- +`bin` |[command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](/tools/pulsar-admin/) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`data` | The data storage directory that ZooKeeper and BookKeeper use +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`logs` | Logs that the installation creates + +## [Install Builtin Connectors (optional)](standalone.md#install-builtin-connectors-optional) + +> Since Pulsar release `2.1.0-incubating`, Pulsar provides a separate binary distribution, containing all the `builtin` connectors. +> To enable the `builtin` connectors (optional), you can follow the instructions below. 
+ +To use `builtin` connectors, you need to download the connectors tarball release on every broker node in one of the following ways : + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar IO Connectors @pulsar:version@ release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +Once you download the .nar file, copy the file to directory `connectors` in the pulsar directory. +For example, if you download the connector file `pulsar-io-aerospike-@pulsar:version@.nar`: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +## [Install Tiered Storage Offloaders (optional)](standalone.md#install-tiered-storage-offloaders-optional) + +> Since Pulsar release `2.2.0`, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +> If you want to enable tiered storage feature, you can follow the instructions as below; otherwise you can +> skip this section for now. 
+ +To use tiered storage offloaders, you need to download the offloaders tarball release on every broker node in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +Once you download the tarball, in the Pulsar directory, untar the offloaders package and copy the offloaders as `offloaders` in the Pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you can find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more details of how to configure tiered storage feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md) + + +## Deploy a ZooKeeper cluster + +> If you already have an existing zookeeper cluster and want to use it, you can skip this section. + +[ZooKeeper](https://zookeeper.apache.org) manages a variety of essential coordination-related and configuration-related tasks for Pulsar. To deploy a Pulsar cluster, you need to deploy ZooKeeper first. A 3-node ZooKeeper cluster is the recommended configuration. Pulsar does not make heavy use of ZooKeeper, so the lightweight machines or VMs should suffice for running ZooKeeper. + +To begin, add all ZooKeeper servers to the configuration specified in [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) (in the Pulsar directory that you create [above](#install-the-pulsar-binary-package)). 
The following is an example: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +> If you only have one machine on which to deploy Pulsar, you only need to add one server entry in the configuration file. + +> If your machines are behind NAT, use 0.0.0.0 as the server entry for the local address. If a node behind NAT uses its external IP for itself in the configuration, the ZooKeeper service won't start because it tries to bind a listener to an external IP that the Linux box doesn't own. Using 0.0.0.0 starts a listener on all IPs, so that NAT network traffic can reach it. + +Example of configuration on _server.3_ + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=0.0.0.0:2888:3888 + +``` + +On each host, you need to specify the ID of the node in the `myid` file, which is in the `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + +For example, on a ZooKeeper server like `zk1.us-west.example.com`, you can set the `myid` value as follows: + +```bash + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com`, the command is `echo 2 > data/zookeeper/myid` and so on.
+ +Once you add each server to the `zookeeper.conf` configuration and have the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start zookeeper + +``` + +> If you plan to deploy Zookeeper with the Bookie on the same node, you need to start zookeeper by using different stats +> port by configuring the `metricsProvider.httpPort` in zookeeper.conf. + +## Initialize cluster metadata + +Once you deploy ZooKeeper for your cluster, you need to write some metadata to ZooKeeper. You only need to write this data **once**. + +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. This command can be run on any machine in your Pulsar cluster, so the metadata can be initialized from a ZooKeeper, broker, or bookie machine. The following is an example: + +```shell + +$ bin/pulsar initialize-cluster-metadata \ + --cluster pulsar-cluster-1 \ + --metadata-store zk:zk1.us-west.example.com:2181,zk2.us-west.example.com:2181/my-chroot-path \ + --configuration-metadata-store zk:zk1.us-west.example.com:2181,zk2.us-west.example.com:2181/my-chroot-path \ + --web-service-url http://pulsar.us-west.example.com:8080 \ + --web-service-url-tls https://pulsar.us-west.example.com:8443 \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650 \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +As you can see from the example above, you will need to specify the following: + +Flag | Description +:----|:----------- +`--cluster` | A name for the cluster +`--metadata-store` | A "local" metadata store connection string for the cluster. This connection string only needs to include *one* machine in the ZooKeeper cluster. 
+`--configuration-metadata-store` | The configuration metadata store connection string for the entire instance. As with the `--metadata-store` flag, this connection string only needs to include *one* machine in the ZooKeeper cluster. +`--web-service-url` | The web service URL for the cluster, plus a port. This URL should be a standard DNS name. The default port is 8080 (using a different port is not recommended). +`--web-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster. The default port is 8443 (using a different port is not recommended). +`--broker-service-url` | A broker service URL enabling interaction with the brokers in the cluster. This URL should use the same DNS name as the web service URL but should use the `pulsar` scheme instead. The default port is 6650 (using a different port is not recommended). +`--broker-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. The default port is 6651 (using a different port is not recommended).
+ + +> If you do not have a DNS server, you can use multi-host format in the service URL with the following settings: +> + +> ```shell +> +> --web-service-url http://host1:8080,host2:8080,host3:8080 \ +> --web-service-url-tls https://host1:8443,host2:8443,host3:8443 \ +> --broker-service-url pulsar://host1:6650,host2:6650,host3:6650 \ +> --broker-service-url-tls pulsar+ssl://host1:6651,host2:6651,host3:6651 +> +> +> ``` + +> +> If you want to use an existing BookKeeper cluster, you can add the `--existing-bk-metadata-service-uri` flag as follows: +> + +> ```shell +> +> --existing-bk-metadata-service-uri "zk+null://zk1:2181;zk2:2181/ledgers" \ +> --web-service-url http://host1:8080,host2:8080,host3:8080 \ +> --web-service-url-tls https://host1:8443,host2:8443,host3:8443 \ +> --broker-service-url pulsar://host1:6650,host2:6650,host3:6650 \ +> --broker-service-url-tls pulsar+ssl://host1:6651,host2:6651,host3:6651 +> +> +> ``` + +> You can obtain the metadata service URI of the existing BookKeeper cluster by using the `bin/bookkeeper shell whatisinstanceid` command. You must enclose the value in double quotes since the multiple metadata service URIs are separated with semicolons. + +## Deploy a BookKeeper cluster + +[BookKeeper](https://bookkeeper.apache.org) handles all persistent data storage in Pulsar. You need to deploy a cluster of BookKeeper bookies to use Pulsar. You can choose to run a **3-bookie BookKeeper cluster**. + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important step in configuring bookies for our purposes here is ensuring that [`zkServers`](reference-configuration.md#bookkeeper-zkServers) is set to the connection string for the ZooKeeper cluster. 
The following is an example: + +```properties + +zkServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +``` + +Once you appropriately modify the `zkServers` parameter, you can make any other configuration changes that you require. You can find a full listing of the available BookKeeper configuration parameters [here](reference-configuration.md#bookkeeper). However, consulting the [BookKeeper documentation](http://bookkeeper.apache.org/docs/latest/reference/config/) for a more in-depth guide might be a better choice. + +Once you apply the desired configuration in `conf/bookkeeper.conf`, you can start up a bookie on each of your BookKeeper hosts. You can start up each bookie either in the background, using [nohup](https://en.wikipedia.org/wiki/Nohup), or in the foreground. + +To start the bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +To start the bookie in the foreground: + +```bash + +$ bin/pulsar bookie + +``` + +You can verify that a bookie works properly by running the `bookiesanity` command on the [BookKeeper shell](reference-cli-tools.md#shell): + +```bash + +$ bin/bookkeeper shell bookiesanity + +``` + +This command creates an ephemeral BookKeeper ledger on the local bookie, writes a few entries, reads them back, and finally deletes the ledger. + +After you start all the bookies, you can use the `simpletest` command of the [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node to verify that all the bookies in the cluster are up and running. + +```bash + +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> + +``` + +This command creates a `num-bookies` sized ledger on the cluster, writes a few entries, and finally deletes the ledger. + + +## Deploy Pulsar brokers + +Pulsar brokers are the last thing you need to deploy in your Pulsar cluster.
Brokers handle Pulsar messages and provide the administrative interface of Pulsar. A good choice is to run **3 brokers**, one for each machine that already runs a BookKeeper bookie. + +### Configure Brokers + +The most important element of broker configuration is ensuring that each broker is aware of the ZooKeeper cluster that you have deployed. Ensure that the [`metadataStoreUrl`](reference-configuration.md#broker) and [`configurationMetadataStoreUrl`](reference-configuration.md#broker) parameters are correct. In this case, since you only have 1 cluster and no configuration store setup, the `configurationMetadataStoreUrl` parameter points to the same value as `metadataStoreUrl`. + +```properties + +metadataStoreUrl=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +configurationMetadataStoreUrl=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +``` + +You also need to specify the cluster name (matching the name that you provided when you [initialize the metadata of the cluster](#initialize-cluster-metadata)): + +```properties + +clusterName=pulsar-cluster-1 + +``` + +In addition, you need to match the broker and web service ports provided when you initialize the metadata of the cluster (especially when you use a different port than the default): + +```properties + +brokerServicePort=6650 +brokerServicePortTls=6651 +webServicePort=8080 +webServicePortTls=8443 + +``` + +> If you deploy Pulsar in a one-node cluster, you should update the replication settings in `conf/broker.conf` to `1`.
+> + +> ```properties +> +> # Number of bookies to use when creating a ledger +> managedLedgerDefaultEnsembleSize=1 +> +> # Number of copies to store for each message +> managedLedgerDefaultWriteQuorum=1 +> +> # Number of guaranteed copies (acks to wait before write is complete) +> managedLedgerDefaultAckQuorum=1 +> +> +> ``` + + +### Enable Pulsar Functions (optional) + +If you want to enable [Pulsar Functions](functions-overview.md), you can follow the instructions as below: + +1. Edit `conf/broker.conf` to enable functions worker, by setting `functionsWorkerEnabled` to `true`. + + ```conf + + functionsWorkerEnabled=true + + ``` + +2. Edit `conf/functions_worker.yml` and set `pulsarFunctionsCluster` to the cluster name that you provide when you [initialize the metadata of the cluster](#initialize-cluster-metadata). + + ```conf + + pulsarFunctionsCluster: pulsar-cluster-1 + + ``` + +If you want to learn more options about deploying the functions worker, check out [Deploy and manage functions worker](functions-worker.md). + +### Start Brokers + +You can then provide any other configuration changes that you want in the [`conf/broker.conf`](reference-configuration.md#broker) file. Once you decide on a configuration, you can start up the brokers for your Pulsar cluster. Like ZooKeeper and BookKeeper, you can start brokers either in the foreground or in the background, using nohup. + +You can start a broker in the foreground using the [`pulsar broker`](reference-cli-tools.md#pulsar-broker) command: + +```bash + +$ bin/pulsar broker + +``` + +You can start a broker in the background using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start broker + +``` + +Once you successfully start up all the brokers that you intend to use, your Pulsar cluster should be ready to go! + +## Connect to the running cluster + +Once your Pulsar cluster is up and running, you should be able to connect with it using Pulsar clients. 
One such client is the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool, which is included with the Pulsar binary package. The `pulsar-client` tool can publish messages to and consume messages from Pulsar topics and thus provide a simple way to make sure that your cluster runs properly. + +To use the `pulsar-client` tool, first modify the client configuration file in [`conf/client.conf`](reference-configuration.md#client) in your binary package. You need to change the values for `webServiceUrl` and `brokerServiceUrl`, substituting `localhost` (which is the default), with the DNS name that you assign to your broker/bookie hosts. The following is an example: + +```properties + +webServiceUrl=http://us-west.example.com:8080 +brokerServiceUrl=pulsar://us-west.example.com:6650 + +``` + +> If you do not have a DNS server, you can specify multi-host in service URL as follows: +> + +> ```properties +> +> webServiceUrl=http://host1:8080,host2:8080,host3:8080 +> brokerServiceUrl=pulsar://host1:6650,host2:6650,host3:6650 +> +> +> ``` + + +Once that is complete, you can publish a message to the Pulsar topic: + +```bash + +$ bin/pulsar-client produce \ + persistent://public/default/test \ + -n 1 \ + -m "Hello Pulsar" + +``` + +> You may need to use a different cluster name in the topic if you specify a cluster name other than `pulsar-cluster-1`. + +This command publishes a single message to the Pulsar topic. In addition, you can subscribe to the Pulsar topic in a different terminal before publishing messages as below: + +```bash + +$ bin/pulsar-client consume \ + persistent://public/default/test \ + -n 100 \ + -s "consumer-test" \ + -t "Exclusive" + +``` + +Once you successfully publish the above message to the topic, you should see it in the standard output: + +```bash + +----- got message ----- +Hello Pulsar + +``` + +## Run Functions + +> If you have [enabled](#enable-pulsar-functions-optional) Pulsar Functions, you can try out the Pulsar Functions now.
+ +Create an ExclamationFunction `exclamation`. + +```bash + +bin/pulsar-admin functions create \ + --jar examples/api-examples.jar \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --tenant public \ + --namespace default \ + --name exclamation + +``` + +Check whether the function runs as expected by [triggering](functions-deploying.md#triggering-pulsar-functions) the function. + +```bash + +bin/pulsar-admin functions trigger --name exclamation --trigger-value "hello world" + +``` + +You should see the following output: + +```shell + +hello world! + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/deploy-dcos.md b/site2/website/versioned_docs/version-2.10.x/deploy-dcos.md new file mode 100644 index 0000000000000..35a0a83d716ad --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/deploy-dcos.md @@ -0,0 +1,200 @@ +--- +id: deploy-dcos +title: Deploy Pulsar on DC/OS +sidebar_label: "DC/OS" +original_id: deploy-dcos +--- + +:::tip + +To enable all built-in [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, we recommend you use `apachepulsar/pulsar-all` image instead of `apachepulsar/pulsar` image; the former has already bundled [all built-in connectors](io-overview.md#working-with-connectors). + +::: + +[DC/OS](https://dcos.io/) (the DataCenter Operating System) is a distributed operating system for deploying and managing applications and systems on [Apache Mesos](http://mesos.apache.org/). DC/OS is an open-source tool created and maintained by [Mesosphere](https://mesosphere.com/). + +Apache Pulsar is available as a [Marathon Application Group](https://mesosphere.github.io/marathon/docs/application-groups.html), which runs multiple applications as manageable sets. + +## Prerequisites + +You need to prepare your environment before running Pulsar on DC/OS. 
+ +* DC/OS version [1.9](https://docs.mesosphere.com/1.9/) or higher +* A [DC/OS cluster](https://docs.mesosphere.com/1.9/installing/) with at least three agent nodes +* The [DC/OS CLI tool](https://docs.mesosphere.com/1.9/cli/install/) installed +* The [`PulsarGroups.json`](https://github.com/apache/pulsar/blob/master/deployment/dcos/PulsarGroups.json) configuration file from the Pulsar GitHub repo. + + ```bash + + $ curl -O https://raw.githubusercontent.com/apache/pulsar/master/deployment/dcos/PulsarGroups.json + + ``` + +Each node in the DC/OS-managed Mesos cluster must have at least: + +* 4 CPU +* 4 GB of memory +* 60 GB of total persistent disk + +Alternatively, you can change the configuration in `PulsarGroups.json` accordingly to match your resources of the DC/OS cluster. + +## Deploy Pulsar using the DC/OS command interface + +You can deploy Pulsar on DC/OS using this command: + +```bash + +$ dcos marathon group add PulsarGroups.json + +``` + +This command deploys Docker container instances in three groups, which together comprise a Pulsar cluster: + +* 3 bookies (1 [bookie](reference-terminology.md#bookie) on each agent node and 1 [bookie recovery](http://bookkeeper.apache.org/docs/latest/admin/autorecovery/) instance) +* 3 Pulsar [brokers](reference-terminology.md#broker) (1 broker on each node and 1 admin instance) +* 1 [Prometheus](http://prometheus.io/) instance and 1 [Grafana](https://grafana.com/) instance + + +> When you run DC/OS, a ZooKeeper cluster will be running at `master.mesos:2181`, thus you do not have to install or start up ZooKeeper separately. + +After executing the `dcos` command above, click the **Services** tab in the DC/OS [GUI interface](https://docs.mesosphere.com/latest/gui/), which you can access at [http://m1.dcos](http://m1.dcos) in this example. You should see several applications during the deployment. 
+ +![DC/OS command executed](/assets/dcos_command_execute.png) + +![DC/OS command executed2](/assets/dcos_command_execute2.png) + +## The BookKeeper group + +To monitor the status of the BookKeeper cluster deployment, click the **bookkeeper** group in the parent **pulsar** group. + +![DC/OS bookkeeper status](/assets/dcos_bookkeeper_status.png) + +At this point, the status of the 3 [bookies](reference-terminology.md#bookie) is green, which means that the bookies have been deployed successfully and are running. + +![DC/OS bookkeeper running](/assets/dcos_bookkeeper_run.png) + +You can also click each bookie instance to get more detailed information, such as the bookie running log. + +![DC/OS bookie log](/assets/dcos_bookie_log.png) + +To display information about the BookKeeper in ZooKeeper, you can visit [http://m1.dcos/exhibitor](http://m1.dcos/exhibitor). In this example, 3 bookies are under the `available` directory. + +![DC/OS bookkeeper in zk](/assets/dcos_bookkeeper_in_zookeeper.png) + +## The Pulsar broker group + +Similar to the BookKeeper group above, click **brokers** to check the status of the Pulsar brokers. + +![DC/OS broker status](/assets/dcos_broker_status.png) + +![DC/OS broker running](/assets/dcos_broker_run.png) + +You can also click each broker instance to get more detailed information, such as the broker running log. + +![DC/OS broker log](/assets/dcos_broker_log.png) + +Broker cluster information in ZooKeeper is also available through the web UI. In this example, you can see that the `loadbalance` and `managed-ledgers` directories have been created. + +![DC/OS broker in zk](/assets/dcos_broker_in_zookeeper.png) + +## Monitor group + +The **monitor** group consists of Prometheus and Grafana. + +![DC/OS monitor status](/assets/dcos_monitor_status.png) + +### Prometheus + +Click the instance of `prom` to get the endpoint of Prometheus, which is `192.168.65.121:9090` in this example.
+ +![DC/OS prom endpoint](/assets/dcos_prom_endpoint.png) + +If you click that endpoint, you can see the Prometheus dashboard. All the bookies and brokers are listed on [http://192.168.65.121:9090/targets](http://192.168.65.121:9090/targets). + +![DC/OS prom targets](/assets/dcos_prom_targets.png) + +### Grafana + +Click `grafana` to get the endpoint for Grafana, which is `192.168.65.121:3000` in this example. + +![DC/OS grafana endpoint](/assets/dcos_grafana_endpoint.png) + +If you click that endpoint, you can access the Grafana dashboard. + +![DC/OS grafana targets](/assets/dcos_grafana_dashboard.png) + +## Run a simple Pulsar consumer and producer on DC/OS + +Now that you have a fully deployed Pulsar cluster, you can run a simple consumer and producer to show Pulsar on DC/OS in action. + +### Download and prepare the Pulsar Java tutorial + +You can clone a [Pulsar Java tutorial](https://github.com/streamlio/pulsar-java-tutorial) repo. This repo contains a simple Pulsar consumer and producer (you can find more information in the `README` file in this repo). + +```bash + +$ git clone https://github.com/streamlio/pulsar-java-tutorial + +``` + +Change the `SERVICE_URL` from `pulsar://localhost:6650` to `pulsar://a1.dcos:6650` in both [`ConsumerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ConsumerTutorial.java) file and [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java) file. + +The `pulsar://a1.dcos:6650` endpoint is for the broker service. You can fetch the endpoint details for each broker instance from the DC/OS GUI. `a1.dcos` is a DC/OS client agent that runs a broker, and you can replace it with the client agent IP address. 
+ +Now, you can change the message number from 10 to 10000000 in the main method in [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java) file to produce more messages. + +Then, you can compile the project code using the command below: + +```bash + +$ mvn clean package + +``` + +### Run the consumer and producer + +Execute this command to run the consumer: + +```bash + +$ mvn exec:java -Dexec.mainClass="tutorial.ConsumerTutorial" + +``` + +Execute this command to run the producer: + +```bash + +$ mvn exec:java -Dexec.mainClass="tutorial.ProducerTutorial" + +``` + +You see that the producer is producing messages and the consumer is consuming messages through the DC/OS GUI. + +![DC/OS pulsar producer](/assets/dcos_producer.png) + +![DC/OS pulsar consumer](/assets/dcos_consumer.png) + +### View Grafana metric output + +While the producer and consumer are running, you can access the running metrics from Grafana. + +![DC/OS pulsar dashboard](/assets/dcos_metrics.png) + + +## Uninstall Pulsar + +You can shut down and uninstall the `pulsar` application from DC/OS at any time in one of the following two ways: + +1. Click the three dots at the right end of Pulsar group and choose **Delete** on the DC/OS GUI. + + ![DC/OS pulsar uninstall](/assets/dcos_uninstall.png) + +2. Use the command below. + + ```bash + + $ dcos marathon group remove /pulsar + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/deploy-docker.md b/site2/website/versioned_docs/version-2.10.x/deploy-docker.md new file mode 100644 index 0000000000000..8348d78deb237 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/deploy-docker.md @@ -0,0 +1,60 @@ +--- +id: deploy-docker +title: Deploy a cluster on Docker +sidebar_label: "Docker" +original_id: deploy-docker +--- + +To deploy a Pulsar cluster on Docker, complete the following steps: +1. Deploy a ZooKeeper cluster (optional) +2. 
Initialize cluster metadata +3. Deploy a BookKeeper cluster +4. Deploy one or more Pulsar brokers + +## Prepare + +To run Pulsar on Docker, you need to create a container for each Pulsar component: ZooKeeper, BookKeeper and broker. You can pull the images of ZooKeeper and BookKeeper separately on [Docker Hub](https://hub.docker.com/), and pull a [Pulsar image](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) for the broker. You can also pull only one [Pulsar image](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) and create three containers with this image. This tutorial takes the second option as an example. + +### Pull a Pulsar image +You can pull a Pulsar image from [Docker Hub](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) with the following command. + +``` + +docker pull apachepulsar/pulsar-all:latest + +``` + +### Create three containers +Create containers for ZooKeeper, BookKeeper and broker. In this example, they are named as `zookeeper`, `bookkeeper` and `broker` respectively. You can name them as you want with the `--name` flag. By default, the container names are created randomly. + +``` + +docker run -it --name bookkeeper apachepulsar/pulsar-all:latest /bin/bash +docker run -it --name zookeeper apachepulsar/pulsar-all:latest /bin/bash +docker run -it --name broker apachepulsar/pulsar-all:latest /bin/bash + +``` + +### Create a network +To deploy a Pulsar cluster on Docker, you need to create a `network` and connect the containers of ZooKeeper, BookKeeper and broker to this network. The following command creates the network `pulsar`: + +``` + +docker network create pulsar + +``` + +### Connect containers to network +Connect the containers of ZooKeeper, BookKeeper and broker to the `pulsar` network with the following commands. 
+ +``` + +docker network connect pulsar zookeeper +docker network connect pulsar bookkeeper +docker network connect pulsar broker + +``` + +To check whether the containers are successfully connected to the network, enter the `docker network inspect pulsar` command. + +For detailed information about how to deploy ZooKeeper cluster, BookKeeper cluster, brokers, see [deploy a cluster on bare metal](deploy-bare-metal.md). diff --git a/site2/website/versioned_docs/version-2.10.x/deploy-kubernetes.md b/site2/website/versioned_docs/version-2.10.x/deploy-kubernetes.md new file mode 100644 index 0000000000000..1aefc6ad79f71 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/deploy-kubernetes.md @@ -0,0 +1,11 @@ +--- +id: deploy-kubernetes +title: Deploy Pulsar on Kubernetes +sidebar_label: "Kubernetes" +original_id: deploy-kubernetes +--- + +To get up and running with these charts as fast as possible, in a **non-production** use case, we provide +a [quick start guide](getting-started-helm.md) for Proof of Concept (PoC) deployments. + +To configure and install a Pulsar cluster on Kubernetes for production usage, follow the complete [Installation Guide](helm-install.md). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/deploy-monitoring.md b/site2/website/versioned_docs/version-2.10.x/deploy-monitoring.md new file mode 100644 index 0000000000000..69d994b7d586f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/deploy-monitoring.md @@ -0,0 +1,138 @@ +--- +id: deploy-monitoring +title: Monitor +sidebar_label: "Monitor" +original_id: deploy-monitoring +--- + +You can use different ways to monitor a Pulsar cluster, exposing both metrics related to the usage of topics and the overall health of the individual components of the cluster. + +## Collect metrics + +You can collect broker stats, ZooKeeper stats, and BookKeeper stats. 
+ +### Broker stats + +You can collect Pulsar broker metrics from brokers and export the metrics in JSON format. The Pulsar broker metrics mainly have two types: + +* *Destination dumps*, which contain stats for each individual topic. You can fetch the destination dumps using the command below: + + ```shell + + bin/pulsar-admin broker-stats destinations + + ``` + +* Broker metrics, which contain the broker information and topics stats aggregated at namespace level. You can fetch the broker metrics by using the following command: + + ```shell + + bin/pulsar-admin broker-stats monitoring-metrics + + ``` + +All the message rates are updated every minute. + +The aggregated broker metrics are also exposed in the [Prometheus](https://prometheus.io) format at: + +```shell + +http://$BROKER_ADDRESS:8080/metrics/ + +``` + +### ZooKeeper stats + +The local ZooKeeper, configuration store server and clients that are shipped with Pulsar can expose detailed stats through Prometheus. + +```shell + +http://$LOCAL_ZK_SERVER:8000/metrics +http://$GLOBAL_ZK_SERVER:8001/metrics + +``` + +The default port of local ZooKeeper is `8000` and the default port of the configuration store is `8001`. You can use a different stats port by configuring `metricsProvider.httpPort` in the `conf/zookeeper.conf` file. + +### BookKeeper stats + +You can configure the stats frameworks for BookKeeper by modifying the `statsProviderClass` in the `conf/bookkeeper.conf` file. + +The default BookKeeper configuration enables the Prometheus exporter. The configuration is included with Pulsar distribution. + +```shell + +http://$BOOKIE_ADDRESS:8000/metrics + +``` + +The default port for bookie is `8000`. You can change the port by configuring `prometheusStatsHttpPort` in the `conf/bookkeeper.conf` file. + +### Managed cursor acknowledgment state +The acknowledgment state is persistent to the ledger first. When the acknowledgment state fails to be persistent to the ledger, they are persistent to ZooKeeper. 
To track the stats of acknowledgement, you can configure the metrics for the managed cursor. + +``` + +brk_ml_cursor_persistLedgerSucceed(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistLedgerErrors(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistZookeeperSucceed(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistZookeeperErrors(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_nonContiguousDeletedMessagesRange(namespace="", ledger_name="", cursor_name:"") + +``` + +These metrics are exposed through the Prometheus interface, so you can monitor and check them in Grafana. + +### Function and connector stats + +You can collect functions worker stats from `functions-worker` and export the metrics in JSON formats, which contain functions worker JVM metrics. + +``` + +pulsar-admin functions-worker monitoring-metrics + +``` + +You can collect functions and connectors metrics from `functions-worker` and export the metrics in JSON formats. + +``` + +pulsar-admin functions-worker function-stats + +``` + +The aggregated functions and connectors metrics can be exposed in Prometheus formats as below. You can get [`FUNCTIONS_WORKER_ADDRESS`](functions-worker.md) and `WORKER_PORT` from the `functions_worker.yml` file. + +``` + +http://$FUNCTIONS_WORKER_ADDRESS:$WORKER_PORT/metrics + +``` + +## Configure Prometheus + +You can use Prometheus to collect all the metrics exposed for Pulsar components and set up [Grafana](https://grafana.com/) dashboards to display the metrics and monitor your Pulsar cluster. For details, refer to [Prometheus guide](https://prometheus.io/docs/introduction/getting_started/). + +When you run Pulsar on bare metal, you can provide the list of nodes to be probed. When you deploy Pulsar in a Kubernetes cluster, the monitoring is set up automatically. For details, refer to [Kubernetes instructions](helm-deploy.md).
+ +## Dashboards + +When you collect time series statistics, the major problem is to make sure the number of dimensions attached to the data does not explode. Thus you only need to collect time series of metrics aggregated at the namespace level. + +### Pulsar per-topic dashboard + +The per-topic dashboard instructions are available at [Pulsar manager](administration-pulsar-manager.md). + +### Grafana + +You can use Grafana to create dashboards driven by the data that is stored in Prometheus. + +When you deploy Pulsar on Kubernetes with the Pulsar Helm Chart, a `pulsar-grafana` Docker image is enabled by default. You can use the docker image with the principal dashboards. + +The following are some Grafana dashboards examples: + +- [pulsar-grafana](deploy-monitoring.md#grafana): a Grafana dashboard that displays metrics collected in Prometheus for Pulsar clusters running on Kubernetes. +- [apache-pulsar-grafana-dashboard](https://github.com/streamnative/apache-pulsar-grafana-dashboard): a collection of Grafana dashboard templates for different Pulsar components running on both Kubernetes and on-premise machines. + +## Alerting rules +You can set alerting rules according to your Pulsar environment. To configure alerting rules for Apache Pulsar, refer to [alerting rules](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/develop-load-manager.md b/site2/website/versioned_docs/version-2.10.x/develop-load-manager.md new file mode 100644 index 0000000000000..509209b6a852d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/develop-load-manager.md @@ -0,0 +1,227 @@ +--- +id: develop-load-manager +title: Modular load manager +sidebar_label: "Modular load manager" +original_id: develop-load-manager +--- + +The *modular load manager*, implemented in [`ModularLoadManagerImpl`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/ModularLoadManagerImpl.java), is a flexible alternative to the previously implemented load manager, [`SimpleLoadManagerImpl`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/SimpleLoadManagerImpl.java), which attempts to simplify how load is managed while also providing abstractions so that complex load management strategies may be implemented. + +## Usage + +There are two ways that you can enable the modular load manager: + +1. Change the value of the `loadManagerClassName` parameter in `conf/broker.conf` from `org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl` to `org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl`. +2. Using the `pulsar-admin` tool. Here's an example: + + ```shell + + $ pulsar-admin update-dynamic-config \ + --config loadManagerClassName \ + --value org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl + + ``` + + You can use the same method to change back to the original value. In either case, any mistake in specifying the load manager will cause Pulsar to default to `SimpleLoadManagerImpl`. + +## Verification + +There are a few different ways to determine which load manager is being used: + +1. 
Use `pulsar-admin` to examine the `loadManagerClassName` element: + + ```shell + + $ bin/pulsar-admin brokers get-all-dynamic-config + { + "loadManagerClassName" : "org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl" + } + + ``` + + If there is no `loadManagerClassName` element, then the default load manager is used. + +2. Consult a ZooKeeper load report. With the modular load manager, the load report in `/loadbalance/brokers/...` will have many differences. For example, the `systemResourceUsage` sub-elements (`bandwidthIn`, `bandwidthOut`, etc.) are now all at the top level. Here is an example load report from the modular load manager: + + ```json + + { + "bandwidthIn": { + "limit": 10240000.0, + "usage": 4.256510416666667 + }, + "bandwidthOut": { + "limit": 10240000.0, + "usage": 5.287239583333333 + }, + "bundles": [], + "cpu": { + "limit": 2400.0, + "usage": 5.7353247655435915 + }, + "directMemory": { + "limit": 16384.0, + "usage": 1.0 + } + } + + ``` + + With the simple load manager, the load report in `/loadbalance/brokers/...` will look like this: + + ```json + + { + "systemResourceUsage": { + "bandwidthIn": { + "limit": 10240000.0, + "usage": 0.0 + }, + "bandwidthOut": { + "limit": 10240000.0, + "usage": 0.0 + }, + "cpu": { + "limit": 2400.0, + "usage": 0.0 + }, + "directMemory": { + "limit": 16384.0, + "usage": 1.0 + }, + "memory": { + "limit": 8192.0, + "usage": 3903.0 + } + } + } + + ``` + +3. The command-line [broker monitor](reference-cli-tools.md#monitor-brokers) will have a different output format depending on which load manager implementation is being used.
+ + Here is an example from the modular load manager: + + ``` + + =================================================================================================================== + ||SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.00 |48.33 |0.01 |0.00 |0.00 |48.33 || + ||COUNT |TOPIC |BUNDLE |PRODUCER |CONSUMER |BUNDLE + |BUNDLE - || + || |4 |4 |0 |2 |4 |0 || + ||LATEST |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + ||SHORT |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + ||LONG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + =================================================================================================================== + + ``` + + Here is an example from the simple load manager: + + ``` + + =================================================================================================================== + ||COUNT |TOPIC |BUNDLE |PRODUCER |CONSUMER |BUNDLE + |BUNDLE - || + || |4 |4 |0 |2 |0 |0 || + ||RAW SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.25 |47.94 |0.01 |0.00 |0.00 |47.94 || + ||ALLOC SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.20 |1.89 | |1.27 |3.21 |3.21 || + ||RAW MSG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.01 |0.01 |0.01 || + ||ALLOC MSG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |54.84 |134.48 |189.31 |126.54 |320.96 |447.50 || + =================================================================================================================== + + ``` + +It is important to note that the module load manager is _centralized_, meaning that all requests to assign a bundle---whether it's been seen before or whether this is the first time---only get handled by the _lead_ broker (which can change over time). 
To determine the current lead broker, examine the `/loadbalance/leader` node in ZooKeeper. + +## Implementation + +### Data + +The data monitored by the modular load manager is contained in the [`LoadData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/LoadData.java) class. +Here, the available data is subdivided into the bundle data and the broker data. + +#### Broker + +The broker data is contained in the [`BrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BrokerData.java) class. It is further subdivided into two parts, +one being the local data which every broker individually writes to ZooKeeper, and the other being the historical broker +data which is written to ZooKeeper by the leader broker. + +##### Local Broker Data +The local broker data is contained in the class [`LocalBrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/java/org/apache/pulsar/policies/data/loadbalancer/LocalBrokerData.java) and provides information about the following resources: + +* CPU usage +* JVM heap memory usage +* Direct memory usage +* Bandwidth in/out usage +* Most recent total message rate in/out across all bundles +* Total number of topics, bundles, producers, and consumers +* Names of all bundles assigned to this broker +* Most recent changes in bundle assignments for this broker + +The local broker data is updated periodically according to the service configuration +"loadBalancerReportUpdateMaxIntervalMinutes". 
After any broker updates its local broker data, the leader broker will +receive the update immediately via a ZooKeeper watch, where the local data is read from the ZooKeeper node +`/loadbalance/brokers/` + +##### Historical Broker Data + +The historical broker data is contained in the [`TimeAverageBrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/TimeAverageBrokerData.java) class. + +In order to reconcile the need to make good decisions in a steady-state scenario and make reactive decisions in a critical scenario, the historical data is split into two parts: the short-term data for reactive decisions, and the long-term data for steady-state decisions. Both time frames maintain the following information: + +* Message rate in/out for the entire broker +* Message throughput in/out for the entire broker + +Unlike the bundle data, the broker data does not maintain samples for the global broker message rates and throughputs, which are not expected to remain steady as new bundles are removed or added. Instead, this data is aggregated over the short-term and long-term data for the bundles. See the section on bundle data to understand how that data is collected and maintained. + +The historical broker data is updated for each broker in memory by the leader broker whenever any broker writes its local data to ZooKeeper. Then, the historical data is written to ZooKeeper by the leader broker periodically according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`. + +##### Bundle Data + +The bundle data is contained in the [`BundleData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BundleData.java) class. Like the historical broker data, the bundle data is split into a short-term and a long-term time frame.
The information maintained in each time frame: + +* Message rate in/out for this bundle +* Message Throughput In/Out for this bundle +* Current number of samples for this bundle + +The time frames are implemented by maintaining the average of these values over a set, limited number of samples, where +the samples are obtained through the message rate and throughput values in the local data. Thus, if the update interval +for the local data is 2 minutes, the number of short samples is 10 and the number of long samples is 1000, the +short-term data is maintained over a period of `10 samples * 2 minutes / sample = 20 minutes`, while the long-term +data is similarly over a period of 2000 minutes. Whenever there are not enough samples to satisfy a given time frame, +the average is taken only over the existing samples. When no samples are available, default values are assumed until +they are overwritten by the first sample. Currently, the default values are + +* Message rate in/out: 50 messages per second both ways +* Message throughput in/out: 50KB per second both ways + +The bundle data is updated in memory on the leader broker whenever any broker writes their local data to ZooKeeper. +Then, the bundle data is written to ZooKeeper by the leader broker periodically at the same time as the historical +broker data, according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`. + +### Traffic Distribution + +The modular load manager uses the abstraction provided by [`ModularLoadManagerStrategy`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/ModularLoadManagerStrategy.java) to make decisions about bundle assignment. The strategy makes a decision by considering the service configuration, the entire load data, and the bundle data for the bundle to be assigned. 
Currently, the only supported strategy is [`LeastLongTermMessageRate`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LeastLongTermMessageRate.java), though soon users will have the ability to inject their own strategies if desired. + +#### Least Long Term Message Rate Strategy + +As its name suggests, the least long term message rate strategy attempts to distribute bundles across brokers so that +the message rate in the long-term time window for each broker is roughly the same. However, simply balancing load based +on message rate does not handle the issue of asymmetric resource burden per message on each broker. Thus, the system +resource usages, which are CPU, memory, direct memory, bandwidth in, and bandwidth out, are also considered in the +assignment process. This is done by weighting the final message rate according to +`1 / (overload_threshold - max_usage)`, where `overload_threshold` corresponds to the configuration +`loadBalancerBrokerOverloadedThresholdPercentage` and `max_usage` is the maximum proportion among the system resources +that is being utilized by the candidate broker. This multiplier ensures that machines that are being more heavily taxed +by the same message rates will receive less load. In particular, it tries to ensure that if one machine is overloaded, +then all machines are approximately overloaded. In the case in which a broker's max usage exceeds the overload +threshold, that broker is not considered for bundle assignment. If all brokers are overloaded, the bundle is randomly +assigned.
+ diff --git a/site2/website/versioned_docs/version-2.10.x/develop-plugin.md b/site2/website/versioned_docs/version-2.10.x/develop-plugin.md new file mode 100644 index 0000000000000..28d8de8ae375d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/develop-plugin.md @@ -0,0 +1,139 @@ +--- +id: develop-plugin +title: Pulsar plugin development +sidebar_label: "Plugin" +original_id: develop-plugin +--- + +You can develop various plugins for Pulsar, such as entry filters, protocol handlers, interceptors, and so on. + +## Entry filter + +This chapter describes what the entry filter is and shows how to use the entry filter. + +### What is an entry filter? + +The entry filter is an extension point for implementing a custom message entry strategy. With an entry filter, you can decide **whether to send messages to consumers** (brokers can use the return values of entry filters to determine whether the messages need to be sent or discarded) or **send messages to specific consumers.** + +To implement features such as tagged messages or custom delayed messages, use [`subscriptionProperties`](https://github.com/apache/pulsar/blob/ec0a44058d249a7510bb3d05685b2ee5e0874eb6/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/ConsumerBuilder.java?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L174), [`​​properties`](https://github.com/apache/pulsar/blob/ec0a44058d249a7510bb3d05685b2ee5e0874eb6/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/ConsumerBuilder.java?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L533), and entry filters. + +### How to use an entry filter? + +Follow the steps below: + +1. Create a Maven project. + +2. Implement the `EntryFilter` interface. + +3. Package the implementation class into a NAR file. + +4. 
Configure the `broker.conf` file (or the `standalone.conf` file) and restart your broker. + +#### Step 1: Create a Maven project + +For how to create a Maven project, see [here](https://maven.apache.org/guides/getting-started/maven-in-five-minutes.html). + +#### Step 2: Implement the `EntryFilter` interface + +1. Add a dependency for Pulsar broker in the `pom.xml` file to display. Otherwise, you can not find the [`EntryFilter` interface](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/plugin/EntryFilter.java). + + ```xml + + + org.apache.pulsar + pulsar-broker + ${pulsar.version} + provided + + + ``` + +2. Implement the [`FilterResult filterEntry(Entry entry, FilterContext context);` method](https://github.com/apache/pulsar/blob/2adb6661d5b82c5705ee00ce3ebc9941c99635d5/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/plugin/EntryFilter.java#L34). + + - If the method returns `ACCEPT` or NULL, this message is sent to consumers. + + - If the method returns `REJECT`, this message is filtered out and it does not consume message permits. + + - If there are multiple entry filters, this message passes through all filters in the pipeline in a round-robin manner. If any entry filter returns `REJECT`, this message is discarded. + + You can get entry metadata, subscriptions, and other information through `FilterContext`. + +3. Describe a NAR file. + + Create an `entry_filter.yml` file in the `resources/META-INF/services` directory to describe a NAR file. + + ```conf + + # Entry filter name, which should be configured in the broker.conf file later + name: entryFilter + # Entry filter description + description: entry filter + # Implementation class name of entry filter + entryFilterClass: com.xxxx.xxxx.xxxx.DefaultEntryFilterImpl + + ``` + +#### Step 3: package implementation class of entry filter into a NAR file + +1. Add the compiled plugin of the NAR file to your `pom.xml` file. 
+ + ```xml + + + ${project.artifactId} + + + org.apache.nifi + nifi-nar-maven-plugin + 1.2.0 + true + + ${project.artifactId}-${project.version} + + + + default-nar + package + + nar + + + + + + + + ``` + +2. Generate a NAR file in the `target` directory. + + ```script + + mvn clean install + + ``` + +#### Step 4: configure and restart broker + +1. Configure the following parameters in the `broker.conf` file (or the `standalone.conf` file). + + ```conf + + # Class name of pluggable entry filters + # Multiple classes need to be separated by commas. + entryFilterNames=entryFilter1,entryFilter2,entryFilter3 + # The directory for all entry filter implementations + entryFiltersDirectory=tempDir + + ``` + +2. Restart your broker. + + You can see the following broker log if the plug-in is successfully loaded. + + ```text + + Successfully loaded entry filter for name `{name of your entry filter}` + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/develop-schema.md b/site2/website/versioned_docs/version-2.10.x/develop-schema.md new file mode 100644 index 0000000000000..2d4461a5ea2b5 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/develop-schema.md @@ -0,0 +1,62 @@ +--- +id: develop-schema +title: Custom schema storage +sidebar_label: "Custom schema storage" +original_id: develop-schema +--- + +By default, Pulsar stores data type [schemas](concepts-schema-registry.md) in [Apache BookKeeper](https://bookkeeper.apache.org) (which is deployed alongside Pulsar). You can, however, use another storage system if you wish. This doc walks you through creating your own schema storage implementation. + +In order to use a non-default (i.e. non-BookKeeper) storage system for Pulsar schemas, you need to implement two Java interfaces: [`SchemaStorage`](#schemastorage-interface) and [`SchemaStorageFactory`](#schemastoragefactory-interface). 
+ +## SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java + +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} + +``` + +> For a full-fledged example schema storage implementation, see the [`BookKeeperSchemaStorage`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. + +## SchemaStorageFactory interface + +```java + +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} + +``` + +> For a full-fledged example schema storage factory implementation, see the [`BookKeeperSchemaStorageFactory`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +## Deployment + +In order to use your custom schema storage implementation, you'll need to: + +1. Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. +1. Add that jar to the `lib` folder in your Pulsar [binary or source distribution](getting-started-standalone.md#installing-pulsar). +1. Change the `schemaRegistryStorageClassName` configuration in [`broker.conf`](reference-configuration.md#broker) to your custom factory class (i.e. 
the `SchemaStorageFactory` implementation, not the `SchemaStorage` implementation). +1. Start up Pulsar. diff --git a/site2/website/versioned_docs/version-2.10.x/develop-tools.md b/site2/website/versioned_docs/version-2.10.x/develop-tools.md new file mode 100644 index 0000000000000..b5457790b8081 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/develop-tools.md @@ -0,0 +1,111 @@ +--- +id: develop-tools +title: Simulation tools +sidebar_label: "Simulation tools" +original_id: develop-tools +--- + +It is sometimes necessary to create a test environment and incur artificial load to observe how well load managers +handle the load. The load simulation controller, the load simulation client, and the broker monitor were created in an +effort to make creating this load and observing its effects on the managers easier. + +## Simulation Client +The simulation client is a machine which will create and subscribe to topics with configurable message rates and sizes. +Because it is sometimes necessary in simulating large load to use multiple client machines, the user does not interact +with the simulation client directly, but instead delegates their requests to the simulation controller, which will then +send signals to clients to start incurring load. The client implementation is in the class +`org.apache.pulsar.testclient.LoadSimulationClient`. + +### Usage +To start a simulation client, use the `pulsar-perf` script with the command `simulation-client` as follows: + +``` + +pulsar-perf simulation-client --port --service-url + +``` + +The client will then be ready to receive controller commands. +## Simulation Controller +The simulation controller sends signals to the simulation clients, requesting them to create new topics, stop old +topics, change the load incurred by topics, as well as several other tasks. It is implemented in the class +`org.apache.pulsar.testclient.LoadSimulationController` and presents a shell to the user as an interface to send +commands with.
+ +### Usage +To start a simulation controller, use the `pulsar-perf` script with the command `simulation-controller` as follows: + +``` + +pulsar-perf simulation-controller --cluster --client-port +--clients + +``` + +The clients should already be started before the controller is started. You will then be presented with a simple prompt, +where you can issue commands to simulation clients. Arguments often refer to tenant names, namespace names, and topic +names. In all cases, the BASE name of the tenants, namespaces, and topics are used. For example, for the topic +`persistent://my_tenant/my_cluster/my_namespace/my_topic`, the tenant name is `my_tenant`, the namespace name is +`my_namespace`, and the topic name is `my_topic`. The controller can perform the following actions: + +* Create a topic with a producer and a consumer + * `trade [--rate ] + [--rand-rate ,] + [--size ]` +* Create a group of topics with a producer and a consumer + * `trade_group [--rate ] + [--rand-rate ,] + [--separation ] [--size ] + [--topics-per-namespace ]` +* Change the configuration of an existing topic + * `change [--rate ] + [--rand-rate ,] + [--size ]` +* Change the configuration of a group of topics + * `change_group [--rate ] [--rand-rate ,] + [--size ] [--topics-per-namespace ]` +* Shutdown a previously created topic + * `stop ` +* Shutdown a previously created group of topics + * `stop_group ` +* Copy the historical data from one ZooKeeper to another and simulate based on the message rates and sizes in that history + * `copy [--rate-multiplier value]` +* Simulate the load of the historical data on the current ZooKeeper (should be same ZooKeeper being simulated on) + * `simulate [--rate-multiplier value]` +* Stream the latest data from the given active ZooKeeper to simulate the real-time load of that ZooKeeper. + * `stream [--rate-multiplier value]` + +The "group" arguments in these commands allow the user to create or affect multiple topics at once. 
Groups are created +when calling the `trade_group` command, and all topics from these groups may be subsequently modified or stopped +with the `change_group` and `stop_group` commands respectively. All ZooKeeper arguments are of the form +`zookeeper_host:port`. + +### Difference Between Copy, Simulate, and Stream +The commands `copy`, `simulate`, and `stream` are very similar but have significant differences. `copy` is used when +you want to simulate the load of a static, external ZooKeeper on the ZooKeeper you are simulating on. Thus, +`source zookeeper` should be the ZooKeeper you want to copy and `target zookeeper` should be the ZooKeeper you are +simulating on, and then it will get the full benefit of the historical data of the source in both load manager +implementations. `simulate` on the other hand takes in only one ZooKeeper, the one you are simulating on. It assumes +that you are simulating on a ZooKeeper that has historical data for `SimpleLoadManagerImpl` and creates equivalent +historical data for `ModularLoadManagerImpl`. Then, the load according to the historical data is simulated by the +clients. Finally, `stream` takes in an active ZooKeeper different than the ZooKeeper being simulated on and streams +load data from it and simulates the real-time load. In all cases, the optional `rate-multiplier` argument allows the +user to simulate some proportion of the load. For instance, using `--rate-multiplier 0.05` will cause messages to +be sent at only `5%` of the rate of the load that is being simulated. + +## Broker Monitor +To observe the behavior of the load manager in these simulations, one may utilize the broker monitor, which is +implemented in `org.apache.pulsar.testclient.BrokerMonitor`. The broker monitor will print tabular load data to the +console as it is updated using watchers. 
+ +### Usage +To start a broker monitor, use the `monitor-brokers` command in the `pulsar-perf` script: + +``` + +pulsar-perf monitor-brokers --connect-string + +``` + +The console will then continuously print load data until it is interrupted. + diff --git a/site2/website/versioned_docs/version-2.10.x/developing-binary-protocol.md b/site2/website/versioned_docs/version-2.10.x/developing-binary-protocol.md new file mode 100644 index 0000000000000..9787f91e3624f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/developing-binary-protocol.md @@ -0,0 +1,637 @@ +--- +id: developing-binary-protocol +title: Pulsar binary protocol specification +sidebar_label: "Binary protocol" +original_id: developing-binary-protocol +--- + +Pulsar uses a custom binary protocol for communications between producers/consumers and brokers. This protocol is designed to support required features, such as acknowledgements and flow control, while ensuring maximum transport and implementation efficiency. + +Clients and brokers exchange *commands* with each other. Commands are formatted as binary [protocol buffer](https://developers.google.com/protocol-buffers/) (aka *protobuf*) messages. The format of protobuf commands is specified in the [`PulsarApi.proto`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto) file and also documented in the [Protobuf interface](#protobuf-interface) section below. + +> ### Connection sharing +> Commands for different producers and consumers can be interleaved and sent through the same connection without restriction. + +All commands associated with Pulsar's protocol are contained in a [`BaseCommand`](#pulsar.proto.BaseCommand) protobuf message that includes a [`Type`](#pulsar.proto.Type) [enum](https://developers.google.com/protocol-buffers/docs/proto#enum) with all possible subcommands as optional fields. `BaseCommand` messages can specify only one subcommand. 
+ +## Framing + +Since protobuf doesn't provide any sort of message frame, all messages in the Pulsar protocol are prepended with a 4-byte field that specifies the size of the frame. The maximum allowable size of a single frame is 5 MB. + +The Pulsar protocol allows for two types of commands: + +1. **Simple commands** that do not carry a message payload. +2. **Payload commands** that bear a payload that is used when publishing or delivering messages. In payload commands, the protobuf command data is followed by protobuf [metadata](#message-metadata) and then the payload, which is passed in raw format outside of protobuf. All sizes are passed as 4-byte unsigned big endian integers. + +> Message payloads are passed in raw format rather than protobuf format for efficiency reasons. + +### Simple commands + +Simple (payload-free) commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:--------------|:----------------------------------------------------------------------------------------|:----------------| +| `totalSize` | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| `commandSize` | The size of the protobuf-serialized command | 4 | +| `message` | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | + +### Payload commands + +Payload commands have this basic structure: + +| Component | Required or optional| Description | Size (in bytes) | +|:-----------------------------------|:----------|:--------------------------------------------------------------------------------------------|:----------------| +| `totalSize` | Required | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| `commandSize` | Required | The size of the protobuf-serialized command | 4 | +| `message` | Required | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | +| `magicNumberOfBrokerEntryMetadata` | Optional | A 2-byte byte 
array (`0x0e02`) identifying the broker entry metadata
    **Note**: `magicNumberOfBrokerEntryMetadata`, `brokerEntryMetadataSize`, and `brokerEntryMetadata` should be used **together**. | 2 | +| `brokerEntryMetadataSize` | Optional | The size of the broker entry metadata | 4 | +| `brokerEntryMetadata` | Optional | The broker entry metadata stored as a binary protobuf message | | +| `magicNumber` | Required | A 2-byte byte array (`0x0e01`) identifying the current format | 2 | +| `checksum` | Required | A [CRC32-C checksum](http://www.evanjones.ca/crc32c.html) of everything that comes after it | 4 | +| `metadataSize` | Required | The size of the message [metadata](#message-metadata) | 4 | +| `metadata` | Required | The message [metadata](#message-metadata) stored as a binary protobuf message | | +| `payload` | Required | Anything left in the frame is considered the payload and can include any sequence of bytes | | + +## Broker entry metadata + +Broker entry metadata is stored alongside the message metadata as a serialized protobuf message. +It is created by the broker when the message arrives at the broker and is passed without changes to the consumer if configured. + +| Field | Required or optional | Description | +|:-------------------|:----------------|:------------------------------------------------------------------------------------------------------------------------------| +| `broker_timestamp` | Optional | The timestamp when a message arrived at the broker (i.e. the number of milliseconds since January 1st, 1970 in UTC) | +| `index` | Optional | The index of the message. It is assigned by the broker. | + +If you want to use broker entry metadata for **brokers**, configure the [`brokerEntryMetadataInterceptors`](reference-configuration.md#broker) parameter in the `broker.conf` file. + +If you want to use broker entry metadata for **consumers**: + +1.
Use the client protocol version [18 or later](https://github.com/apache/pulsar/blob/ca37e67211feda4f7e0984e6414e707f1c1dfd07/pulsar-common/src/main/proto/PulsarApi.proto#L259). + +2. Configure the [`brokerEntryMetadataInterceptors`](reference-configuration.md#broker) parameter and set the [`enableExposingBrokerEntryMetadataToClient`](reference-configuration.md#broker) parameter to `true` in the `broker.conf` file. + +## Message metadata + +Message metadata is stored alongside the application-specified payload as a serialized protobuf message. Metadata is created by the producer and passed without changes to the consumer. + +| Field | Required or optional | Description | +|:-------------------------|:----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `producer_name` | Required | The name of the producer that published the message | +| `sequence_id` | Required | The sequence ID of the message, assigned by producer | +| `publish_time` | Required | The publish timestamp in Unix time (i.e. as the number of milliseconds since January 1st, 1970 in UTC) | +| `properties` | Required | A sequence of key/value pairs (using the [`KeyValue`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto#L32) message). These are application-defined keys and values with no special meaning to Pulsar. | +| `replicated_from` | Optional | Indicates that the message has been replicated and specifies the name of the [cluster](reference-terminology.md#cluster) where the message was originally published | +| `partition_key` | Optional | While publishing on a partition topic, if the key is present, the hash of the key is used to determine which partition to choose. Partition key is used as the message key. 
| +| `compression` | Optional | Signals that payload has been compressed and with which compression library | +| `uncompressed_size` | Optional | If compression is used, the producer must fill the uncompressed size field with the original payload size | +| `num_messages_in_batch` | Optional | If this message is really a [batch](#batch-messages) of multiple entries, this field must be set to the number of messages in the batch | + +### Batch messages + +When using batch messages, the payload will contain a list of entries, +each of them with its individual metadata, defined by the `SingleMessageMetadata` +object. + + +For a single batch, the payload format will look like this: + + +| Field | Required or optional | Description | +|:----------------|:---------------------|:-----------------------------------------------------------| +| `metadataSizeN` | Required |The size of the serialized Protobuf single message metadata | +| `metadataN` | Required |Single message metadata | +| `payloadN` | Required |Message payload passed by application | + +Each metadata field looks like this: + +| Field | Required or optional | Description | +|:----------------|:----------------------|:--------------------------------------------------------| +| `properties` | Required | Application-defined properties | +| `partition_key` | Optional | Key to indicate the hashing to a particular partition | +| `payload_size` | Required | Size of the payload for the single message in the batch | + +When compression is enabled, the whole batch will be compressed at once. + +## Interactions + +### Connection establishment + +After opening a TCP connection to a broker, typically on port 6650, the client +is responsible for initiating the session. + +![Connect interaction](/assets/binary-protocol-connect.png) + +After receiving a `Connected` response from the broker, the client can +consider the connection ready to use.
Alternatively, if the broker doesn't +validate the client authentication, it will reply with an `Error` command and +close the TCP connection. + +Example: + +```protobuf + +message CommandConnect { + "client_version" : "Pulsar-Client-Java-v1.15.2", + "auth_method_name" : "my-authentication-plugin", + "auth_data" : "my-auth-data", + "protocol_version" : 6 +} + +``` + +Fields: + * `client_version` → String based identifier. Format is not enforced + * `auth_method_name` → *(optional)* Name of the authentication plugin if auth + enabled + * `auth_data` → *(optional)* Plugin specific authentication data + * `protocol_version` → Indicates the protocol version supported by the + client. Broker will not send commands introduced in newer revisions of the + protocol. Broker might be enforcing a minimum version + +```protobuf + +message CommandConnected { + "server_version" : "Pulsar-Broker-v1.15.2", + "protocol_version" : 6 +} + +``` + +Fields: + * `server_version` → String identifier of broker version + * `protocol_version` → Protocol version supported by the broker. Client + must not attempt to send commands introduced in newer revisions of the + protocol + +### Keep Alive + +To identify prolonged network partitions between clients and brokers or cases +in which a machine crashes without interrupting the TCP connection on the remote +end (eg: power outage, kernel panic, hard reboot...), we have introduced a +mechanism to probe for the availability status of the remote peer. + +Both clients and brokers are sending `Ping` commands periodically and they will +close the socket if a `Pong` response is not received within a timeout (default +used by broker is 60s). + +A valid implementation of a Pulsar client is not required to send the `Ping` +probe, though it is required to promptly reply after receiving one from the +broker in order to prevent the remote side from forcibly closing the TCP connection. 
+ + +### Producer + +In order to send messages, a client needs to establish a producer. When creating +a producer, the broker will first verify that this particular client is +authorized to publish on the topic. + +Once the client gets confirmation of the producer creation, it can publish +messages to the broker, referring to the producer ID negotiated before. + +![Producer interaction](/assets/binary-protocol-producer.png) + +If the client does not receive a response indicating producer creation success or failure, +the client should first send a command to close the original producer before sending a +command to re-attempt producer creation. + +##### Command Producer + +```protobuf + +message CommandProducer { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "producer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete name of the topic on which you want to create the producer + * `producer_id` → Client generated producer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `producer_name` → *(optional)* If a producer name is specified, the name will + be used, otherwise the broker will generate a unique name. Generated + producer name is guaranteed to be globally unique. Implementations are + expected to let the broker generate a new producer name when the producer + is initially created, then reuse it when recreating the producer after + reconnections. + +The broker will reply with either `ProducerSuccess` or `Error` commands.
+ +##### Command ProducerSuccess + +```protobuf + +message CommandProducerSuccess { + "request_id" : 1, + "producer_name" : "generated-unique-producer-name" +} + +``` + +Parameters: + * `request_id` → The original ID of the `CreateProducer` request + * `producer_name` → Generated globally unique producer name or the name + specified by the client, if any. + +##### Command Send + +Command `Send` is used to publish a new message within the context of an +already existing producer. If a producer has not yet been created for the +connection, the broker will terminate the connection. This command is used +in a frame that includes the command as well as the message payload, for which the +complete format is specified in the [payload commands](#payload-commands) section. + +```protobuf + +message CommandSend { + "producer_id" : 1, + "sequence_id" : 0, + "num_messages" : 1 +} + +``` + +Parameters: + * `producer_id` → The ID of an existing producer + * `sequence_id` → Each message has an associated sequence ID which is expected + to be implemented with a counter starting at 0. The `SendReceipt` that + acknowledges the effective publishing of a message will refer to it by + its sequence ID. + * `num_messages` → *(optional)* Used when publishing a batch of messages at + once. + +##### Command SendReceipt + +After a message has been persisted on the configured number of replicas, the +broker will send the acknowledgment receipt to the producer. + +```protobuf + +message CommandSendReceipt { + "producer_id" : 1, + "sequence_id" : 0, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `producer_id` → The ID of the producer originating the send request + * `sequence_id` → The sequence ID of the published message + * `message_id` → The message ID assigned by the system to the published message. + Unique within a single cluster.
Message ID is composed of 2 longs, `ledgerId` + and `entryId`, that reflect that this unique ID is assigned when appending + to a BookKeeper ledger + + +##### Command CloseProducer + +**Note**: *This command can be sent by either producer or broker*. + +When receiving a `CloseProducer` command, the broker will stop accepting any +more messages for the producer, wait until all pending messages are persisted +and then reply `Success` to the client. + +If the client does not receive a response to a `Producer` command within a timeout, +the client must first send a `CloseProducer` command before sending another +`Producer` command. The client does not need to await a response to the `CloseProducer` +command before sending the next `Producer` command. + +The broker can send a `CloseProducer` command to client when it's performing +a graceful failover (eg: broker is being restarted, or the topic is being unloaded +by load balancer to be transferred to a different broker). + +When receiving the `CloseProducer`, the client is expected to go through the +service discovery lookup again and recreate the producer again. The TCP +connection is not affected. + +### Consumer + +A consumer is used to attach to a subscription and consume messages from it. +After every reconnection, a client needs to subscribe to the topic. If a +subscription is not already there, a new one will be created. + +![Consumer](/assets/binary-protocol-consumer.png) + +#### Flow control + +After the consumer is ready, the client needs to *give permission* to the +broker to push messages. This is done with the `Flow` command. + +A `Flow` command gives additional *permits* to send messages to the consumer. +A typical consumer implementation will use a queue to accumulate these messages +before the application is ready to consume them. 
+ +After the application has dequeued half of the messages in the queue, the consumer +sends permits to the broker to ask for more messages (equal to half of the messages in the queue). + +For example, if the queue size is 1000 and the consumer consumes 500 messages in the queue, +the consumer then sends permits to the broker to ask for 500 more messages. + +##### Command Subscribe + +```protobuf + +message CommandSubscribe { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "subscription" : "my-subscription-name", + "subType" : "Exclusive", + "consumer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete name of the topic on which you want to create the consumer + * `subscription` → Subscription name + * `subType` → Subscription type: Exclusive, Shared, Failover, Key_Shared + * `consumer_id` → Client generated consumer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `consumer_name` → *(optional)* Clients can specify a consumer name. This + name can be used to track a particular consumer in the stats. Also, in + Failover subscription type, the name is used to decide which consumer is + elected as *master* (the one receiving messages): consumers are sorted by + their consumer name and the first one is elected master. + +##### Command Flow + +```protobuf + +message CommandFlow { + "consumer_id" : 1, + "messagePermits" : 1000 +} + +``` + +Parameters: +* `consumer_id` → Id of an already established consumer +* `messagePermits` → Number of additional permits to grant to the broker for + pushing more messages + +##### Command Message + +Command `Message` is used by the broker to push messages to an existing consumer, +within the limits of the given permits.
+ + +This command is used in a frame that includes the message payload as well, for +which the complete format is specified in the [payload commands](#payload-commands) +section. + +```protobuf + +message CommandMessage { + "consumer_id" : 1, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +##### Command Ack + +An `Ack` is used to signal to the broker that a given message has been +successfully processed by the application and can be discarded by the broker. + +In addition, the broker will also maintain the consumer position based on the +acknowledged messages. + +```protobuf + +message CommandAck { + "consumer_id" : 1, + "ack_type" : "Individual", + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `consumer_id` → Id of an already established consumer + * `ack_type` → Type of acknowledgment: `Individual` or `Cumulative` + * `message_id` → Id of the message to acknowledge + * `validation_error` → *(optional)* Indicates that the consumer has discarded + the messages due to: `UncompressedSizeCorruption`, + `DecompressionError`, `ChecksumMismatch`, `BatchDeSerializeError` + * `properties` → *(optional)* Reserved configuration items + * `txnid_most_bits` → *(optional)* Same as Transaction Coordinator ID, `txnid_most_bits` and `txnid_least_bits` + uniquely identify a transaction. + * `txnid_least_bits` → *(optional)* The ID of the transaction opened in a transaction coordinator, + `txnid_most_bits` and `txnid_least_bits`uniquely identify a transaction. + * `request_id` → *(optional)* The ID for handling response and timeout. + + + ##### Command AckResponse + +An `AckResponse` is the broker’s response to acknowledge a request sent by the client. It contains the `consumer_id` sent in the request. +If a transaction is used, it contains both the Transaction ID and the Request ID that are sent in the request. The client finishes the specific request according to the Request ID. 
If the `error` field is set, it indicates that the request has failed. + +An example of `AckResponse` for a request that uses a transaction: + +```protobuf + +message CommandAckResponse { + "consumer_id" : 1, + "txnid_least_bits" : 0, + "txnid_most_bits" : 1, + "request_id" : 5 +} + +``` + +##### Command CloseConsumer + +**Note**: *This command can be sent by either consumer or broker*. + +This command behaves the same as [`CloseProducer`](#command-closeproducer). + +##### Command RedeliverUnacknowledgedMessages + +A consumer can ask the broker to redeliver some or all of the pending messages +that were pushed to that particular consumer and not yet acknowledged. + +The protobuf object accepts a list of message ids that the consumer wants to +be redelivered. If the list is empty, the broker will redeliver all the +pending messages. + +On redelivery, messages can be sent to the same consumer or, in the case of a +shared subscription, spread across all available consumers. + + +##### Command ReachedEndOfTopic + +This is sent by a broker to a particular consumer, whenever the topic +has been "terminated" and all the messages on the subscription were +acknowledged. + +The client should use this command to notify the application that no more +messages are coming from the consumer. + +##### Command ConsumerStats + +This command is sent by the client to retrieve Subscriber and Consumer level +stats from the broker. +Parameters: + * `request_id` → Id of the request, used to correlate the request + and the response. + * `consumer_id` → Id of an already established consumer. + +##### Command ConsumerStatsResponse + +This is the broker's response to the `ConsumerStats` request sent by the client. +It contains the Subscriber and Consumer level stats of the `consumer_id` sent in the request. +If the `error_code` or the `error_message` field is set, it indicates that the request has failed.
+ +##### Command Unsubscribe + +This command is sent by the client to unsubscribe the `consumer_id` from the associated topic. +Parameters: + * `request_id` → Id of the request. + * `consumer_id` → Id of an already established consumer which needs to unsubscribe. + + +## Service discovery + +### Topic lookup + +Topic lookup needs to be performed each time a client needs to create or +reconnect a producer or a consumer. Lookup is used to discover which particular +broker is serving the topic we are about to use. + +Lookup can be done with a REST call as described in the [admin API](admin-api-topics.md#look-up-topics-owner-broker) docs. + +Since Pulsar-1.16 it is also possible to perform the lookup within the binary +protocol. + +For the sake of example, let's assume we have a service discovery component +running at `pulsar://broker.example.com:6650` + +Individual brokers will be running at `pulsar://broker-1.example.com:6650`, +`pulsar://broker-2.example.com:6650`, ... + +A client can use a connection to the discovery service host to issue a +`LookupTopic` command. The response can either be a broker hostname to +connect to, or a broker hostname to which retry the lookup. + +The `LookupTopic` command has to be used in a connection that has already +gone through the `Connect` / `Connected` initial handshake. + +![Topic lookup](/assets/binary-protocol-topic-lookup.png) + +```protobuf + +message CommandLookupTopic { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1, + "authoritative" : false +} + +``` + +Fields: + * `topic` → Topic name to lookup + * `request_id` → Id of the request that will be passed with its response + * `authoritative` → Initial lookup request should use false. 
When following a + redirect response, the client should pass the same value contained in the + response + +##### LookupTopicResponse + +Example of a response with a successful lookup: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Connect", + "brokerServiceUrl" : "pulsar://broker-1.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-1.example.com:6651", + "authoritative" : true +} + +``` + +Example of a lookup response with redirection: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Redirect", + "brokerServiceUrl" : "pulsar://broker-2.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-2.example.com:6651", + "authoritative" : true +} + +``` + +In this second case, we need to reissue the `LookupTopic` command request +to `broker-2.example.com` and this broker will be able to give a definitive +answer to the lookup request. + +### Partitioned topics discovery + +Partitioned topics metadata discovery is used to find out if a topic is a +"partitioned topic" and how many partitions were set up. + +If the topic is marked as "partitioned", the client is expected to create +multiple producers or consumers, one for each partition, using the `partition-X` +suffix. + +This information only needs to be retrieved the first time a producer or +consumer is created. There is no need to do this after reconnections. + +The discovery of partitioned topics metadata works very similarly to the topic +lookup. The client sends a request to the service discovery address and the +response will contain the actual metadata.
+ +##### Command PartitionedTopicMetadata + +```protobuf + +message CommandPartitionedTopicMetadata { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1 +} + +``` + +Fields: + * `topic` → the topic for which to check the partitions metadata + * `request_id` → Id of the request that will be passed with its response + + +##### Command PartitionedTopicMetadataResponse + +Example of response with metadata: + +```protobuf + +message CommandPartitionedTopicMetadataResponse { + "request_id" : 1, + "response" : "Success", + "partitions" : 32 +} + +``` + +## Protobuf interface + +All Pulsar's Protobuf definitions can be found {@inject: github:here:/pulsar-common/src/main/proto/PulsarApi.proto}. diff --git a/site2/website/versioned_docs/version-2.10.x/functions-cli.md b/site2/website/versioned_docs/version-2.10.x/functions-cli.md new file mode 100644 index 0000000000000..c9fcfa201525f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-cli.md @@ -0,0 +1,198 @@ +--- +id: functions-cli +title: Pulsar Functions command line tool +sidebar_label: "Reference: CLI" +original_id: functions-cli +--- + +The following tables list Pulsar Functions command-line tools. You can learn Pulsar Functions modes, commands, and parameters. + +## localrun + +Run Pulsar Functions locally, rather than deploying it to the Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +broker-service-url | The URL for the Pulsar broker. | | +classname | The class name of a Pulsar Function.| | +client-auth-params | Client authentication parameter. | | +client-auth-plugin | Client authentication plugin using which function-process can connect to broker. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). 
| | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +hostname-verification-enabled | Enable hostname verification. | false +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +instance-id-offset | Start the instanceIds from this offset. | 0 +log-topic | The topic to which the logs a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). 
| | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +tls-allow-insecure | Allow insecure tls connection. | false +tls-trust-cert-path | tls trust cert file path. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +use-tls | Use tls connection. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + + +## create + +Create and deploy a Pulsar Function in cluster mode. 
+ +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. 
| | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. 
| | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## delete + +Delete a Pulsar Function that is running on a Pulsar cluster. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## update + +Update a Pulsar Function that has been deployed to a Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime). | | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. 
| | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. 
| | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +update-auth-data | Whether or not to update the auth data. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## get + +Fetch information about a Pulsar Function. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## restart + +Restart function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (restart all instances if instance-id is not provided. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## stop + +Stops function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (restart all instances if instance-id is not provided. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## start + +Starts a stopped function instance. 
+ +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (start all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | diff --git a/site2/website/versioned_docs/version-2.10.x/functions-debug.md b/site2/website/versioned_docs/version-2.10.x/functions-debug.md new file mode 100644 index 0000000000000..c1f19abda6465 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-debug.md @@ -0,0 +1,538 @@ +--- +id: functions-debug +title: Debug Pulsar Functions +sidebar_label: "How-to: Debug" +original_id: functions-debug +--- + +You can use the following methods to debug Pulsar Functions: + +* [Captured stderr](functions-debug.md#captured-stderr) +* [Use unit test](functions-debug.md#use-unit-test) +* [Debug with localrun mode](functions-debug.md#debug-with-localrun-mode) +* [Use log topic](functions-debug.md#use-log-topic) +* [Use Functions CLI](functions-debug.md#use-functions-cli) + +## Captured stderr + +Function startup information and captured stderr output is written to `logs/functions/<tenant>/<namespace>/<function>/<function>-<instance>.log` + +This is useful for debugging why a function fails to start. + +## Use unit test + +Because a Pulsar Function is a function with inputs and outputs, you can test a Pulsar Function in a similar way as you test any function. + +For example, if you have the following Pulsar Function: + +```java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function<String, String> { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +You can write a simple unit test to test the Pulsar Function. + +:::tip + +Pulsar uses testng for testing.
+ +::: + +```java + +@Test +public void testJavaNativeExclamationFunction() { + JavaNativeExclamationFunction exclamation = new JavaNativeExclamationFunction(); + String output = exclamation.apply("foo"); + Assert.assertEquals(output, "foo!"); +} + +``` + +The following Pulsar Function implements the `org.apache.pulsar.functions.api.Function` interface. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +In this situation, you can write a unit test for this function as well. Remember to mock the `Context` parameter. The following is an example. + +:::tip + +Pulsar uses testng for testing. + +::: + +```java + +@Test +public void testExclamationFunction() { + ExclamationFunction exclamation = new ExclamationFunction(); + String output = exclamation.process("foo", mock(Context.class)); + Assert.assertEquals(output, "foo!"); +} + +``` + +## Debug with localrun mode +When you run a Pulsar Function in localrun mode, it launches an instance of the Function on your local machine as a thread. + +In this mode, a Pulsar Function consumes and produces actual data to a Pulsar cluster, and mirrors how the function actually runs in a Pulsar cluster. + +:::note + +Currently, debugging with localrun mode is only supported by Pulsar Functions written in Java. You need Pulsar version 2.4.0 or later to do the following. Even though localrun is available in versions earlier than Pulsar 2.4.0, you cannot debug with localrun mode programmatically or run Functions as threads. + +::: + +You can launch your function in the following manner. 
+ +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setName(functionName); +functionConfig.setInputs(Collections.singleton(sourceTopic)); +functionConfig.setClassName(ExclamationFunction.class.getName()); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setOutput(sinkTopic); + +LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); +localRunner.start(true); + +``` + +So you can debug functions using an IDE easily. Set breakpoints and manually step through a function to debug with real data. + +The following example illustrates how to programmatically launch a function in localrun mode. + +```java + +public class ExclamationFunction implements Function { + + @Override + public String process(String s, Context context) throws Exception { + return s + "!"; + } + +public static void main(String[] args) throws Exception { + FunctionConfig functionConfig = new FunctionConfig(); + functionConfig.setName("exclamation"); + functionConfig.setInputs(Collections.singleton("input")); + functionConfig.setClassName(ExclamationFunction.class.getName()); + functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); + functionConfig.setOutput("output"); + + LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); + localRunner.start(false); +} + +``` + +To use localrun mode programmatically, add the following dependency. + +```xml + + + org.apache.pulsar + pulsar-functions-local-runner + ${pulsar.version} + + +``` + +For complete code samples, see [here](https://github.com/jerrypeng/pulsar-functions-demos/tree/master/debugging). + +:::note + +Debugging with localrun mode for Pulsar Functions written in other languages will be supported soon. + +::: + +## Use log topic + +In Pulsar Functions, you can generate log information defined in functions to a specified log topic. 
You can configure consumers to consume messages from a specified log topic to check the log information. + +![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) + +**Example** + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +As shown in the example above, you can get the logger via `context.getLogger()` and assign the logger to the `LOG` variable of `slf4j`, so you can define your desired log information in a function using the `LOG` variable. Meanwhile, you need to specify the topic to which the log information is produced. + +**Example** + +```bash + +$ bin/pulsar-admin functions create \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +A message published to the log topic contains several properties that make it easier to interpret: +- `loglevel` -- the level of the log message. +- `fqn` -- the fully qualified name of the function that pushes this log message. +- `instance` -- the ID of the function instance that pushes this log message. + +## Use Functions CLI + +With [Pulsar Functions CLI](reference-pulsar-admin.md#functions), you can debug Pulsar Functions with the following subcommands: + +* `get` +* `status` +* `stats` +* `list` +* `trigger` + +:::tip + +For the complete list of **Pulsar Functions CLI** commands, see [here](reference-pulsar-admin.md#functions). + +::: + +### `get` + +Get information about a Pulsar Function. 
+ +**Usage** + +```bash + +$ pulsar-admin functions get options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +:::tip + +`--fqfn` consists of `--name`, `--namespace` and `--tenant`, so you can specify either `--fqfn` or `--name`, `--namespace` and `--tenant`. + +::: + +**Example** + +You can specify `--fqfn` to get information about a Pulsar Function. + +```bash + +$ ./bin/pulsar-admin functions get public/default/ExclamationFunctio6 + +``` + +Optionally, you can specify `--name`, `--namespace` and `--tenant` to get information about a Pulsar Function. + +```bash + +$ ./bin/pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 + +``` + +As shown below, the `get` command shows input, output, runtime, and other information about the _ExclamationFunctio6_ function. + +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "ExclamationFunctio6", + "className": "org.example.test.ExclamationFunction", + "inputSpecs": { + "persistent://public/default/my-topic-1": { + "isRegexPattern": false + } + }, + "output": "persistent://public/default/test-1", + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "userConfig": {}, + "runtime": "JAVA", + "autoAck": true, + "parallelism": 1 +} + +``` + +### `status` + +Check the current status of a Pulsar Function. + +**Usage** + +```bash + +$ pulsar-admin functions status options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function
    If `--instance-id` is not specified, the command returns the status of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + +``` + +As shown below, the `status` command shows the number of instances, running instances, the instance running under the _ExclamationFunctio6_ function, received messages, successfully processed messages, system exceptions, the average latency and so on. + +```json + +{ + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 1, + "numSuccessfullyProcessed" : 1, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.8385, + "lastInvocationTime" : 1557734137987, + "workerId" : "c-standalone-fw-23ccc88ef29b-8080" + } + } ] +} + +``` + +### `stats` + +Get the current stats of a Pulsar Function. + +**Usage** + +```bash + +$ pulsar-admin functions stats options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function.
    If `--instance-id` is not specified, the command returns the stats of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + +``` + +The output is shown as follows: + +```json + +{ + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "instances" : [ { + "instanceId" : 0, + "metrics" : { + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "userMetrics" : { } + } + } ] +} + +``` + +### `list` + +List all Pulsar Functions running under a specific tenant and namespace. + +**Usage** + +```bash + +$ pulsar-admin functions list options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions list \ + --tenant public \ + --namespace default + +``` + +As shown below, the `list` command returns three functions running under the _public_ tenant and the _default_ namespace. + +```text + +ExclamationFunctio1 +ExclamationFunctio2 +ExclamationFunctio3 + +``` + +### `trigger` + +Trigger a specified Pulsar Function with a supplied value. This command simulates the execution process of a Pulsar Function and verifies it. 
+ +**Usage** + +```bash + +$ pulsar-admin functions trigger options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. +|`--topic`|The topic name that a Pulsar Function consumes from. +|`--trigger-file`|The path to a file that contains the data to trigger a Pulsar Function. +|`--trigger-value`|The value to trigger a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + --topic persistent://public/default/my-topic-1 \ + --trigger-value "hello pulsar functions" + +``` + +As shown below, the `trigger` command returns the following result: + +```text + +This is my function! + +``` + +:::note + +You must specify the [entire topic name](getting-started-pulsar.md#topic-names) when using the `--topic` option. Otherwise, the following error occurs. + +```text + +Function in trigger function has unidentified topic +Reason: Function in trigger function has unidentified topic + +``` + +::: + diff --git a/site2/website/versioned_docs/version-2.10.x/functions-deploy.md b/site2/website/versioned_docs/version-2.10.x/functions-deploy.md new file mode 100644 index 0000000000000..826804db6bbb7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-deploy.md @@ -0,0 +1,262 @@ +--- +id: functions-deploy +title: Deploy Pulsar Functions +sidebar_label: "How-to: Deploy" +original_id: functions-deploy +--- + +## Requirements + +To deploy and manage Pulsar Functions, you need to have a Pulsar cluster running. There are several options for this: + +* You can run a [standalone cluster](getting-started-standalone.md) locally on your own machine. 
+* You can deploy a Pulsar cluster on [Kubernetes](deploy-kubernetes.md), [Amazon Web Services](deploy-aws.md), [bare metal](deploy-bare-metal.md), DC/OS, and more. + +If you run a non-[standalone](reference-terminology.md#standalone) cluster, you need to obtain the service URL for the cluster. How you obtain the service URL depends on how you deploy your Pulsar cluster. + +If you want to deploy and trigger Python user-defined functions, you need to install [the pulsar python client](client-libraries-python.md) on all the machines running [functions workers](functions-worker.md). + +## Command-line interface + +Pulsar Functions are deployed and managed using the [`pulsar-admin functions`](reference-pulsar-admin.md#functions) interface, which contains commands such as [`create`](reference-pulsar-admin.md#functions-create) for deploying functions in [cluster mode](#cluster-mode), [`trigger`](reference-pulsar-admin.md#trigger) for [triggering](#triggering-pulsar-functions) functions, [`list`](reference-pulsar-admin.md#list-2) for listing deployed functions. + +To learn more commands, refer to [`pulsar-admin functions`](reference-pulsar-admin.md#functions). + +### Default arguments + +When managing Pulsar Functions, you need to specify a variety of information about functions, including tenant, namespace, input and output topics, and so on. However, some parameters have default values if you do not specify values for them. The following table lists the default values. + +Parameter | Default +:---------|:------- +Function name | You can specify any value for the class name (except org, library, or similar class names). For example, when you specify the flag `--classname org.example.MyFunction`, the function name is `MyFunction`. +Tenant | Derived from names of the input topics. If the input topics are under the `marketing` tenant, which means the topic names have the form `persistent://marketing/{namespace}/{topicName}`, the tenant is `marketing`. 
+Namespace | Derived from names of the input topics. If the input topics are under the `asia` namespace under the `marketing` tenant, which means the topic names have the form `persistent://marketing/asia/{topicName}`, then the namespace is `asia`. +Output topic | `{input topic}-{function name}-output`. For example, if an input topic name of a function is `incoming`, and the function name is `exclamation`, then the name of the output topic is `incoming-exclamation-output`. +Subscription type | For `at-least-once` and `at-most-once` [processing guarantees](functions-overview.md#processing-guarantees), the [`SHARED`](concepts-messaging.md#shared) mode is applied by default; for `effectively-once` guarantees, the [`FAILOVER`](concepts-messaging.md#failover) mode is applied. +Processing guarantees | [`ATLEAST_ONCE`](functions-overview.md#processing-guarantees) +Pulsar service URL | `pulsar://localhost:6650` + +### Example of default arguments + +Take the `create` command as an example. + +```bash + +$ bin/pulsar-admin functions create \ + --jar my-pulsar-functions.jar \ + --classname org.example.MyFunction \ + --inputs my-function-input-topic1,my-function-input-topic2 + +``` + +The function has default values for the function name (`MyFunction`), tenant (`public`), namespace (`default`), subscription type (`SHARED`), processing guarantees (`ATLEAST_ONCE`), and Pulsar service URL (`pulsar://localhost:6650`). + +## Local run mode + +If you run a Pulsar Function in **local run** mode, it runs on the machine from which you enter the commands (on your laptop, an [AWS EC2](https://aws.amazon.com/ec2/) instance, and so on). The following is a [`localrun`](reference-pulsar-admin.md#localrun) command example. 
+ +```bash + +$ bin/pulsar-admin functions localrun \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +By default, the function connects to a Pulsar cluster running on the same machine, via a local [broker](reference-terminology.md#broker) service URL of `pulsar://localhost:6650`. If you use local run mode to run a function but connect it to a non-local Pulsar cluster, you can specify a different broker URL using the `--broker-service-url` flag. The following is an example. + +```bash + +$ bin/pulsar-admin functions localrun \ + --broker-service-url pulsar://my-cluster-host:6650 \ + # Other function parameters + +``` + +## Cluster mode + +When you run a Pulsar Function in **cluster** mode, the function code is uploaded to a Pulsar broker and runs *alongside the broker* rather than in your [local environment](#local-run-mode). You can run a function in cluster mode using the [`create`](reference-pulsar-admin.md#create-1) command. + +```bash + +$ bin/pulsar-admin functions create \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +### Update functions in cluster mode + +You can use the [`update`](reference-pulsar-admin.md#update-1) command to update a Pulsar Function running in cluster mode. The following command updates the function created in the [cluster mode](#cluster-mode) section. + +```bash + +$ bin/pulsar-admin functions update \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/new-input-topic \ + --output persistent://public/default/new-output-topic + +``` + +### Parallelism + +Pulsar Functions run as processes or threads, which are called **instances**. When you run a Pulsar Function, it runs as a single instance by default. 
With one localrun command, you can only run a single instance of a function. If you want to run multiple instances, you can use localrun command multiple times. + +When you create a function, you can specify the *parallelism* of a function (the number of instances to run). You can set the parallelism factor using the `--parallelism` flag of the [`create`](reference-pulsar-admin.md#functions-create) command. + +```bash + +$ bin/pulsar-admin functions create \ + --parallelism 3 \ + # Other function info + +``` + +You can adjust the parallelism of an already created function using the [`update`](reference-pulsar-admin.md#update-1) interface. + +```bash + +$ bin/pulsar-admin functions update \ + --parallelism 5 \ + # Other function + +``` + +If you specify a function configuration via YAML, use the `parallelism` parameter. The following is a config file example. + +```yaml + +# function-config.yaml +parallelism: 3 +inputs: +- persistent://public/default/input-1 +output: persistent://public/default/output-1 +# other parameters + +``` + +The following is corresponding update command. + +```bash + +$ bin/pulsar-admin functions update \ + --function-config-file function-config.yaml + +``` + +### Function instance resources + +When you run Pulsar Functions in [cluster mode](#cluster-mode), you can specify the resources that are assigned to each function [instance](#parallelism). + +Resource | Specified as | Runtimes +:--------|:----------------|:-------- +CPU | The number of cores | Kubernetes +RAM | The number of bytes | Process, Docker +Disk space | The number of bytes | Docker + +The following function creation command allocates 8 cores, 8 GB of RAM, and 10 GB of disk space to a function. 
+ +```bash + +$ bin/pulsar-admin functions create \ + --jar target/my-functions.jar \ + --classname org.example.functions.MyFunction \ + --cpu 8 \ + --ram 8589934592 \ + --disk 10737418240 + +``` + +> #### Resources are *per instance* +> The resources that you apply to a given Pulsar Function are applied to each instance of the function. For example, if you apply 8 GB of RAM to a function with a parallelism of 5, you are applying 40 GB of RAM for the function in total. Make sure that you take the parallelism (the number of instances) factor into your resource calculations. + +### Use Package management service + +Package management enables version management and simplifies the upgrade and rollback processes for Functions, Sinks, and Sources. When you use the same function, sink and source in different namespaces, you can upload them to a common package management system. + +To use [Package management service](admin-api-packages.md), ensure that the package management service has been enabled in your cluster by setting the following properties in `broker.conf`. + +> Note: Package management service is not enabled by default. + +```yaml + +enablePackagesManagement=true +packagesManagementStorageProvider=org.apache.pulsar.packages.management.storage.bookkeeper.BookKeeperPackagesStorageProvider +packagesReplicas=1 +packagesManagementLedgerRootPath=/ledgers + +``` + +With Package management service enabled, you can upload your function packages by [upload a package](admin-api-packages.md#upload-a-package) to the service and get the [package URL](admin-api-packages.md#package-url). + +When you have a ready to use package URL, you can create the function with package URL by setting `--jar`, `--py`, or `--go` to the package URL with `pulsar-admin functions create`. + +## Trigger Pulsar Functions + +If a Pulsar Function is running in [cluster mode](#cluster-mode), you can **trigger** it at any time using the command line. 
Triggering a function means that you send a message with a specific value to the function and get the function output (if any) via the command line. + +> Triggering a function means invoking it by producing a message on one of its input topics. With the [`pulsar-admin functions trigger`](reference-pulsar-admin.md#trigger) command, you can send messages to functions without using the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library. + +To learn how to trigger a function, you can start with a Python function that returns a simple string based on the input. + +```python + +# myfunc.py +def process(input): + return "This function has been triggered with a value of {0}".format(input) + +``` + +You can deploy the function in [cluster mode](functions-deploy.md#cluster-mode). + +```bash + +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name myfunc \ + --py myfunc.py \ + --classname myfunc \ + --inputs persistent://public/default/in \ + --output persistent://public/default/out + +``` + +Then assign a consumer to listen on the output topic for messages from the `myfunc` function with the [`pulsar-client consume`](reference-cli-tools.md#consume) command. + +```bash + +$ bin/pulsar-client consume persistent://public/default/out \ + --subscription-name my-subscription \ + --num-messages 0 # Listen indefinitely + +``` + +And then you can trigger the function. + +```bash + +$ bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name myfunc \ + --trigger-value "hello world" + +``` + +The consumer listening on the output topic prints output similar to the following in the log. + +``` + +----- got message ----- +This function has been triggered with a value of hello world + +``` + +> #### Topic info is not required +> In the `trigger` command, you only need to specify basic information about the function (tenant, namespace, and name). 
To trigger the function, you do not need to know the function input topics. diff --git a/site2/website/versioned_docs/version-2.10.x/functions-develop.md b/site2/website/versioned_docs/version-2.10.x/functions-develop.md new file mode 100644 index 0000000000000..c32199517cfcc --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-develop.md @@ -0,0 +1,1678 @@ +--- +id: functions-develop +title: Develop Pulsar Functions +sidebar_label: "How-to: Develop" +original_id: functions-develop +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +You learn how to develop Pulsar Functions with different APIs for Java, Python and Go. + +## Available APIs +In Java and Python, you have two options to write Pulsar Functions. In Go, you can use Pulsar Functions SDK for Go. + +Interface | Description | Use cases +:---------|:------------|:--------- +Language-native interface | No Pulsar-specific libraries or special dependencies required (only core libraries from Java/Python). | Functions that do not require access to the function [context](#context). +Pulsar Function SDK for Java/Python/Go | Pulsar-specific libraries that provide a range of functionality not provided by "native" interfaces. | Functions that require access to the function [context](#context). +Extended Pulsar Function SDK for Java | An extension to Pulsar-specific libraries, providing the initialization and close interfaces in Java. | Functions that require initializing and releasing external resources. + +### Language-native interface +The language-native function, which adds an exclamation point to all incoming strings and publishes the resulting string to a topic, has no external dependencies. The following example is language-native function. 
+ +````mdx-code-block + + + +```Java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclamationFunction.java). + + + + +```python + +def process(input): + return "{}!".format(input) + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py). + +:::note + +You can write Pulsar Functions in python2 or python3. However, Pulsar only looks for `python` as the interpreter. +If you're running Pulsar Functions on an Ubuntu system that only supports python3, you might fail to +start the functions. In this case, you can create a symlink. Your system will fail if +you subsequently install any other package that depends on Python 2.x. A solution is under development in [Issue 5518](https://github.com/apache/pulsar/issues/5518). + +```bash + +sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 + +``` + +::: + + + + +```` + +### Pulsar Function SDK for Java/Python/Go +The following example uses Pulsar Functions SDK. +````mdx-code-block + + + +```Java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java). 
+ + + + +```python + +from pulsar import Function + +class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py). + + + + +```Go + +package main + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func HandleRequest(ctx context.Context, in []byte) error{ + fmt.Println(string(in) + "!") + return nil +} + +func main() { + pf.Start(HandleRequest) +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/inputFunc/inputFunc.go#L20-L36). + + + + +```` + +### Extended Pulsar Function SDK for Java +This extended Pulsar Function SDK provides two additional interfaces to initialize and release external resources. +- By using the `initialize` interface, you can initialize external resources which only need one-time initialization when the function instance starts. +- By using the `close` interface, you can close the referenced external resources when the function instance closes. + +:::note + +The extended Pulsar Function SDK for Java is available in Pulsar 2.10.0 and later versions. +Before using it, you need to set up Pulsar Function worker 2.10.0 or later versions. + +::: + +The following example uses the extended interface of Pulsar Function SDK for Java to initialize RedisClient when the function instance starts and release it when the function instance closes. 
+ +````mdx-code-block + + + +```Java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import io.lettuce.core.RedisClient; + +public class InitializableFunction implements Function { + private RedisClient redisClient; + + private void initRedisClient(Map connectInfo) { + redisClient = RedisClient.create(connectInfo.get("redisURI")); + } + + @Override + public void initialize(Context context) { + Map connectInfo = context.getUserConfigMap(); + redisClient = initRedisClient(connectInfo); + } + + @Override + public String process(String input, Context context) { + String value = client.get(key); + return String.format("%s-%s", input, value); + } + + @Override + public void close() { + redisClient.close(); + } +} + +``` + + + + +```` + +## Schema registry +Pulsar has a built-in schema registry and is bundled with popular schema types, such as Avro, JSON and Protobuf. Pulsar Functions can leverage the existing schema information from input topics and derive the input type. The schema registry applies for output topic as well. + +## SerDe +SerDe stands for **Ser**ialization and **De**serialization. Pulsar Functions uses SerDe when publishing data to and consuming data from Pulsar topics. How SerDe works by default depends on the language you use for a particular function. + +````mdx-code-block + + + +When you write Pulsar Functions in Java, the following basic Java types are built in and supported by default: `String`, `Double`, `Integer`, `Float`, `Long`, `Short`, and `Byte`. + +To customize Java types, you need to implement the following interface. + +```java + +public interface SerDe { + T deserialize(byte[] input); + byte[] serialize(T input); +} + +``` + +SerDe works in the following ways in Java Functions. +- If the input and output topics have schema, Pulsar Functions use schema for SerDe. 
+- If the input or output topics do not exist, Pulsar Functions adopt the following rules to determine SerDe: + - If the schema type is specified, Pulsar Functions use the specified schema type. + - If SerDe is specified, Pulsar Functions use the specified SerDe, and the schema type for input and output topics is `Byte`. + - If neither the schema type nor SerDe is specified, Pulsar Functions use the built-in SerDe. For non-primitive schema types, the built-in SerDe serializes and deserializes objects in the `JSON` format. + + + + +In Python, the default SerDe is identity, meaning that the type is serialized as whatever type the producer function returns. + +You can specify the SerDe when [creating](functions-deploy.md#cluster-mode) or [running](functions-deploy.md#local-run-mode) functions. + +```bash + +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name my_function \ + --py my_function.py \ + --classname my_function.MyFunction \ + --custom-serde-inputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \ + --output-serde-classname Serde3 \ + --output output-topic-1 + +``` + +This case contains two input topics: `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the map must be specified as a JSON string). The output topic, `output-topic-1`, uses the `Serde3` class for SerDe. At the moment, all Pulsar Functions logic, including processing functions and SerDe classes, must be contained within a single Python file. + +When using Pulsar Functions for Python, you have three SerDe options: + +1. You can use the [`IdentitySerde`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. The `IdentitySerDe` is the **default**. Creating or running a function without explicitly specifying SerDe means that this option is used. +2. 
You can use the [`PickleSerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe. +3. You can create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods: [`serialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes, and [`deserialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type. + +The table below shows when you should use each SerDe. + +SerDe option | When to use +:------------|:----------- +`IdentitySerde` | When you work with simple types like strings, Booleans, integers. +`PickleSerDe` | When you work with complex, application-specific types and are comfortable with the "best effort" approach of `pickle`. +Custom SerDe | When you require explicit control over SerDe, potentially for performance or data compatibility purposes. + + + + +Currently, the feature is not available in Go. + + + + +```` + +### Example +Imagine that you're writing Pulsar Functions that are processing tweet objects, you can refer to the following example of `Tweet` class. + +````mdx-code-block + + + +```java + +public class Tweet { + private String username; + private String tweetContent; + + public Tweet(String username, String tweetContent) { + this.username = username; + this.tweetContent = tweetContent; + } + + // Standard setters and getters +} + +``` + +To pass `Tweet` objects directly between Pulsar Functions, you need to provide a custom SerDe class. In the example below, `Tweet` objects are basically strings in which the username and tweet content are separated by a `|`. 
+ +```java + +package com.example.serde; + +import org.apache.pulsar.functions.api.SerDe; + +import java.util.regex.Pattern; + +public class TweetSerde implements SerDe { + public Tweet deserialize(byte[] input) { + String s = new String(input); + String[] fields = s.split(Pattern.quote("|")); + return new Tweet(fields[0], fields[1]); + } + + public byte[] serialize(Tweet input) { + return String.format("%s|%s", input.getUsername(), input.getTweetContent()).getBytes(); + } +} + +``` + +To apply this customized SerDe to a particular Pulsar Function, you need to: + +* Package the `Tweet` and `TweetSerde` classes into a JAR. +* Specify the path to the JAR and the SerDe class name when deploying the function. + +The following is an example of the [`create`](reference-pulsar-admin.md#create-1) operation. + +```bash + +$ bin/pulsar-admin functions create \ + --jar /path/to/your.jar \ + --output-serde-classname com.example.serde.TweetSerde \ + # Other function attributes + +``` + +> #### Custom SerDe classes must be packaged with your function JARs +> Pulsar does not store your custom SerDe classes separately from your Pulsar Functions. So you need to include your SerDe classes in your function JARs. If not, Pulsar returns an error. + + + + +```python + +class Tweet(object): + def __init__(self, username, tweet_content): + self.username = username + self.tweet_content = tweet_content + +``` + +In order to use this class in Pulsar Functions, you have two options: + +1. You can specify `PickleSerDe`, which applies the [`pickle`](https://docs.python.org/3/library/pickle.html) library SerDe. +2. You can create your own SerDe class. The following is an example. 
+ + ```python + + from pulsar import SerDe + + class TweetSerDe(SerDe): + + def serialize(self, input): + return bytes("{0}|{1}".format(input.username, input.tweet_content)) + + def deserialize(self, input_bytes): + tweet_components = str(input_bytes).split('|') + return Tweet(tweet_components[0], tweet_components[1]) + + ``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/custom_object_function.py). + + + + +```` + +In both languages, however, you can write custom SerDe logic for more complex, application-specific types. + +## Context +Java, Python and Go SDKs provide access to a **context object** that can be used by a function. This context object provides a wide variety of information and functionality to the function, including: + +* The name and ID of a Pulsar Function. +* The message ID of each message. Each Pulsar message is automatically assigned an ID. +* The key, event time, properties and partition key of each message. +* The name of the topic to which the message is sent. +* The names of all input topics as well as the output topic associated with the function. +* The name of the class used for [SerDe](#serde). +* The [tenant](reference-terminology.md#tenant) and namespace associated with the function. +* The ID of the Pulsar Functions instance running the function. +* The version of the function. +* The [logger object](functions-develop.md#logger) used by the function, which can be used to create function log messages. +* Access to arbitrary [user configuration](#user-config) values supplied via the CLI. +* An interface for recording [metrics](#metrics). +* An interface for storing and retrieving state in [state storage](#state-storage). +* A function to publish new messages onto arbitrary topics. +* A function to ack the message being processed (if auto-ack is disabled). +* (Java) Access to the Pulsar admin client. 
+ +````mdx-code-block + + + +The [Context](https://github.com/apache/pulsar/blob/master/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Context.java) interface provides a number of methods that you can use to access the function [context](#context). The various method signatures for the `Context` interface are listed as follows. + +```java + +public interface Context { + Record getCurrentRecord(); + Collection getInputTopics(); + String getOutputTopic(); + String getOutputSchemaType(); + String getTenant(); + String getNamespace(); + String getFunctionName(); + String getFunctionId(); + String getInstanceId(); + String getFunctionVersion(); + Logger getLogger(); + void incrCounter(String key, long amount); + void incrCounterAsync(String key, long amount); + long getCounter(String key); + long getCounterAsync(String key); + void putState(String key, ByteBuffer value); + void putStateAsync(String key, ByteBuffer value); + void deleteState(String key); + ByteBuffer getState(String key); + ByteBuffer getStateAsync(String key); + Map getUserConfigMap(); + Optional getUserConfigValue(String key); + Object getUserConfigValueOrDefault(String key, Object defaultValue); + void recordMetric(String metricName, double value); + CompletableFuture publish(String topicName, O object, String schemaOrSerdeClassName); + CompletableFuture publish(String topicName, O object); + TypedMessageBuilder newOutputMessage(String topicName, Schema schema) throws PulsarClientException; + ConsumerBuilder newConsumerBuilder(Schema schema) throws PulsarClientException; + PulsarAdmin getPulsarAdmin(); + PulsarAdmin getPulsarAdmin(String clusterName); +} + +``` + +The following example uses several methods available via the `Context` object. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.stream.Collectors; + +public class ContextFunction implements Function { + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String inputTopics = context.getInputTopics().stream().collect(Collectors.joining(", ")); + String functionName = context.getFunctionName(); + + String logMessage = String.format("A message with a value of \"%s\" has arrived on one of the following topics: %s\n", + input, + inputTopics); + + LOG.info(logMessage); + + String metricName = String.format("function-%s-messages-received", functionName); + context.recordMetric(metricName, 1); + + return null; + } +} + +``` + + + + +``` + +class ContextImpl(pulsar.Context): + def get_message_id(self): + ... + def get_message_key(self): + ... + def get_message_eventtime(self): + ... + def get_message_properties(self): + ... + def get_current_message_topic_name(self): + ... + def get_partition_key(self): + ... + def get_function_name(self): + ... + def get_function_tenant(self): + ... + def get_function_namespace(self): + ... + def get_function_id(self): + ... + def get_instance_id(self): + ... + def get_function_version(self): + ... + def get_logger(self): + ... + def get_user_config_value(self, key): + ... + def get_user_config_map(self): + ... + def record_metric(self, metric_name, metric_value): + ... + def get_input_topics(self): + ... + def get_output_topic(self): + ... + def get_output_serde_class_name(self): + ... + def publish(self, topic_name, message, serde_class_name="serde.IdentitySerDe", + properties=None, compression_type=None, callback=None, message_conf=None): + ... + def ack(self, msgid, topic): + ... + def get_and_reset_metrics(self): + ... + def reset_metrics(self): + ... + def get_metrics(self): + ... + def incr_counter(self, key, amount): + ... + def get_counter(self, key): + ... 
+ def del_counter(self, key): + ... + def put_state(self, key, value): + ... + def get_state(self, key): + ... + +``` + + + + +``` + +func (c *FunctionContext) GetInstanceID() int { + return c.instanceConf.instanceID +} + +func (c *FunctionContext) GetInputTopics() []string { + return c.inputTopics +} + +func (c *FunctionContext) GetOutputTopic() string { + return c.instanceConf.funcDetails.GetSink().Topic +} + +func (c *FunctionContext) GetFuncTenant() string { + return c.instanceConf.funcDetails.Tenant +} + +func (c *FunctionContext) GetFuncName() string { + return c.instanceConf.funcDetails.Name +} + +func (c *FunctionContext) GetFuncNamespace() string { + return c.instanceConf.funcDetails.Namespace +} + +func (c *FunctionContext) GetFuncID() string { + return c.instanceConf.funcID +} + +func (c *FunctionContext) GetFuncVersion() string { + return c.instanceConf.funcVersion +} + +func (c *FunctionContext) GetUserConfValue(key string) interface{} { + return c.userConfigs[key] +} + +func (c *FunctionContext) GetUserConfMap() map[string]interface{} { + return c.userConfigs +} + +func (c *FunctionContext) SetCurrentRecord(record pulsar.Message) { + c.record = record +} + +func (c *FunctionContext) GetCurrentRecord() pulsar.Message { + return c.record +} + +func (c *FunctionContext) NewOutputMessage(topic string) pulsar.Producer { + return c.outputMessage(topic) +} + +``` + +The following example uses several methods available via the `Context` object. + +``` + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func contextFunc(ctx context.Context) { + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/contextFunc/contextFunc.go#L29-L34). 
+ + + + +```` + +### User config +When you run or update Pulsar Functions created using SDK, you can pass arbitrary key/values to them with the command line with the `--user-config` flag. Key/values must be specified as JSON. The following function creation command passes a user configured key/value to a function. + +```bash + +$ bin/pulsar-admin functions create \ + --name word-filter \ + # Other function configs + --user-config '{"forbidden-word":"rosebud"}' + +``` + +````mdx-code-block + + + +The Java SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash + +$ bin/pulsar-admin functions create \ + # Other function configs + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Java function: + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + Optional wotd = context.getUserConfigValue("word-of-the-day"); + if (wotd.isPresent()) { + LOG.info("The word of the day is {}", wotd); + } else { + LOG.warn("No word of the day provided"); + } + return null; + } +} + +``` + +The `UserConfigFunction` function will log the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The `word-of-the-day` user config will be changed only when the function is updated with a new config value via the command line. 
+ +You can also access the entire user config map or set a default value in case no value is present: + +```java + +// Get the whole config map +Map allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); + +``` + +> For all key/value pairs passed to Java functions, both the key *and* the value are `String`. To set the value to be a different type, you need to deserialize from the `String` type. + + + + +In a Python function, you can access the configuration value like this. + +```python + +from pulsar import Function + +class WordFilter(Function): + def process(self, input, context): + forbidden_word = context.get_user_config_value("forbidden-word") + + # Don't publish the message if it contains the user-supplied + # forbidden word + if forbidden_word in input: + pass + # Otherwise publish the message + else: + return input + +``` + +The Python SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash + +$ bin/pulsar-admin functions create \ + # Other function configs \ + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Python function: + +```python + +from pulsar import Function + +class UserConfigFunction(Function): + def process(self, input, context): + logger = context.get_logger() + wotd = context.get_user_config_value('word-of-the-day') + if wotd is None: + logger.warn('No word of the day provided') + else: + logger.info("The word of the day is {0}".format(wotd)) + +``` + + + + +The Go SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. 
+ +```bash + +$ bin/pulsar-admin functions create \ + --go path/to/go/binary \ + --user-config '{"word-of-the-day":"lackadaisical"}' + +``` + +To access that value in a Go function: + +```go + +func contextFunc(ctx context.Context) { + fc, ok := pf.FromContext(ctx) + if !ok { + logutil.Fatal("Function context is not defined") + } + + wotd := fc.GetUserConfValue("word-of-the-day") + + if wotd == nil { + logutil.Warn("The word of the day is empty") + } else { + logutil.Infof("The word of the day is %s", wotd.(string)) + } +} + +``` + + + + +```` + +### Logger + +````mdx-code-block + + + +Pulsar Functions that use the Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/slf4j/Logger.html) object that can be used to produce logs at the chosen log level. The following example logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +If you want your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example. 
+ +```bash + +$ bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +All logs produced by `LoggingFunction` above can be accessed via the `persistent://public/default/logging-function-logs` topic. + +#### Customize Function log level +Additionally, you can use the XML file, `functions_log4j2.xml`, to customize the function log level. +To customize the function log level, create or update `functions_log4j2.xml` in your Pulsar conf directory (for example, `/etc/pulsar/` on bare-metal, or `/pulsar/conf` on Kubernetes) to contain contents such as: + +```xml + + + pulsar-functions-instance + 30 + + + pulsar.log.appender + RollingFile + + + pulsar.log.level + debug + + + bk.log.level + debug + + + + + Console + SYSTEM_OUT + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + RollingFile + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.log + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}-%d{MM-dd-yyyy}-%i.log.gz + true + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + 1 + true + + + 1 GB + + + 0 0 0 * * ? + + + + + ${sys:pulsar.function.log.dir} + 2 + + */${sys:pulsar.function.log.file}*log.gz + + + 30d + + + + + + BkRollingFile + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk-%d{MM-dd-yyyy}-%i.log.gz + true + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + 1 + true + + + 1 GB + + + 0 0 0 * * ? 
+ + + + + ${sys:pulsar.function.log.dir} + 2 + + */${sys:pulsar.function.log.file}.bk*log.gz + + + 30d + + + + + + + + org.apache.pulsar.functions.runtime.shaded.org.apache.bookkeeper + ${sys:bk.log.level} + false + + BkRollingFile + + + + ${sys:pulsar.log.level} + + ${sys:pulsar.log.appender} + ${sys:pulsar.log.level} + + + + + +``` + +The properties set like: + +```xml + + + pulsar.log.level + debug + + +``` + +propagate to places where they are referenced, such as: + +```xml + + + ${sys:pulsar.log.level} + + ${sys:pulsar.log.appender} + ${sys:pulsar.log.level} + + + +``` + +In the above example, debug level logging would be applied to ALL function logs. +This may be more verbose than you desire. To be more selective, you can apply different log levels to different classes or modules. For example: + +```xml + + + com.example.module + info + false + + ${sys:pulsar.log.appender} + + + +``` + +You can be more specific as well, such as applying a more verbose log level to a class in the module, such as: + +```xml + + + com.example.module.className + debug + false + + Console + + + +``` + +Each `` entry allows you to output the log to a target specified in the definition of the Appender. + +Additivity pertains to whether log messages will be duplicated if multiple Logger entries overlap. +To disable additivity, specify + +```xml + +false + +``` + +as shown in examples above. Disabling additivity prevents duplication of log messages when one or more `` entries contain classes or modules that overlap. + +The `` is defined in the `` section, such as: + +```xml + + + Console + SYSTEM_OUT + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + +``` + + + + +Pulsar Functions that use the Python SDK have access to a logging object that can be used to produce logs at the chosen log level. The following example function that logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. 
+ +```python + +from pulsar import Function + +class LoggingFunction(Function): + def process(self, input, context): + logger = context.get_logger() + msg_id = context.get_message_id() + if 'danger' in input: + logger.warn("A warning was received in message {0}".format(context.get_message_id())) + else: + logger.info("Message {0} received\nContent: {1}".format(msg_id, input)) + +``` + +If you want your function to produce logs on a Pulsar topic, you need to specify a **log topic** when creating or running the function. The following is an example. + +```bash + +$ bin/pulsar-admin functions create \ + --py logging_function.py \ + --classname logging_function.LoggingFunction \ + --log-topic logging-function-logs \ + # Other function configs + +``` + +All logs produced by `LoggingFunction` above can be accessed via the `logging-function-logs` topic. +Additionally, you can specify the function log level through the broker XML file as described in [Customize Function log level](#customize-function-log-level). + + + + +The following Go Function example shows different log levels based on the function input. + +``` + +import ( + "context" + + "github.com/apache/pulsar/pulsar-function-go/pf" + + log "github.com/apache/pulsar/pulsar-function-go/logutil" +) + +func loggerFunc(ctx context.Context, input []byte) { + if len(input) <= 100 { + log.Infof("This input has a length of: %d", len(input)) + } else { + log.Warnf("This input is getting too long! It has {%d} characters", len(input)) + } +} + +func main() { + pf.Start(loggerFunc) +} + +``` + +When you use `logTopic` related functionalities in Go Function, import `github.com/apache/pulsar/pulsar-function-go/logutil`, and you do not have to use the `getLogger()` context object. 
+ +Additionally, you can specify the function log level through the broker XML file, as described here: [Customize Function log level](#customize-function-log-level). + + + + +```` + +### Pulsar admin + +Pulsar Functions that use the Java SDK have access to the Pulsar admin client, which allows them to make admin API calls to the current Pulsar cluster or to external clusters (if `external-pulsars` is provided). + +````mdx-code-block + + + +Below is an example of how to use the Pulsar admin client exposed from the Function `context`. + +``` + +import org.apache.pulsar.client.admin.PulsarAdmin; +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +/** + * In this particular example, for every input message, + * the function resets the cursor of the current function's subscription to a + * specified timestamp. + */ +public class CursorManagementFunction implements Function { + + @Override + public String process(String input, Context context) throws Exception { + PulsarAdmin adminClient = context.getPulsarAdmin(); + if (adminClient != null) { + String topic = context.getCurrentRecord().getTopicName().isPresent() ? + context.getCurrentRecord().getTopicName().get() : null; + String subName = context.getTenant() + "/" + context.getNamespace() + "/" + context.getFunctionName(); + if (topic != null) { + // 1578188166 below is a random-pick timestamp + adminClient.topics().resetCursor(topic, subName, 1578188166); + return "reset cursor successfully"; + } + } + return null; + } +} + +``` + +If you want your function to get access to the Pulsar admin client, you need to enable this feature by setting `exposeAdminClientEnabled=true` in the `functions_worker.yml` file. You can test whether this feature is enabled or not using the command `pulsar-admin functions localrun` with the flag `--web-service-url`. 
+ +``` + +$ bin/pulsar-admin functions localrun \ + --jar my-functions.jar \ + --classname my.package.CursorManagementFunction \ + --web-service-url http://pulsar-web-service:8080 \ + # Other function configs + +``` + + + + +```` + +## Metrics + +Pulsar Functions allows you to deploy and manage processing functions that consume messages from and publish messages to Pulsar topics easily. It is important to ensure that the running functions are healthy at any time. Pulsar Functions can publish arbitrary metrics to the metrics interface which can be queried. + +:::note + +If a Pulsar Function uses the language-native interface for Java or Python, that function is not able to publish metrics and stats to Pulsar. + +::: + +You can monitor Pulsar Functions that have been deployed with the following methods: + +- Check the metrics provided by Pulsar. + + Pulsar Functions expose the metrics that can be collected and used for monitoring the health of **Java, Python, and Go** functions. You can check the metrics by following the [monitoring](deploy-monitoring.md) guide. + + For the complete list of the function metrics, see [here](reference-metrics.md#pulsar-functions). + +- Set and check your customized metrics. + + In addition to the metrics provided by Pulsar, Pulsar allows you to customize metrics for **Java and Python** functions. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. + +Here are examples of how to customize metrics for Java and Python functions. + +````mdx-code-block + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class MetricRecorderFunction implements Function { + @Override + public void apply(Integer input, Context context) { + // Records the metric 1 every time a message arrives + context.recordMetric("hit-count", 1); + + // Records the metric only if the arriving number equals 11 + if (input == 11) { + context.recordMetric("elevens-count", 1); + } + + return null; + } +} + +``` + + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. The following is an example. + +```python + +from pulsar import Function + +class MetricRecorderFunction(Function): + def process(self, input, context): + context.record_metric('hit-count', 1) + + if input == 11: + context.record_metric('elevens-count', 1) + +``` + + + + +The Go SDK [`Context`](#context) object enables you to record metrics on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message: + +```go + +func metricRecorderFunction(ctx context.Context, in []byte) error { + inputstr := string(in) + fctx, ok := pf.FromContext(ctx) + if !ok { + return errors.New("get Go Functions Context error") + } + fctx.RecordMetric("hit-count", 1) + if inputstr == "eleven" { + fctx.RecordMetric("elevens-count", 1) + } + return nil +} + +``` + + + + +```` + +## Security + +If you want to enable security on Pulsar Functions, first you should enable security on [Functions Workers](functions-worker.md). For more details, refer to [Security settings](functions-worker.md#security-settings). 
+ +Pulsar Functions can support the following providers: + +- ClearTextSecretsProvider +- EnvironmentBasedSecretsProvider + +> Pulsar Function supports ClearTextSecretsProvider by default. + +At the same time, Pulsar Functions provides two interfaces, **SecretsProvider** and **SecretsProviderConfigurator**, allowing users to customize secret provider. + +````mdx-code-block + + + +You can get secret provider using the [`Context`](#context) object. The following is an example: + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class GetSecretProviderFunction implements Function { + + @Override + public Void process(String input, Context context) throws Exception { + Logger LOG = context.getLogger(); + String secretProvider = context.getSecret(input); + + if (!secretProvider.isEmpty()) { + LOG.info("The secret provider is {}", secretProvider); + } else { + LOG.warn("No secret provider"); + } + + return null; + } +} + +``` + + + + +You can get secret provider using the [`Context`](#context) object. The following is an example: + +```python + +from pulsar import Function + +class GetSecretProviderFunction(Function): + def process(self, input, context): + logger = context.get_logger() + secret_provider = context.get_secret(input) + if secret_provider is None: + logger.warn('No secret provider') + else: + logger.info("The secret provider is {0}".format(secret_provider)) + +``` + + + + +Currently, the feature is not available in Go. + + + + +```` + +## State storage +Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Pulsar installation, including the local standalone installation, includes deployment of BookKeeper bookies. 
+ +Since the Pulsar 2.1.0 release, Pulsar has integrated with the Apache BookKeeper [table service](https://docs.google.com/document/d/155xAwWv5IdOitHh1NVMEwCMGgB28M3FyMiQSxEpjE-Y/edit#heading=h.56rbh52koe3f) to store the `State` for functions. For example, a `WordCount` function can store its `counters` state into the BookKeeper table service via the Pulsar Functions State API. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data - counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function, and shared between instances of that function. + +You can access states within Pulsar Java Functions using the `putState`, `putStateAsync`, `getState`, `getStateAsync`, `incrCounter`, `incrCounterAsync`, `getCounter`, `getCounterAsync` and `deleteState` calls on the context object. You can access states within Pulsar Python Functions using the `put_state`, `get_state`, `incr_counter`, `get_counter` and `del_counter` calls on the context object. You can also manage states using the [querystate](#query-state) and [putstate](#putstate) options to `pulsar-admin functions`. + +:::note + +State storage is not available in Go. + +::: + +### API + +````mdx-code-block + + + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](functions-develop.md#context) object when you are using Java SDK functions. + +#### incrCounter + +```java + + /** + * Increment the builtin distributed counter referred by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); + +``` + +The application can use `incrCounter` to change the counter of a given `key` by the given `amount`. 
+ +#### incrCounterAsync + +```java + + /** + * Increment the builtin distributed counter referred by key + * but dont wait for the completion of the increment operation + * + * @param key The name of the key + * @param amount The amount to be incremented + */ + CompletableFuture incrCounterAsync(String key, long amount); + +``` + +The application can use `incrCounterAsync` to asynchronously change the counter of a given `key` by the given `amount`. + +#### getCounter + +```java + + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); + +``` + +The application can use `getCounter` to retrieve the counter of a given `key` mutated by `incrCounter`. + +Except the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### getCounterAsync + +```java + + /** + * Retrieve the counter value for the key, but don't wait + * for the operation to be completed + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + CompletableFuture getCounterAsync(String key); + +``` + +The application can use `getCounterAsync` to asynchronously retrieve the counter of a given `key` mutated by `incrCounterAsync`. + +#### putState + +```java + + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); + +``` + +#### putStateAsync + +```java + + /** + * Update the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @param value state value of the key + */ + CompletableFuture putStateAsync(String key, ByteBuffer value); + +``` + +The application can use `putStateAsync` to asynchronously update the state of a given `key`. + +#### getState + +```java + + /** + * Retrieve the state value for the key. 
+ * + * @param key name of the key + * @return the state value for the key. + */ + ByteBuffer getState(String key); + +``` + +#### getStateAsync + +```java + + /** + * Retrieve the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @return the state value for the key. + */ + CompletableFuture getStateAsync(String key); + +``` + +The application can use `getStateAsync` to asynchronously retrieve the state of a given `key`. + +#### deleteState + +```java + + /** + * Delete the state value for the key. + * + * @param key name of the key + */ + void deleteState(String key); + +``` + +Counters and binary values share the same keyspace, so this deletes either type. + + + + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](#context) object when you are using Python SDK functions. + +#### incr_counter + +```python + + def incr_counter(self, key, amount): + """incr the counter of a given key in the managed state""" + +``` + +The application can use `incr_counter` to change the counter of a given `key` by the given `amount`. +If the `key` does not exist, a new key is created. + +#### get_counter + +```python + + def get_counter(self, key): + """get the counter of a given key in the managed state""" + +``` + +The application can use `get_counter` to retrieve the counter of a given `key` mutated by `incr_counter`. + +In addition to the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### put_state + +```python + + def put_state(self, key, value): + """update the value of a given key in the managed state""" + +``` + +The key is a string, and the value is arbitrary binary data. 
+ +#### get_state + +```python + + def get_state(self, key): + """get the value of a given key in the managed state""" + +``` + +#### del_counter + +```python + + def del_counter(self, key): + """delete the counter of a given key in the managed state""" + +``` + +Counters and binary values share the same keyspace, so this deletes either type. + + + + +```` + +### Query State + +A Pulsar Function can use the [State API](#api) for storing state into Pulsar's state storage +and retrieving state back from Pulsar's state storage. Additionally, Pulsar provides +CLI commands for querying its state. + +```shell + +$ bin/pulsar-admin functions querystate \ + --tenant <tenant> \ + --namespace <namespace> \ + --name <function-name> \ + --state-storage-url <bookkeeper-service-url> \ + --key <state-key> \ + [--watch] + +``` + +If `--watch` is specified, the CLI will watch the value of the provided `state-key`. + +### Example + +````mdx-code-block + + + +{@inject: github:WordCountFunction:/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/WordCountFunction.java} is a very good example +demonstrating how an application can easily store `state` in Pulsar Functions. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function { + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split("\\.")).forEach(word -> context.incrCounter(word, 1)); + return null; + } +} + +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received `String` into multiple words using regex `\\.`. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incrCounter(key, amount)`). 
+ + + + +```python + +from pulsar import Function + +class WordCount(Function): + def process(self, item, context): + for word in item.split(): + context.incr_counter(word, 1) + +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received string into multiple words on space. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incr_counter(key, amount)`). + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/functions-metrics.md b/site2/website/versioned_docs/version-2.10.x/functions-metrics.md new file mode 100644 index 0000000000000..8add669316092 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-metrics.md @@ -0,0 +1,7 @@ +--- +id: functions-metrics +title: Metrics for Pulsar Functions +sidebar_label: "Metrics" +original_id: functions-metrics +--- + diff --git a/site2/website/versioned_docs/version-2.10.x/functions-overview.md b/site2/website/versioned_docs/version-2.10.x/functions-overview.md new file mode 100644 index 0000000000000..816d301e0fd0e --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-overview.md @@ -0,0 +1,209 @@ +--- +id: functions-overview +title: Pulsar Functions overview +sidebar_label: "Overview" +original_id: functions-overview +--- + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics, +* apply a user-supplied processing logic to each message, +* publish the results of the computation to another topic. + + +## Goals +With Pulsar Functions, you can create complex processing logic without deploying a separate neighboring system (such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://heron.incubator.apache.org/), [Apache Flink](https://flink.apache.org/)). Pulsar Functions are computing infrastructure of Pulsar messaging system. 
The core goal is tied to a series of other goals: + +* Developer productivity (language-native vs Pulsar Functions SDK functions) +* Easy troubleshooting +* Operational simplicity (no need for an external processing system) + +## Inspirations +Pulsar Functions are inspired by (and take cues from) several systems and paradigms: + +* Stream processing engines such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), and [Apache Flink](https://flink.apache.org) +* "Serverless" and "Function as a Service" (FaaS) cloud platforms like [Amazon Web Services Lambda](https://aws.amazon.com/lambda/), [Google Cloud Functions](https://cloud.google.com/functions/), and [Azure Cloud Functions](https://azure.microsoft.com/en-us/services/functions/) + +Pulsar Functions can be described as + +* [Lambda](https://aws.amazon.com/lambda/)-style functions that are +* specifically designed to use Pulsar as a message bus. + +## Programming model +Pulsar Functions provide a wide range of functionality, and the core programming model is simple. Functions receive messages from one or more **input [topics](reference-terminology.md#topic)**. Each time a message is received, the function will complete the following tasks. + + * Apply some processing logic to the input and write output to: + * An **output topic** in Pulsar + * [Apache BookKeeper](functions-develop.md#state-storage) + * Write logs to a **log topic** (potentially for debugging purposes) + * Increment a [counter](#word-count-example) + +![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) + +You can use Pulsar Functions to set up the following processing chain: + +* A Python function listens for the `raw-sentences` topic and "sanitizes" incoming strings (removing extraneous whitespace and converting all characters to lowercase) and then publishes the results to a `sanitized-sentences` topic. 
+* A Java function listens for the `sanitized-sentences` topic, counts the number of times each word appears within a specified time window, and publishes the results to a `results` topic +* Finally, a Python function listens for the `results` topic and writes the results to a MySQL table. + + +### Word count example + +If you implement the classic word count example using Pulsar Functions, it looks something like this: + +![Pulsar Functions word count example](/assets/pulsar-functions-word-count.png) + +To write the function in Java with [Pulsar Functions SDK for Java](functions-develop.md#available-apis), you can write the function as follows. + +```java + +package org.example.functions; + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } +} + +``` + +Bundle and build the JAR file to be deployed, and then deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash + +$ bin/pulsar-admin functions create \ + --jar target/my-jar-with-dependencies.jar \ + --classname org.example.functions.WordCountFunction \ + --tenant public \ + --namespace default \ + --name word-count \ + --inputs persistent://public/default/sentences \ + --output persistent://public/default/count + +``` + +### Content-based routing example + +Pulsar Functions are used in many cases. The following is a sophisticated example that involves content-based routing. 
+ +For example, a function takes items (strings) as input and publishes them to either a `fruits` or `vegetables` topic, depending on the item. Or, if an item is neither fruit nor vegetable, a warning is logged to a [log topic](functions-develop.md#logger). The following is a visual representation. + +![Pulsar Functions routing example](/assets/pulsar-functions-routing-example.png) + +If you implement this routing functionality in Python, it looks something like this: + +```python + +from pulsar import Function + +class RoutingFunction(Function): + def __init__(self): + self.fruits_topic = "persistent://public/default/fruits" + self.vegetables_topic = "persistent://public/default/vegetables" + + @staticmethod + def is_fruit(item): + return item in [b"apple", b"orange", b"pear", b"other fruits..."] + + @staticmethod + def is_vegetable(item): + return item in [b"carrot", b"lettuce", b"radish", b"other vegetables..."] + + def process(self, item, context): + if self.is_fruit(item): + context.publish(self.fruits_topic, item) + elif self.is_vegetable(item): + context.publish(self.vegetables_topic, item) + else: + warning = "The item {0} is neither a fruit nor a vegetable".format(item) + context.get_logger().warn(warning) + +``` + +If this code is stored in `~/router.py`, then you can deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash + +$ bin/pulsar-admin functions create \ + --py ~/router.py \ + --classname router.RoutingFunction \ + --tenant public \ + --namespace default \ + --name route-fruit-veg \ + --inputs persistent://public/default/basket-items + +``` + +### Functions, messages and message types +Pulsar Functions take byte arrays as inputs and spit out byte arrays as output. However in languages that support typed interfaces(Java), you can write typed Functions, and bind messages to types in the following ways. 
+* [Schema Registry](functions-develop.md#schema-registry) +* [SerDe](functions-develop.md#serde) + + +## Fully Qualified Function Name (FQFN) +Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function tenant, namespace, and function name. FQFN looks like this: + +```http + +tenant/namespace/name + +``` + +FQFNs enable you to create multiple functions with the same name provided that they are in different namespaces. + +## Supported languages +Currently, you can write Pulsar Functions in Java, Python, and Go. For details, refer to [Develop Pulsar Functions](functions-develop.md). + +## Processing guarantees +Pulsar Functions provide three different messaging semantics that you can apply to any function. + +Delivery semantics | Description +:------------------|:------- +**At-most-once** delivery | Each message sent to the function is processed at most once: it is either processed once or not processed at all (hence "at most"). +**At-least-once** delivery | Each message sent to the function is processed at least once, and can be processed more than once (hence "at least"). +**Effectively-once** delivery | Each message sent to the function will have one output associated with it. + + +### Apply processing guarantees to a function +You can set the processing guarantees for a Pulsar Function when you create the Function. The following [`pulsar-admin functions create`](reference-pulsar-admin.md#create-1) command creates a function with effectively-once guarantees applied. + +```bash + +$ bin/pulsar-admin functions create \ + --name my-effectively-once-function \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs + +``` + +The available options for `--processing-guarantees` are: + +* `ATMOST_ONCE` +* `ATLEAST_ONCE` +* `EFFECTIVELY_ONCE` + +> By default, Pulsar Functions provide at-least-once delivery guarantees. So if you create a function without supplying a value for the `--processing-guarantees` flag, the function provides at-least-once guarantees. 
+ +### Update the processing guarantees of a function +You can change the processing guarantees applied to a function using the [`update`](reference-pulsar-admin.md#update-1) command. The following is an example. + +```bash + +$ bin/pulsar-admin functions update \ + --processing-guarantees ATMOST_ONCE \ + # Other function configs + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/functions-package.md b/site2/website/versioned_docs/version-2.10.x/functions-package.md new file mode 100644 index 0000000000000..a995d5c158877 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-package.md @@ -0,0 +1,493 @@ +--- +id: functions-package +title: Package Pulsar Functions +sidebar_label: "How-to: Package" +original_id: functions-package +--- + +You can package Pulsar functions in Java, Python, and Go. Packaging the window function in Java is the same as [packaging a function in Java](#java). + +:::note + +Currently, the window function is not available in Python and Go. + +::: + +## Prerequisite + +Before running a Pulsar function, you need to start Pulsar. You can [run a standalone Pulsar in Docker](getting-started-docker.md), or [run Pulsar in Kubernetes](getting-started-helm.md). + +To check whether the Docker image starts, you can use the `docker ps` command. + +## Java + +To package a function in Java, complete the following steps. + +1. Create a new maven project with a pom file. In the following code sample, the value of `mainClass` is your package name. + + ```Java + + + + 4.0.0 + + java-function + java-function + 1.0-SNAPSHOT + + + + org.apache.pulsar + pulsar-functions-api + 2.6.0 + + + + + + + maven-assembly-plugin + + false + + jar-with-dependencies + + + + org.example.test.ExclamationFunction + + + + + + make-assembly + package + + assembly + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + + + ``` + +2. Write a Java function. 
+ + ``` + + package org.example.test; + + import java.util.function.Function; + + public class ExclamationFunction implements Function { + @Override + public String apply(String s) { + return "This is my function!"; + } + } + + ``` + + For the imported package, you can use one of the following interfaces: + - Function interface provided by Java 8: `java.util.function.Function` + - Pulsar Function interface: `org.apache.pulsar.functions.api.Function` + + The main difference between the two interfaces is that the `org.apache.pulsar.functions.api.Function` interface provides the context interface. When you write a function and want to interact with it, you can use context to obtain a wide variety of information and functionality for Pulsar Functions. + + The following example uses `org.apache.pulsar.functions.api.Function` interface with context. + + ``` + + package org.example.functions; + import org.apache.pulsar.functions.api.Context; + import org.apache.pulsar.functions.api.Function; + + import java.util.Arrays; + public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } + } + + ``` + +3. Package the Java function. + + ```bash + + mvn package + + ``` + + After the Java function is packaged, a `target` directory is created automatically. Open the `target` directory to check if there is a JAR package similar to `java-function-1.0-SNAPSHOT.jar`. + + +4. Run the Java function. + + (1) Copy the packaged jar file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Java function using the following command. 
+ + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname org.example.test.ExclamationFunction \ + --jar java-function-1.0-SNAPSHOT.jar \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name JavaFunction + + ``` + + The following log indicates that the Java function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## Python + +Python Function supports the following three formats: + +- One python file +- ZIP file +- PIP + +### One python file + +To package a function with **one python file** in Python, complete the following steps. + +1. Write a Python function. + + ``` + + from pulsar import Function # import the Function class from Pulsar + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + + ``` + + In this example, when you write a Python function, you need to inherit the Function class and implement the `process()` method. + + `process()` mainly has two parameters: + + - `input` represents your input. + + - `context` represents an interface exposed by the Pulsar Function. You can get the attributes in the Python function based on the provided context object. + +2. Install a Python client. + + The implementation of a Python function depends on the Python client, so before deploying a Python function, you need to install the corresponding version of the Python client. + + ```bash + + pip install pulsar-client==2.6.0 + + ``` + +3. Run the Python Function. + + (1) Copy the Python function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp <path of Python function file> CONTAINER ID:/pulsar + + ``` + + (2) Run the Python function using the following command. 
+ + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname . \ + --py \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +### ZIP file + +To package a function with the **ZIP file** in Python, complete the following steps. + +1. Prepare the ZIP file. + + The following is required when packaging the ZIP file of the Python Function. + + ```text + + Assuming the zip file is named as `func.zip`, unzip the `func.zip` folder: + "func/src" + "func/requirements.txt" + "func/deps" + + ``` + + Take [exclamation.zip](https://github.com/apache/pulsar/tree/master/tests/docker-images/latest-version-image/python-examples) as an example. The internal structure of the example is as follows. + + ```text + + . + ├── deps + │   └── sh-1.12.14-py2.py3-none-any.whl + └── src + └── exclamation.py + + ``` + +2. Run the Python Function. + + (1) Copy the ZIP file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Python function using the following command. + + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname exclamation \ + --py \ + --inputs persistent://public/default/in-topic \ + --output persistent://public/default/out-topic \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +### PIP + +The PIP method is only supported in Kubernetes runtime. 
To package a function with **PIP** in Python, complete the following steps. + +1. Configure the `functions_worker.yml` file. + + ```text + + #### Kubernetes Runtime #### + installUserCodeDependencies: true + + ``` + +2. Write your Python Function. + + ``` + + from pulsar import Function + import js2xml + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + # add your logic + return input + '!' + + ``` + + You can introduce additional dependencies. When Python Function detects that the file currently used is `whl` and the `installUserCodeDependencies` parameter is specified, the system uses the `pip install` command to install the dependencies required in Python Function. + +3. Generate the `whl` file. + + ```shell script + + $ cd $PULSAR_HOME/pulsar-functions/scripts/python + $ chmod +x generate.sh + $ ./generate.sh <path of your Python Function> <path of the whl output dir> <the version of whl> + # e.g: ./generate.sh /path/to/python /path/to/python/output 1.0.0 + + ``` + + The output is written in `/path/to/python/output`: + + ```text + + -rw-r--r-- 1 root staff 1.8K 8 27 14:29 pulsarfunction-1.0.0-py2-none-any.whl + -rw-r--r-- 1 root staff 1.4K 8 27 14:29 pulsarfunction-1.0.0.tar.gz + -rw-r--r-- 1 root staff 0B 8 27 14:29 pulsarfunction.whl + + ``` + +## Go + +To package a function in Go, complete the following steps. + +1. Write a Go function. + + Currently, a Go function can **only** be implemented using the SDK, and the interface of the function is exposed in the form of the SDK. Before using the Go function, you need to import "github.com/apache/pulsar/pulsar-function-go/pf". + + ``` + + import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" + ) + + func HandleRequest(ctx context.Context, input []byte) error { + fmt.Println(string(input) + "!") + return nil + } + + func main() { + pf.Start(HandleRequest) + } + + ``` + + You can use context to connect to the Go function. 
+ + ``` + + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } + + ``` + + When writing a Go function, remember that: + - In `main()`, you **only** need to register the function name to `Start()`. **Only** one function name is received in `Start()`. + - Go function uses Go reflection, which is based on the received function name, to verify whether the parameter list and returned value list are correct. The parameter list and returned value list **must be** one of the following sample functions: + + ``` + + func () + func () error + func (input) error + func () (output, error) + func (input) (output, error) + func (context.Context) error + func (context.Context, input) error + func (context.Context) (output, error) + func (context.Context, input) (output, error) + + ``` + +2. Build the Go function. + + ``` + + go build <your Go function filename>.go + + ``` + +3. Run the Go Function. + + (1) Copy the Go function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp <your go function path> CONTAINER ID:/pulsar + + ``` + + (2) Run the Go function with the following command. + + ``` + + ./bin/pulsar-admin functions localrun \ + --go [your go function path] \ + --inputs [input topics] \ + --output [output topic] \ + --tenant [default:public] \ + --namespace [default:default] \ + --name [custom unique go function name] + + ``` + + The following log indicates that the Go function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## Start Functions in cluster mode +If you want to start a function in cluster mode, replace `localrun` with `create` in the commands above. The following log indicates that your function starts successfully. 
+ + ```text + + "Created successfully" + + ``` + +For information about the `--classname`, `--jar`, `--py`, `--go`, and `--inputs` parameters, run the command `./bin/pulsar-admin functions` or see [here](reference-pulsar-admin.md#functions). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/functions-runtime.md b/site2/website/versioned_docs/version-2.10.x/functions-runtime.md new file mode 100644 index 0000000000000..9a01dbf4da1d1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-runtime.md @@ -0,0 +1,406 @@ +--- +id: functions-runtime +title: Configure Functions runtime +sidebar_label: "Setup: Configure Functions runtime" +original_id: functions-runtime +--- + +You can use the following methods to run functions. + +- *Thread*: Invoke functions in threads within the functions worker. +- *Process*: Invoke functions in processes forked by the functions worker. +- *Kubernetes*: Submit functions as Kubernetes StatefulSets by the functions worker. + +:::note + +Pulsar supports adding labels to the Kubernetes StatefulSets and services while launching functions, which facilitates selecting the target Kubernetes objects. + +::: + +The differences between the thread and process modes are: +- Thread mode: when a function runs in thread mode, it runs in the same Java virtual machine (JVM) as the functions worker. +- Process mode: when a function runs in process mode, it runs on the same machine as the functions worker. + +## Configure thread runtime +It is easy to configure *Thread* runtime. In most cases, you do not need to configure anything. You can customize the thread group name with the following settings: + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.thread.ThreadRuntimeFactory +functionRuntimeFactoryConfigs: + threadGroupName: "Your Function Container Group" + +``` + +*Thread* runtime is supported only for Java functions. 
+ +## Configure process runtime +When you enable *Process* runtime, you do not need to configure anything. + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.process.ProcessRuntimeFactory +functionRuntimeFactoryConfigs: + # the directory for storing the function logs + logDirectory: + # change the jar location only when you put the java instance jar in a different location + javaInstanceJarLocation: + # change the python instance location only when you put the python instance jar in a different location + pythonInstanceLocation: + # change the extra dependencies location: + extraFunctionDependenciesDir: + +``` + +*Process* runtime is supported in Java, Python, and Go functions. + +## Configure Kubernetes runtime + +When the functions worker generates Kubernetes manifests and apply the manifests, the Kubernetes runtime works. If you have run functions worker on Kubernetes, you can use the `serviceAccount` associated with the pod that the functions worker is running in. Otherwise, you can configure it to communicate with a Kubernetes cluster. + +The manifests, generated by the functions worker, include a `StatefulSet`, a `Service` (used to communicate with the pods), and a `Secret` for auth credentials (when applicable). The `StatefulSet` manifest (by default) has a single pod, with the number of replicas determined by the "parallelism" of the function. On pod boot, the pod downloads the function payload (via the functions worker REST API). The pod's container image is configurable, but must have the functions runtime. + +The Kubernetes runtime supports secrets, so you can create a Kubernetes secret and expose it as an environment variable in the pod. The Kubernetes runtime is extensible, you can implement classes and customize the way how to generate Kubernetes manifests, how to pass auth data to pods, and how to integrate secrets. 
+ +:::tip + +For the rules of translating Pulsar object names into Kubernetes resource labels, see [here](admin-api-overview.md#how-to-define-pulsar-resource-names-when-running-pulsar-in-kubernetes). + +::: + +### Basic configuration + +It is easy to configure Kubernetes runtime. You can just uncomment the settings of `kubernetesContainerFactory` in the `functions_worker.yaml` file. The following is an example. + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.kubernetes.KubernetesRuntimeFactory +functionRuntimeFactoryConfigs: + # uri to kubernetes cluster, leave it to empty and it will use the kubernetes settings in function worker + k8Uri: + # the kubernetes namespace to run the function instances. it is `default`, if this setting is left to be empty + jobNamespace: + # The Kubernetes pod name to run the function instances. It is set to + # `pf----` if this setting is left to be empty + jobName: + # the docker image to run function instance. by default it is `apachepulsar/pulsar` + pulsarDockerImageName: + # the docker image to run function instance according to different configurations provided by users. + # By default it is `apachepulsar/pulsar`. + # e.g: + # functionDockerImages: + # JAVA: JAVA_IMAGE_NAME + # PYTHON: PYTHON_IMAGE_NAME + # GO: GO_IMAGE_NAME + functionDockerImages: + # "The image pull policy for image used to run function instance. By default it is `IfNotPresent` + imagePullPolicy: IfNotPresent + # the root directory of pulsar home directory in `pulsarDockerImageName`. by default it is `/pulsar`. + # if you are using your own built image in `pulsarDockerImageName`, you need to set this setting accordingly + pulsarRootDir: + # The config admin CLI allows users to customize the configuration of the admin cli tool, such as: + # `/bin/pulsar-admin and /bin/pulsarctl`. By default it is `/bin/pulsar-admin`. 
If you want to use `pulsarctl` + # you need to set this setting accordingly + configAdminCLI: + # this setting only takes effects if `k8Uri` is set to null. if your function worker is running as a k8 pod, + # setting this to true is let function worker to submit functions to the same k8s cluster as function worker + # is running. setting this to false if your function worker is not running as a k8 pod. + submittingInsidePod: false + # setting the pulsar service url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar service url configured in worker service + pulsarServiceUrl: + # setting the pulsar admin url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar admin url configured in worker service + pulsarAdminUrl: + # The flag indicates to install user code dependencies. (applied to python package) + installUserCodeDependencies: + # The repository that pulsar functions use to download python dependencies + pythonDependencyRepository: + # The repository that pulsar functions use to download extra python dependencies + pythonExtraDependencyRepository: + # the custom labels that function worker uses to select the nodes for pods + customLabels: + # The expected metrics collection interval, in seconds + expectedMetricsCollectionInterval: 30 + # Kubernetes Runtime will periodically checkback on + # this configMap if defined and if there are any changes + # to the kubernetes specific stuff, we apply those changes + changeConfigMap: + # The namespace for storing change config map + changeConfigMapNamespace: + # The ratio cpu request and cpu limit to be set for a function/source/sink. + # The formula for cpu request is cpuRequest = userRequestCpu / cpuOverCommitRatio + cpuOverCommitRatio: 1.0 + # The ratio memory request and memory limit to be set for a function/source/sink. 
+ # The formula for memory request is memoryRequest = userRequestMemory / memoryOverCommitRatio + memoryOverCommitRatio: 1.0 + # The port inside the function pod which is used by the worker to communicate with the pod + grpcPort: 9093 + # The port inside the function pod on which prometheus metrics are exposed + metricsPort: 9094 + # The directory inside the function pod where nar packages will be extracted + narExtractionDirectory: + # The classpath where function instance files are stored + functionInstanceClassPath: + # Upload the builtin sources/sinks to BookKeeper. + # True by default. + uploadBuiltinSinksSources: true + # the directory for dropping extra function dependencies + # if it is not an absolute path, it is relative to `pulsarRootDir` + extraFunctionDependenciesDir: + # Additional memory padding added on top of the memory requested by the function on a per-instance basis + percentMemoryPadding: 10 + # The duration (in seconds) before the StatefulSet is deleted after a function stops or restarts. + # Value must be a non-negative integer. 0 indicates the StatefulSet is deleted immediately. + # Default is 5 seconds. + gracePeriodSeconds: 5 + +``` + +If you run functions worker embedded in a broker on Kubernetes, you can use the default settings. + +### Run standalone functions worker on Kubernetes + +If you run functions worker standalone (that is, not embedded) on Kubernetes, you need to configure `pulsarServiceUrl` to be the URL of the broker and `pulsarAdminUrl` as the URL to the functions worker. + +For example, both Pulsar brokers and Function Workers run in the `pulsar` K8S namespace. The brokers have a service called `broker` and the functions worker has a service called `func-worker`. 
The settings are as follows: + +```yaml + +pulsarServiceUrl: pulsar://broker.pulsar:6650 # or pulsar+ssl://broker.pulsar:6651 if using TLS +pulsarAdminUrl: http://func-worker.pulsar:8080 # or https://func-worker:8443 if using TLS + +``` + +### Run RBAC in Kubernetes clusters + +If you run RBAC in your Kubernetes cluster, make sure that the service account you use for running functions workers (or brokers, if functions workers run along with brokers) has permissions on the following Kubernetes APIs. + +- services +- configmaps +- pods +- apps.statefulsets + +The following is sufficient: + +```yaml + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: functions-worker +rules: +- apiGroups: [""] + resources: + - services + - configmaps + - pods + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: functions-worker +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: functions-worker +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: functions-worker +subjects: +- kind: ServiceAccount + name: functions-worker + +``` + +If the service-account is not properly configured, an error message similar to this is displayed: + +```bash + +22:04:27.696 [Timer-0] ERROR org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory - Error while trying to fetch configmap example-pulsar-4qvmb5gur3c6fc9dih0x1xn8b-function-worker-config at namespace pulsar +io.kubernetes.client.ApiException: Forbidden + at io.kubernetes.client.ApiClient.handleResponse(ApiClient.java:882) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.ApiClient.execute(ApiClient.java:798) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMapWithHttpInfo(CoreV1Api.java:23673) ~[io.kubernetes-client-java-api-2.0.0.jar:?] 
+ at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMap(CoreV1Api.java:23655) ~[io.kubernetes-client-java-api-2.0.0.jar:?] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory.fetchConfigMap(KubernetesRuntimeFactory.java:284) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory$1.run(KubernetesRuntimeFactory.java:275) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at java.util.TimerThread.mainLoop(Timer.java:555) [?:1.8.0_212] + at java.util.TimerThread.run(Timer.java:505) [?:1.8.0_212] + +``` + +### Integrate Kubernetes secrets + +In order to safely distribute secrets, Pulsar Functions can reference Kubernetes secrets. To enable this, set the `secretsProviderConfiguratorClassName` to `org.apache.pulsar.functions.secretsproviderconfigurator.KubernetesSecretsProviderConfigurator`. + +You can create a secret in the namespace where your functions are deployed. For example, you deploy functions to the `pulsar-func` Kubernetes namespace, and you have a secret named `database-creds` with a field name `password`, which you want to mount in the pod as an environment variable called `DATABASE_PASSWORD`. The following functions configuration enables you to reference that secret and mount the value as an environment variable in the pod. + +```Yaml + +tenant: "mytenant" +namespace: "mynamespace" +name: "myfunction" +topicName: "persistent://mytenant/mynamespace/myfuncinput" +className: "com.company.pulsar.myfunction" + +secrets: + # the secret will be mounted from the `password` field in the `database-creds` secret as an env var called `DATABASE_PASSWORD` + DATABASE_PASSWORD: + path: "database-creds" + key: "password" + +``` + +### Enable token authentication + +When you enable authentication for your Pulsar cluster, you need a mechanism for the pod running your function to authenticate with the broker. 
+ +The `org.apache.pulsar.functions.auth.KubernetesFunctionAuthProvider` interface provides support for any authentication mechanism. The `functionAuthProviderClassName` in `function-worker.yml` is used to specify your path to this implementation. + +Pulsar includes an implementation of this interface for token authentication, and distributes the certificate authority via the same implementation. The configuration is similar as follows: + +```Yaml + +functionAuthProviderClassName: org.apache.pulsar.functions.auth.KubernetesSecretsTokenAuthProvider + +``` + +For token authentication, the functions worker captures the token that is used to deploy (or update) the function. The token is saved as a secret and mounted into the pod. + +For custom authentication or TLS, you need to implement this interface or use an alternative mechanism to provide authentication. If you use token authentication and TLS encryption to secure the communication with the cluster, Pulsar passes your certificate authority (CA) to the client, so the client obtains what it needs to authenticate the cluster, and trusts the cluster with your signed certificate. + +:::note + +If you use tokens that expire when deploying functions, these tokens will expire. + +::: + +### Run clusters with authentication + +When you run a functions worker in a standalone process (that is, not embedded in the broker) in a cluster with authentication, you must configure your functions worker to interact with the broker and authenticate incoming requests. So you need to configure properties that the broker requires for authentication or authorization. + +For example, if you use token authentication, you need to configure the following properties in the `function-worker.yml` file. 
+ +```Yaml + +clientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationToken +clientAuthenticationParameters: file:///etc/pulsar/token/admin-token.txt +configurationMetadataStoreUrl: zk:zookeeper-cluster:2181 # auth requires a connection to zookeeper +authenticationProviders: + - "org.apache.pulsar.broker.authentication.AuthenticationProviderToken" +authorizationEnabled: true +authenticationEnabled: true +superUserRoles: + - superuser + - proxy +properties: + tokenSecretKey: file:///etc/pulsar/jwt/secret # if using a secret token, key file must be DER-encoded + tokenPublicKey: file:///etc/pulsar/jwt/public.key # if using public/private key tokens, key file must be DER-encoded + +``` + +:::note + +You must configure both the Function Worker authentication or authorization settings (so that the server can authenticate incoming requests) and the client authentication settings (so that the worker can authenticate itself when communicating with the broker). + +::: + +### Customize Kubernetes runtime + +The Kubernetes integration enables you to implement a class and customize how to generate manifests. You can configure it by setting `runtimeCustomizerClassName` in the `functions_worker.yml` file and use the fully qualified class name. You must implement the `org.apache.pulsar.functions.runtime.kubernetes.KubernetesManifestCustomizer` interface. + +The functions (and sinks/sources) API provides a flag, `customRuntimeOptions`, which is passed to this interface. + +To initialize the `KubernetesManifestCustomizer`, you can provide `runtimeCustomizerConfig` in the `functions_worker.yml` file. `runtimeCustomizerConfig` is passed to the `public void initialize(Map config)` function of the interface. `runtimeCustomizerConfig` is different from the `customRuntimeOptions` as `runtimeCustomizerConfig` is the same across all functions. If you provide both `runtimeCustomizerConfig` and `customRuntimeOptions`, you need to decide how to manage these two configurations in your implementation of `KubernetesManifestCustomizer`. 
+ +Pulsar includes a built-in implementation. To use the basic implementation, set `runtimeCustomizerClassName` to `org.apache.pulsar.functions.runtime.kubernetes.BasicKubernetesManifestCustomizer`. The built-in implementation initialized with `runtimeCustomizerConfig` enables you to pass a JSON document as `customRuntimeOptions` with certain properties to augment, which decides how the manifests are generated. If both `runtimeCustomizerConfig` and `customRuntimeOptions` are provided, `BasicKubernetesManifestCustomizer` uses `customRuntimeOptions` to override the configuration if there are conflicts in these two configurations. + +Below is an example of `customRuntimeOptions`. + +```json + +{ + "jobName": "jobname", // the k8s pod name to run this function instance + "jobNamespace": "namespace", // the k8s namespace to run this function in + "extraLabels": { // extra labels to attach to the statefulSet, service, and pods + "extraLabel": "value" + }, + "extraAnnotations": { // extra annotations to attach to the statefulSet, service, and pods + "extraAnnotation": "value" + }, + "nodeSelectorLabels": { // node selector labels to add on to the pod spec + "customLabel": "value" + }, + "tolerations": [ // tolerations to add to the pod spec + { + "key": "custom-key", + "value": "value", + "effect": "NoSchedule" + } + ], + "resourceRequirements": { // values for cpu and memory should be defined as described here: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container + "requests": { + "cpu": 1, + "memory": "4G" + }, + "limits": { + "cpu": 2, + "memory": "8G" + } + } +} + +``` + +## Run clusters with geo-replication + +If you run multiple clusters tied together with geo-replication, it is important to use a different function namespace for each cluster. Otherwise, the functions share a namespace and are potentially scheduled across clusters. 
+ +For example, if you have two clusters: `east-1` and `west-1`, you can configure the functions workers for `east-1` and `west-1` respectively as follows. + +```Yaml + +pulsarFunctionsCluster: east-1 +pulsarFunctionsNamespace: public/functions-east-1 + +``` + +```Yaml + +pulsarFunctionsCluster: west-1 +pulsarFunctionsNamespace: public/functions-west-1 + +``` + +This ensures the two different Functions Workers use distinct sets of topics for their internal coordination. + +## Configure standalone functions worker + +When configuring a standalone functions worker, you need to configure properties that the broker requires, especially if you use TLS, so that the functions worker can communicate with the broker. + +You need to configure the following required properties. + +```Yaml + +workerPort: 8080 +workerPortTls: 8443 # when using TLS +tlsCertificateFilePath: /etc/pulsar/tls/tls.crt # when using TLS +tlsKeyFilePath: /etc/pulsar/tls/tls.key # when using TLS +tlsTrustCertsFilePath: /etc/pulsar/tls/ca.crt # when using TLS +pulsarServiceUrl: pulsar://broker.pulsar:6650/ # or pulsar+ssl://pulsar-prod-broker.pulsar:6651/ when using TLS +pulsarWebServiceUrl: http://broker.pulsar:8080/ # or https://pulsar-prod-broker.pulsar:8443/ when using TLS +useTls: true # when using TLS, critical! + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/functions-worker.md b/site2/website/versioned_docs/version-2.10.x/functions-worker.md new file mode 100644 index 0000000000000..60eb84657919b --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/functions-worker.md @@ -0,0 +1,405 @@ +--- +id: functions-worker +title: Deploy and manage functions worker +sidebar_label: "Setup: Pulsar Functions Worker" +original_id: functions-worker +--- +Before using Pulsar Functions, you need to learn how to set up Pulsar Functions worker and how to [configure Functions runtime](functions-runtime.md). 
+ +Pulsar `functions-worker` is a logic component to run Pulsar Functions in cluster mode. Two options are available, and you can select either based on your requirements. +- [run with brokers](#run-functions-worker-with-brokers) +- [run it separately](#run-functions-worker-separately) in a different broker + +:::note + +The `--- Service Urls---` lines in the following diagrams represent Pulsar service URLs that Pulsar client and admin use to connect to a Pulsar cluster. + +::: + +## Run Functions-worker with brokers + +The following diagram illustrates the deployment of functions-workers running along with brokers. + +![assets/functions-worker-corun.png](/assets/functions-worker-corun.png) + +To enable functions-worker running as part of a broker, you need to set `functionsWorkerEnabled` to `true` in the `broker.conf` file. + +```conf + +functionsWorkerEnabled=true + +``` + +If the `functionsWorkerEnabled` is set to `true`, the functions-worker is started as part of a broker. You need to configure the `conf/functions_worker.yml` file to customize your functions_worker. + +Before you run Functions-worker with broker, you have to configure Functions-worker, and then start it with brokers. + +### Configure Functions-Worker to run with brokers +In this mode, most of the settings are already inherited from your broker configuration (for example, configurationStore settings, authentication settings, and so on) since `functions-worker` is running as part of the broker. + +Pay attention to the following required settings when configuring functions-worker in this mode. + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`, which is good for standalone deployment. For production deployment, to ensure high availability, set it to be larger than `2`. +- `initializedDlogMetadata`: Whether to initialize distributed log metadata in runtime. 
If it is set to `true`, you must ensure that it has been initialized by the `bin/pulsar initialize-cluster-metadata` command. + +If authentication is enabled on the BookKeeper cluster, configure the following BookKeeper authentication settings. + +- `bookkeeperClientAuthenticationPlugin`: the BookKeeper client authentication plugin name. +- `bookkeeperClientAuthenticationParametersName`: the BookKeeper client authentication plugin parameters name. +- `bookkeeperClientAuthenticationParameters`: the BookKeeper client authentication plugin parameters. + +### Configure Stateful-Functions to run with broker + +If you want to use Stateful-Functions related functions (for example, `putState()` and `queryState()` related interfaces), follow the steps below. + +1. Enable the **streamStorage** service in the BookKeeper. + + Currently, the service uses the NAR package, so you need to set the configuration in `bookkeeper.conf`. + + ```text + + extraServerComponents=org.apache.bookkeeper.stream.server.StreamStorageLifecycleComponent + + ``` + + After starting bookie, use the following methods to check whether the streamStorage service is started correctly. + + Input: + + ```shell + + telnet localhost 4181 + + ``` + + Output: + + ```text + + Trying 127.0.0.1... + Connected to localhost. + Escape character is '^]'. + + ``` + +2. Turn on this function in `functions_worker.yml`. + + ```text + + stateStorageServiceUrl: bk://<bk-service-url>:4181 + + ``` + + `bk-service-url` is the service URL pointing to the BookKeeper table service. + +### Start Functions-worker with broker + +Once you have configured the `functions_worker.yml` file, you can start or restart your broker. + +And then you can use the following command to verify if `functions-worker` is running well. + +```bash + +curl <broker-ip>:8080/admin/v2/worker/cluster + +``` + +After entering the command above, a list of active function workers in the cluster is returned. The output is similar to the following. 
+ +```json + +[{"workerId":"","workerHostname":"","port":8080}] + +``` + +## Run Functions-worker separately + +This section illustrates how to run `functions-worker` as a separate process in separate machines. + +![assets/functions-worker-separated.png](/assets/functions-worker-separated.png) + +:::note + +In this mode, make sure `functionsWorkerEnabled` is set to `false`, so you won't start `functions-worker` with brokers by mistake. Also, while accessing the `functions-worker` to manage any of the functions, the `pulsar-admin` CLI tool or any of the clients should use the `workerHostname` and `workerPort` that you set in [Worker parameters](#worker-parameters) to generate an `--admin-url`. + +::: + +### Configure Functions-worker to run separately + +To run function-worker separately, you have to configure the following parameters. + +#### Worker parameters + +- `workerId`: The type is string. It is unique across clusters, which is used to identify a worker machine. +- `workerHostname`: The hostname of the worker machine. +- `workerPort`: The port that the worker server listens on. Keep it as default if you don't customize it. +- `workerPortTls`: The TLS port that the worker server listens on. Keep it as default if you don't customize it. + +#### Function package parameter + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`. + +#### Function metadata parameter + +- `pulsarServiceUrl`: The Pulsar service URL for your broker cluster. +- `pulsarWebServiceUrl`: The Pulsar web service URL for your broker cluster. +- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). + +If authentication is enabled for your broker cluster, you *should* configure the authentication plugin and parameters for the functions worker to communicate with the brokers. 
+ +- `clientAuthenticationPlugin` +- `clientAuthenticationParameters` + +#### Customize Java runtime options + +If you want to pass additional arguments to the JVM command line to every process started by a function worker, +you can configure the `additionalJavaRuntimeArguments` parameter. + +``` + +additionalJavaRuntimeArguments: ['-XX:+ExitOnOutOfMemoryError','-Dfoo=bar'] + +``` + +This is very useful in case you want to: +- add JVM flags, like `-XX:+ExitOnOutOfMemoryError` +- pass custom system properties, like `-Dlog4j2.formatMsgNoLookups` + +:::note + +This feature applies only to Process and Kubernetes runtimes. + +::: + +#### Security settings + +If you want to enable security on functions workers, you *should*: +- [Enable TLS transport encryption](#enable-tls-transport-encryption) +- [Enable Authentication Provider](#enable-authentication-provider) +- [Enable Authorization Provider](#enable-authorization-provider) +- [Enable End-to-End Encryption](#enable-end-to-end-encryption) + +##### Enable TLS transport encryption + +To enable TLS transport encryption, configure the following settings. + +``` + +useTLS: true +pulsarServiceUrl: pulsar+ssl://localhost:6651/ +pulsarWebServiceUrl: https://localhost:8443 + +tlsEnabled: true +tlsCertificateFilePath: /path/to/functions-worker.cert.pem +tlsKeyFilePath: /path/to/functions-worker.key-pk8.pem +tlsTrustCertsFilePath: /path/to/ca.cert.pem + +# The path to trusted certificates used by the Pulsar client to authenticate with Pulsar brokers +brokerClientTrustCertsFilePath: /path/to/ca.cert.pem + +``` + +For details on TLS encryption, refer to [Transport Encryption using TLS](security-tls-transport.md). + +##### Enable Authentication Provider + +To enable authentication on Functions Worker, you need to configure the following settings. + +:::note + +Substitute the *providers list* with the providers you want to enable. 
+ +::: + +``` + +authenticationEnabled: true +authenticationProviders: [ provider1, provider2 ] + +``` + +For *TLS Authentication* provider, follow the example below to add the necessary settings. +See [TLS Authentication](security-tls-authentication.md) for more details. + +``` + +brokerClientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters: tlsCertFile:/path/to/admin.cert.pem,tlsKeyFile:/path/to/admin.key-pk8.pem + +authenticationEnabled: true +authenticationProviders: ['org.apache.pulsar.broker.authentication.AuthenticationProviderTls'] + +``` + +For *SASL Authentication* provider, add `saslJaasClientAllowedIds` and `saslJaasBrokerSectionName` +under `properties` if needed. + +``` + +properties: + saslJaasClientAllowedIds: .*pulsar.* + saslJaasBrokerSectionName: Broker + +``` + +For *Token Authentication* provider, add necessary settings for `properties` if needed. +See [Token Authentication](security-jwt.md) for more details. +Note: key files must be DER-encoded. + +``` + +properties: + tokenSecretKey: file:///my/secret.key + # If using public/private + # tokenPublicKey: file:///path/to/public.key + +``` + +##### Enable Authorization Provider + +To enable authorization on Functions Worker, you need to configure `authorizationEnabled`, `authorizationProvider` and `configurationMetadataStoreUrl`. The authorization provider connects to `configurationMetadataStoreUrl` to receive namespace policies. + +```yaml + +authorizationEnabled: true +authorizationProvider: org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider +configurationMetadataStoreUrl: <meta-type>:<configuration-metadata-store-url> + +``` + +You should also configure a list of superuser roles. The superuser roles are able to access any admin API. The following is a configuration example. 
+ +```yaml + +superUserRoles: + - role1 + - role2 + - role3 + +``` + +##### Enable End-to-End Encryption + +You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. + +To enable End-to-End encryption on Functions Worker, specify `--producer-config` in the command line terminal. For more information, see [End-to-End Encryption](security-encryption.md). + +We include the relevant configuration information of `CryptoConfig` into `ProducerConfig`. The specific configurable field information about `CryptoConfig` is as follows: + +```text + +public class CryptoConfig { + private String cryptoKeyReaderClassName; + private Map cryptoKeyReaderConfig; + + private String[] encryptionKeys; + private ProducerCryptoFailureAction producerCryptoFailureAction; + + private ConsumerCryptoFailureAction consumerCryptoFailureAction; +} + +``` + +- `producerCryptoFailureAction`: defines the action to take if the producer fails to encrypt data. Valid values are `FAIL` and `SEND`. +- `consumerCryptoFailureAction`: defines the action to take if the consumer fails to decrypt data. Valid values are `FAIL`, `DISCARD`, and `CONSUME`. + +#### BookKeeper Authentication + +If authentication is enabled on the BookKeeper cluster, you need to configure the BookKeeper authentication settings as follows: + +- `bookkeeperClientAuthenticationPlugin`: the plugin name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParametersName`: the plugin parameters name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParameters`: the plugin parameters of BookKeeper client authentication. 
+ +### Start Functions-worker + +Once you have finished configuring the `functions_worker.yml` configuration file, you can start a `functions-worker` in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +bin/pulsar-daemon start functions-worker + +``` + +You can also start `functions-worker` in the foreground by using `pulsar` CLI tool: + +```bash + +bin/pulsar functions-worker + +``` + +### Configure Proxies for Functions-workers + +When you are running `functions-worker` in a separate cluster, the admin rest endpoints are split into two clusters. `functions`, `function-worker`, `source` and `sink` endpoints are now served +by the `functions-worker` cluster, while all the other remaining endpoints are served by the broker cluster. +Hence you need to configure your `pulsar-admin` to use the right service URL accordingly. + +In order to address this inconvenience, you can start a proxy cluster for routing the admin rest requests accordingly. Hence you will have one central entry point for your admin service. + +If you already have a proxy cluster, continue reading. If you haven't setup a proxy cluster before, you can follow the [instructions](administration-proxy.md) to start proxies. + +![assets/functions-worker-separated.png](/assets/functions-worker-separated-proxy.png) + +To enable routing functions related admin requests to `functions-worker` in a proxy, you can edit the `proxy.conf` file to modify the following settings: + +```conf + +functionWorkerWebServiceURL= +functionWorkerWebServiceURLTLS= + +``` + +## Compare the Run-with-Broker and Run-separately modes + +As described above, you can run Function-worker with brokers, or run it separately. And it is more convenient to run functions-workers along with brokers. 
However, running functions-workers in a separate cluster provides better resource isolation for running functions in `Process` or `Thread` mode. + +To determine which mode to use for your case, refer to the following guidelines. + +Use the `Run-with-Broker` mode in the following cases: +- a) if resource isolation is not required when running functions in `Process` or `Thread` mode; +- b) if you configure the functions-worker to run functions on Kubernetes (where the resource isolation problem is addressed by Kubernetes). + +Use the `Run-separately` mode in the following cases: +- a) if you don't have a Kubernetes cluster; +- b) if you want to run functions and brokers separately. + +## Troubleshooting + +**Error message: Namespace missing local cluster name in clusters list** + +``` + +Failed to get partitioned topic metadata: org.apache.pulsar.client.api.PulsarClientException$BrokerMetadataException: Namespace missing local cluster name in clusters list: local_cluster=xyz ns=public/functions clusters=[standalone] + +``` + +This error message appears when either of the following cases occurs: +- a) a broker is started with `functionsWorkerEnabled=true`, but the `pulsarFunctionsCluster` is not set to the correct cluster in the `conf/functions_worker.yml` file; +- b) setting up a geo-replicated Pulsar cluster with `functionsWorkerEnabled=true`, while brokers in one cluster run well, brokers in the other cluster do not work well. + +**Workaround** + +If either of these cases occurs, follow the instructions below to fix the problem: + +1. Disable Functions Worker by setting `functionsWorkerEnabled=false`, and restart brokers. + +2. Get the current clusters list of `public/functions` namespace. + +```bash + +bin/pulsar-admin namespaces get-clusters public/functions + +``` + +3. Check if the cluster is in the clusters list. If the cluster is not in the list, add it to the list and update the clusters list. 
+ +```bash + +bin/pulsar-admin namespaces set-clusters --clusters , public/functions + +``` + +4. After setting the cluster successfully, enable functions worker by setting `functionsWorkerEnabled=true`. + +5. Set the correct cluster name in `pulsarFunctionsCluster` in the `conf/functions_worker.yml` file, and restart brokers. diff --git a/site2/website/versioned_docs/version-2.10.x/getting-started-concepts-and-architecture.md b/site2/website/versioned_docs/version-2.10.x/getting-started-concepts-and-architecture.md new file mode 100644 index 0000000000000..fe9c3fbc553b2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/getting-started-concepts-and-architecture.md @@ -0,0 +1,16 @@ +--- +id: concepts-architecture +title: Pulsar concepts and architecture +sidebar_label: "Concepts and architecture" +original_id: concepts-architecture +--- + + + + + + + + + + diff --git a/site2/website/versioned_docs/version-2.10.x/getting-started-docker.md b/site2/website/versioned_docs/version-2.10.x/getting-started-docker.md new file mode 100644 index 0000000000000..441a7b897278f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/getting-started-docker.md @@ -0,0 +1,219 @@ +--- +id: getting-started-docker +title: Set up a standalone Pulsar in Docker +sidebar_label: "Run Pulsar in Docker" +original_id: getting-started-docker +--- + +For local development and testing, you can run Pulsar in standalone mode on your own machine within a Docker container. + +If you have not installed Docker, download the [Community edition](https://www.docker.com/community-edition) and follow the instructions for your OS. + +## Start Pulsar in Docker + +For macOS, Linux, and Windows, run the following command to start Pulsar within a Docker container. 
+ +```shell + +$ docker run -it -p 6650:6650 -p 8080:8080 --mount source=pulsardata,target=/pulsar/data --mount source=pulsarconf,target=/pulsar/conf apachepulsar/pulsar:@pulsar:version@ bin/pulsar standalone + +``` + +If you want to change Pulsar configurations and start Pulsar, run the following command by passing environment variables with the `PULSAR_PREFIX_` prefix. See [default configuration file](https://github.com/apache/pulsar/blob/e6b12c64b043903eb5ff2dc5186fe8030f157cfc/conf/standalone.conf) for more details. + +```shell + +$ docker run -it -e PULSAR_PREFIX_xxx=yyy -p 6650:6650 -p 8080:8080 --mount source=pulsardata,target=/pulsar/data --mount source=pulsarconf,target=/pulsar/conf apachepulsar/pulsar:2.10.0 sh -c "bin/apply-config-from-env.py conf/standalone.conf && bin/pulsar standalone" + +``` + +:::tip + +* The docker container runs as UID 10000 and GID 0 by default. You need to ensure the mounted volumes give write permission to either UID 10000 or GID 0. Note that UID 10000 is arbitrary, so it is recommended to make these mounts writable for the root group (GID 0). +* The data, metadata, and configuration are persisted on Docker volumes in order to not start "fresh" every time the container is restarted. For details on the volumes, you can use `docker volume inspect `. +* For Docker on Windows, make sure to configure it to use Linux containers. + +::: + +After starting Pulsar successfully, you can see `INFO`-level log messages like this: + +``` + +08:18:30.970 [main] INFO org.apache.pulsar.broker.web.WebService - HTTP Service started at http://0.0.0.0:8080 +... +07:53:37.322 [main] INFO org.apache.pulsar.broker.PulsarService - messaging service is ready, bootstrap service port = 8080, broker url= pulsar://localhost:6650, cluster=standalone, configs=org.apache.pulsar.broker.ServiceConfiguration@98b63c1 +... + +``` + +:::tip + +When you start a local standalone cluster, a `public/default` namespace is created automatically. 
The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). + +::: + +## Use Pulsar in Docker + +Pulsar offers a variety of [client libraries](client-libraries.md), such as [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md), and [C++](client-libraries-cpp.md). + +If you're running a local standalone cluster, you can use one of these root URLs to interact with your cluster: +* `pulsar://localhost:6650` +* `http://localhost:8080` + +The following example guides you to get started with Pulsar by using the [Python client API](client-libraries-python.md). + +Install the Pulsar Python client library directly from [PyPI](https://pypi.org/project/pulsar-client/): + +```shell + +$ pip install pulsar-client + +``` + +### Consume a message + +Create a consumer and subscribe to the topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() + +``` + +### Produce a message + +Start a producer to send some test messages: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('hello-pulsar-%d' % i).encode('utf-8')) + +client.close() + +``` + +## Get the topic statistics + +In Pulsar, you can use the REST API, Java, or command-line tools to control every aspect of the system. For details on APIs, refer to [Admin API Overview](admin-api-overview.md). 
+ +In the simplest example, you can use curl to probe the stats for a particular topic: + +```shell + +$ curl http://localhost:8080/admin/v2/persistent/public/default/my-topic/stats | python -m json.tool + +``` + +The output is something like this: + +```json + +{ + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesInCounter": 7097, + "msgInCounter": 143, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "averageMsgSize": 0.0, + "msgChunkPublished": false, + "storageSize": 7097, + "backlogSize": 0, + "offloadedStorageSize": 0, + "publishers": [ + { + "accessMode": "Shared", + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "averageMsgSize": 0.0, + "chunkedMessageRate": 0.0, + "producerId": 0, + "metadata": {}, + "address": "/127.0.0.1:35604", + "connectedSince": "2021-07-04T09:05:43.04788Z", + "clientVersion": "2.8.0", + "producerName": "standalone-2-5" + } + ], + "waitingPublishers": 0, + "subscriptions": { + "my-sub": { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0, + "msgBacklog": 0, + "backlogSize": 0, + "msgBacklogNoDelayed": 0, + "blockedSubscriptionOnUnackedMsgs": false, + "msgDelayed": 0, + "unackedMessages": 0, + "type": "Exclusive", + "activeConsumerName": "3c544f1daa", + "msgRateExpired": 0.0, + "totalMsgExpired": 0, + "lastExpireTimestamp": 0, + "lastConsumedFlowTimestamp": 1625389101290, + "lastConsumedTimestamp": 1625389546070, + "lastAckedTimestamp": 1625389546162, + "lastMarkDeleteAdvancedTimestamp": 1625389546163, + "consumers": [ + { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0.0, + "consumerName": "3c544f1daa", + "availablePermits": 867, + "unackedMessages": 0, + "avgMessagesPerEntry": 6, + 
"blockedConsumerOnUnackedMsgs": false, + "lastAckedTimestamp": 1625389546162, + "lastConsumedTimestamp": 1625389546070, + "metadata": {}, + "address": "/127.0.0.1:35472", + "connectedSince": "2021-07-04T08:58:21.287682Z", + "clientVersion": "2.8.0" + } + ], + "isDurable": true, + "isReplicated": false, + "allowOutOfOrderDelivery": false, + "consumersAfterMarkDeletePosition": {}, + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0, + "durable": true, + "replicated": false + } + }, + "replication": {}, + "deduplicationStatus": "Disabled", + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0 +} + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/getting-started-helm.md b/site2/website/versioned_docs/version-2.10.x/getting-started-helm.md new file mode 100644 index 0000000000000..5d5401cc86a08 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/getting-started-helm.md @@ -0,0 +1,447 @@ +--- +id: getting-started-helm +title: Get started in Kubernetes +sidebar_label: "Run Pulsar in Kubernetes" +original_id: getting-started-helm +--- + +This section guides you through every step of installing and running Apache Pulsar with Helm on Kubernetes quickly, including the following sections: + +- Install the Apache Pulsar on Kubernetes using Helm +- Start and stop Apache Pulsar +- Create topics using `pulsar-admin` +- Produce and consume messages using Pulsar clients +- Monitor Apache Pulsar status with Prometheus and Grafana + +For deploying a Pulsar cluster for production usage, read the documentation on [how to configure and install a Pulsar Helm chart](helm-deploy.md). + +## Prerequisite + +- Kubernetes server 1.14.0+ +- kubectl 1.14.0+ +- Helm 3.0+ + +:::tip + +For the following steps, step 2 and step 3 are for **developers** and step 4 and step 5 are for **administrators**. 
+ +::: + +## Step 0: Prepare a Kubernetes cluster + +Before installing a Pulsar Helm chart, you have to create a Kubernetes cluster. You can follow [the instructions](helm-prepare.md) to prepare a Kubernetes cluster. + +We use [Minikube](https://minikube.sigs.k8s.io/docs/start/) in this quick start guide. To prepare a Kubernetes cluster, follow these steps: + +1. Create a Kubernetes cluster on Minikube. + + ```bash + + minikube start --memory=8192 --cpus=4 --kubernetes-version=<k8s-version> + + ``` + + The `<k8s-version>` can be any [Kubernetes version supported by your Minikube installation](https://minikube.sigs.k8s.io/docs/reference/configuration/kubernetes/), such as `v1.16.1`. + +2. Set `kubectl` to use Minikube. + + ```bash + + kubectl config use-context minikube + + ``` + +3. To use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) with the local Kubernetes cluster on Minikube, enter the command below: + + ```bash + + minikube dashboard + + ``` + + The command automatically triggers opening a webpage in your browser. + +## Step 1: Install Pulsar Helm chart + +1. Add Pulsar charts repo. + + ```bash + + helm repo add apache https://pulsar.apache.org/charts + + ``` + + ```bash + + helm repo update + + ``` + +2. Clone the Pulsar Helm chart repository. + + ```bash + + git clone https://github.com/apache/pulsar-helm-chart + cd pulsar-helm-chart + + ``` + +3. Run the script `prepare_helm_release.sh` to create secrets required for installing the Apache Pulsar Helm chart. The username `pulsar` and password `pulsar` are used for logging into the Grafana dashboard and Pulsar Manager. + + :::note + + When running the script, you can use `-n` to specify the Kubernetes namespace where the Pulsar Helm chart is installed, `-k` to define the Pulsar Helm release name, and `-c` to create the Kubernetes namespace. For more information about the script, run `./scripts/pulsar/prepare_helm_release.sh --help`. 
+ + ::: + + ```bash + + ./scripts/pulsar/prepare_helm_release.sh \ + -n pulsar \ + -k pulsar-mini \ + -c + + ``` + +4. Use the Pulsar Helm chart to install a Pulsar cluster to Kubernetes. + + :::note + + You need to specify `--set initialize=true` when installing Pulsar the first time. This command installs and starts Apache Pulsar. + + ::: + + ```bash + + helm install \ + --values examples/values-minikube.yaml \ + --set initialize=true \ + --namespace pulsar \ + pulsar-mini apache/pulsar + + ``` + +5. Check the status of all pods. + + ```bash + + kubectl get pods -n pulsar + + ``` + + If all pods start up successfully, you can see that the `STATUS` is changed to `Running` or `Completed`. + + **Output** + + ```bash + + NAME READY STATUS RESTARTS AGE + pulsar-mini-bookie-0 1/1 Running 0 9m27s + pulsar-mini-bookie-init-5gphs 0/1 Completed 0 9m27s + pulsar-mini-broker-0 1/1 Running 0 9m27s + pulsar-mini-grafana-6b7bcc64c7-4tkxd 1/1 Running 0 9m27s + pulsar-mini-prometheus-5fcf5dd84c-w8mgz 1/1 Running 0 9m27s + pulsar-mini-proxy-0 1/1 Running 0 9m27s + pulsar-mini-pulsar-init-t7cqt 0/1 Completed 0 9m27s + pulsar-mini-pulsar-manager-9bcbb4d9f-htpcs 1/1 Running 0 9m27s + pulsar-mini-toolset-0 1/1 Running 0 9m27s + pulsar-mini-zookeeper-0 1/1 Running 0 9m27s + + ``` + +6. Check the status of all services in the namespace `pulsar`. 
+ + ```bash + + kubectl get services -n pulsar + + ``` + + **Output** + + ```bash + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + pulsar-mini-bookie ClusterIP None 3181/TCP,8000/TCP 11m + pulsar-mini-broker ClusterIP None 8080/TCP,6650/TCP 11m + pulsar-mini-grafana LoadBalancer 10.106.141.246 3000:31905/TCP 11m + pulsar-mini-prometheus ClusterIP None 9090/TCP 11m + pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 11m + pulsar-mini-pulsar-manager LoadBalancer 10.103.192.175 9527:30190/TCP 11m + pulsar-mini-toolset ClusterIP None 11m + pulsar-mini-zookeeper ClusterIP None 2888/TCP,3888/TCP,2181/TCP 11m + + ``` + +## Step 2: Use pulsar-admin to create Pulsar tenants/namespaces/topics + +`pulsar-admin` is the CLI (command-Line Interface) tool for Pulsar. In this step, you can use `pulsar-admin` to create resources, including tenants, namespaces, and topics. + +1. Enter the `toolset` container. + + ```bash + + kubectl exec -it -n pulsar pulsar-mini-toolset-0 -- /bin/bash + + ``` + +2. In the `toolset` container, create a tenant named `apache`. + + ```bash + + bin/pulsar-admin tenants create apache + + ``` + + Then you can list the tenants to see if the tenant is created successfully. + + ```bash + + bin/pulsar-admin tenants list + + ``` + + You should see a similar output as below. The tenant `apache` has been successfully created. + + ```bash + + "apache" + "public" + "pulsar" + + ``` + +3. In the `toolset` container, create a namespace named `pulsar` in the tenant `apache`. + + ```bash + + bin/pulsar-admin namespaces create apache/pulsar + + ``` + + Then you can list the namespaces of tenant `apache` to see if the namespace is created successfully. + + ```bash + + bin/pulsar-admin namespaces list apache + + ``` + + You should see a similar output as below. The namespace `apache/pulsar` has been successfully created. + + ```bash + + "apache/pulsar" + + ``` + +4. 
In the `toolset` container, create a topic `test-topic` with `4` partitions in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics create-partitioned-topic apache/pulsar/test-topic -p 4 + + ``` + +5. In the `toolset` container, list all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics list-partitioned-topics apache/pulsar + + ``` + + Then you can see all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + "persistent://apache/pulsar/test-topic" + + ``` + +## Step 3: Use Pulsar client to produce and consume messages + +You can use the Pulsar client to create producers and consumers to produce and consume messages. + +By default, the Pulsar Helm chart exposes the Pulsar cluster through a Kubernetes `LoadBalancer`. In Minikube, you can use the following command to check the proxy service. + +```bash + +kubectl get services -n pulsar | grep pulsar-mini-proxy + +``` + +You will see output similar to the following. + +```bash + +pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 28m + +``` + +This output shows the node ports to which the Pulsar cluster's binary port and HTTP port are mapped. The port after `80:` is the HTTP port while the port after `6650:` is the binary port. + +Then you can find the IP address and exposed ports of your Minikube server by running the following command. + +```bash + +minikube service pulsar-mini-proxy -n pulsar + +``` + +**Output** + +```bash + +|-----------|-------------------|-------------|-------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|-------------------------| +| pulsar | pulsar-mini-proxy | http/80 | http://172.17.0.4:32305 | +| | | pulsar/6650 | http://172.17.0.4:31816 | +|-----------|-------------------|-------------|-------------------------| +🏃 Starting tunnel for service pulsar-mini-proxy. 
+|-----------|-------------------|-------------|------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|------------------------| +| pulsar | pulsar-mini-proxy | | http://127.0.0.1:61853 | +| | | | http://127.0.0.1:61854 | +|-----------|-------------------|-------------|------------------------| + +``` + +At this point, you can get the service URLs to connect to your Pulsar client. Here are URL examples: + +``` + +webServiceUrl=http://127.0.0.1:61853/ +brokerServiceUrl=pulsar://127.0.0.1:61854/ + +``` + +Then you can proceed with the following steps: + +1. Download the Apache Pulsar tarball from the [downloads page](/download/). + +2. Decompress the tarball based on your download file. + + ```bash + + tar -xf .tar.gz + + ``` + +3. Expose `PULSAR_HOME`. + + (1) Enter the directory of the decompressed download file. + + (2) Expose `PULSAR_HOME` as the environment variable. + + ```bash + + export PULSAR_HOME=$(pwd) + + ``` + +4. Configure the Pulsar client. + + In the `${PULSAR_HOME}/conf/client.conf` file, replace `webServiceUrl` and `brokerServiceUrl` with the service URLs you get from the above steps. + +5. Create a subscription to consume messages from `apache/pulsar/test-topic`. + + ```bash + + bin/pulsar-client consume -s sub apache/pulsar/test-topic -n 0 + + ``` + +6. Open a new terminal. In the new terminal, create a producer and send 10 messages to the `test-topic` topic. + + ```bash + + bin/pulsar-client produce apache/pulsar/test-topic -m "---------hello apache pulsar-------" -n 10 + + ``` + +7. Verify the results. + + - From the producer side + + **Output** + + The messages have been produced successfully. + + ```bash + + 18:15:15.489 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 10 messages successfully produced + + ``` + + - From the consumer side + + **Output** + + At the same time, you can receive the messages as below. 
+ + ```bash + + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + + ``` + +## Step 4: Use Pulsar Manager to manage the cluster + +[Pulsar Manager](administration-pulsar-manager.md) is a web-based GUI management tool for managing and monitoring Pulsar. + +1. By default, the `Pulsar Manager` is exposed as a separate `LoadBalancer`. You can open the Pulsar Manager UI using the following command: + + ```bash + + minikube service -n pulsar pulsar-mini-pulsar-manager + + ``` + +2. The Pulsar Manager UI will be open in your browser. You can use the username `pulsar` and password `pulsar` to log into Pulsar Manager. + +3. In Pulsar Manager UI, you can create an environment. + + - Click `New Environment` button in the top-left corner. + - Type `pulsar-mini` for the field `Environment Name` in the popup window. + - Type `http://pulsar-mini-broker:8080` for the field `Service URL` in the popup window. + - Click `Confirm` button in the popup window. + +4. After successfully creating an environment, you are redirected to the `tenants` page of that environment. Then you can create `tenants`, `namespaces` and `topics` using the Pulsar Manager. + +## Step 5: Use Prometheus and Grafana to monitor cluster + +Grafana is an open-source visualization tool, which can be used for visualizing time series data into dashboards. + +1. By default, the Grafana is exposed as a separate `LoadBalancer`. 
You can open the Grafana UI using the following command: + + ```bash + + minikube service pulsar-mini-grafana -n pulsar + + ``` + +2. The Grafana UI is open in your browser. You can use the username `pulsar` and password `pulsar` to log into the Grafana Dashboard. + +3. You can view dashboards for different components of a Pulsar cluster. diff --git a/site2/website/versioned_docs/version-2.10.x/getting-started-pulsar.md b/site2/website/versioned_docs/version-2.10.x/getting-started-pulsar.md new file mode 100644 index 0000000000000..752590f57b558 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/getting-started-pulsar.md @@ -0,0 +1,72 @@ +--- +id: pulsar-2.0 +title: Pulsar 2.0 +sidebar_label: "Pulsar 2.0" +original_id: pulsar-2.0 +--- + +Pulsar 2.0 is a major new release for Pulsar that brings some bold changes to the platform, including [simplified topic names](#topic-names), the addition of the [Pulsar Functions](functions-overview.md) feature, some terminology changes, and more. + +## New features in Pulsar 2.0 + +Feature | Description +:-------|:----------- +[Pulsar Functions](functions-overview.md) | A lightweight compute option for Pulsar + +## Major changes + +There are a few major changes that you should be aware of, as they may significantly impact your day-to-day usage. + +### Properties versus tenants + +Previously, Pulsar had a concept of properties. A property is essentially the exact same thing as a tenant, so the "property" terminology has been removed in version 2.0. The [`pulsar-admin properties`](reference-pulsar-admin.md#pulsar-admin) command-line interface, for example, has been replaced with the [`pulsar-admin tenants`](reference-pulsar-admin.md#pulsar-admin-tenants) interface. In some cases the properties terminology is still used but is now considered deprecated and will be removed entirely in a future release. 
+ +### Topic names + +Prior to version 2.0, *all* Pulsar topics had the following form: + +```http + +{persistent|non-persistent}://property/cluster/namespace/topic + +``` + +The following important changes have been made in Pulsar 2.0: + +* There is no longer a [cluster component](#no-cluster-component) +* Properties have been [renamed to tenants](#properties-versus-tenants) +* You can use a [flexible](#flexible-topic-naming) naming system to shorten many topic names +* `/` is not allowed in topic names + +#### No cluster component + +The cluster component has been removed from topic names. Thus, all topic names now have the following form: + +```http + +{persistent|non-persistent}://tenant/namespace/topic + +``` + +> Existing topics that use the legacy name format will continue to work without any change, and there are no plans to change that. + + +#### Flexible topic naming + +All topic names in Pulsar 2.0 internally have the form shown [above](#no-cluster-component) but you can now use shorthand names in many cases (for the sake of simplicity). The flexible naming system stems from the fact that there is now a default topic type, tenant, and namespace: + +Topic aspect | Default +:------------|:------- +topic type | `persistent` +tenant | `public` +namespace | `default` + +The table below shows some example topic name translations that use implicit defaults: + +Input topic name | Translated topic name +:----------------|:--------------------- +`my-topic` | `persistent://public/default/my-topic` +`my-tenant/my-namespace/my-topic` | `persistent://my-tenant/my-namespace/my-topic` + +> For [non-persistent topics](concepts-messaging.md#non-persistent-topics) you'll need to continue to specify the entire topic name, as the default-based rules for persistent topic names don't apply. 
Thus you cannot use a shorthand name like `non-persistent://my-topic` and would need to use `non-persistent://public/default/my-topic` instead. + diff --git a/site2/website/versioned_docs/version-2.10.x/getting-started-standalone.md b/site2/website/versioned_docs/version-2.10.x/getting-started-standalone.md new file mode 100644 index 0000000000000..f3688c496d58d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/getting-started-standalone.md @@ -0,0 +1,326 @@ +--- +id: getting-started-standalone +title: Set up a standalone Pulsar locally +sidebar_label: "Run Pulsar locally" +original_id: getting-started-standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. The standalone mode includes a Pulsar broker, the necessary [RocksDB](http://rocksdb.org/) and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> **Pulsar in production?** +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of installing Pulsar locally. + +### System requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions. + +:::tip + +By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in the `conf/pulsar_env.sh` file under `PULSAR_MEM`. These are extra options passed to the JVM. + +::: + +:::note + +Broker is only supported on 64-bit JVM. + +::: + +#### Install JDK on M1 +In the current version, Pulsar uses a BookKeeper version which in turn uses RocksDB. RocksDB is compiled to work on x86 architecture and not ARM. Therefore, Pulsar can only work with x86 JDK. This is planned to be fixed in future versions of Pulsar. 
+ +One of the ways to easily install an x86 JDK is to use [SDKMan](http://sdkman.io) as outlined in the following steps: + +1. Install [SDKMan](http://sdkman.io). + + * Method 1: follow instructions on the SDKMan website. + + * Method 2: if you have [Homebrew](https://brew.sh) installed, enter the following command. + +```shell + +brew install sdkman + +``` + +2. Turn on Rosetta2 compatibility for SDKMan by editing `~/.sdkman/etc/config` and changing the following property from `false` to `true`. + +```properties + +sdkman_rosetta2_compatible=true + +``` + +3. Close the current shell / terminal window and open a new one. +4. Make sure you don't have any previously installed JVM of the same version by listing existing installed versions. + +```shell + +sdk list java|grep installed + +``` + +Example output: + +```text + + | >>> | 17.0.3.6.1 | amzn | installed | 17.0.3.6.1-amzn + +``` + +If you have any Java 17 version installed, uninstall it. + +```shell + +sdk uninstall java 17.0.3.6.1-amzn + +``` + +5. Install any Java version later than Java 8. 
+ +```shell + + sdk install java 17.0.3.6.1-amzn + +``` + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar @pulsar:version@ binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:binary_release_url + + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](/tools/pulsar-admin/). +`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker) and more.
    **Note:** Pulsar standalone uses RocksDB as the local metadata store and its configuration file path [`metadataStoreConfigPath`](reference-configuration.md) is configurable in the `standalone.conf` file. For more information about the configurations of RocksDB, see [here](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini) and related [documentation](https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide). +`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) examples. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the `.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by RocksDB and BookKeeper. +`logs` | Logs created by the installation. + +:::tip + +If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +* [Install builtin connectors (optional)](#install-builtin-connectors-optional) +* [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing builtin connectors and tiered storage offloaders. + +::: + +### Install builtin connectors (optional) + +Since `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. 
+To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. +For example, if you download the `pulsar-io-aerospike-@pulsar:version@.nar` connector file, enter the following commands: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure the `connectors` tarball is unzipped in every pulsar directory of the broker (or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. The `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +::: + +### Install tiered storage offloaders (optional) + +:::tip + +- Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +- To enable tiered storage feature, follow the instructions below; otherwise skip this section. 
+ +::: + +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or DC/OS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +::: + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. 
+ +```bash + +$ bin/pulsar standalone + +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash + +21:59:29.327 [DLM-/stream/storage-OrderedScheduler-3-0] INFO org.apache.bookkeeper.stream.storage.impl.sc.StorageContainerImpl - Successfully started storage container (0). +21:59:34.576 [main] INFO org.apache.pulsar.broker.authentication.AuthenticationService - Authentication is disabled +21:59:34.576 [main] INFO org.apache.pulsar.websocket.WebSocketService - Pulsar WebSocket Service started + +``` + +:::tip + +* The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. + +::: + +You can also run the service as a background process using the `bin/pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](reference-cli-tools.md#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). + +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. 
+ +### Consume a message + +The following command consumes a message from the `my-topic` topic with the subscription name `first-subscription`: + +```bash + +$ bin/pulsar-client consume my-topic -s "first-subscription" + +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:17:16.781 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully consumed + +``` + +:::tip + +As you may have noticed, we did not explicitly create the `my-topic` topic from which we consume the message. When you consume a message from a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +::: + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" + +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:21:08.693 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced + +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +:::tip + +If the service runs as a background process using the `bin/pulsar-daemon start standalone` command, then use the `bin/pulsar-daemon stop standalone` command to stop the service. +For more information, see [pulsar-daemon](reference-cli-tools.md#pulsar-daemon). 
+ +::: + diff --git a/site2/website/versioned_docs/version-2.10.x/helm-deploy.md b/site2/website/versioned_docs/version-2.10.x/helm-deploy.md new file mode 100644 index 0000000000000..0e7815e4f4d90 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/helm-deploy.md @@ -0,0 +1,434 @@ +--- +id: helm-deploy +title: Deploy Pulsar cluster using Helm +sidebar_label: "Deployment" +original_id: helm-deploy +--- + +Before running `helm install`, you need to decide how to run Pulsar. +Options can be specified using Helm's `--set option.name=value` command line option. + +## Select configuration options + +In each section, collect the options that are combined to use with the `helm install` command. + +### Kubernetes namespace + +By default, the Pulsar Helm chart is installed to a namespace called `pulsar`. + +```yaml + +namespace: pulsar + +``` + +To install the Pulsar Helm chart into a different Kubernetes namespace, you can include this option in the `helm install` command. + +```bash + +--set namespace= + +``` + +By default, the Pulsar Helm chart doesn't create the namespace. + +```yaml + +namespaceCreate: false + +``` + +To use the Pulsar Helm chart to create the Kubernetes namespace automatically, you can include this option in the `helm install` command. + +```bash + +--set namespaceCreate=true + +``` + +### Persistence + +By default, the Pulsar Helm chart creates Volume Claims with the expectation that a dynamic provisioner creates the underlying Persistent Volumes. + +```yaml + +volumes: + persistence: true + # configure the components to use local persistent volume + # the local provisioner should be installed prior to enable local persistent volume + local_storage: false + +``` + +To use local persistent volumes as the persistent storage for Helm release, you can install the [local storage provisioner](#install-local-storage-provisioner) and include the following option in the `helm install` command. 
+ +```bash + +--set volumes.local_storage=true + +``` + +:::note + +Before installing the production instance of Pulsar, ensure to plan the storage settings to avoid extra storage migration work. Because after initial installation, you must edit Kubernetes objects manually if you want to change storage settings. + +::: + +The Pulsar Helm chart is designed for production use. To use the Pulsar Helm chart in a development environment (such as Minikube), you can disable persistence by including this option in your `helm install` command. + +```bash + +--set volumes.persistence=false + +``` + +### Affinity + +By default, `anti-affinity` is enabled to ensure pods of the same component can run on different nodes. + +```yaml + +affinity: + anti_affinity: true + +``` + +To use the Pulsar Helm chart in a development environment (such as Minikube), you can disable `anti-affinity` by including this option in your `helm install` command. + +```bash + +--set affinity.anti_affinity=false + +``` + +### Components + +The Pulsar Helm chart is designed for production usage. It deploys a production-ready Pulsar cluster, including Pulsar core components and monitoring components. + +You can customize the components to be deployed by turning on/off individual components. + +```yaml + +## Components +## +## Control what components of Apache Pulsar to deploy for the cluster +components: + # zookeeper + zookeeper: true + # bookkeeper + bookkeeper: true + # bookkeeper - autorecovery + autorecovery: true + # broker + broker: true + # functions + functions: true + # proxy + proxy: true + # toolset + toolset: true + # pulsar manager + pulsar_manager: true + +## Monitoring Components +## +## Control what components of the monitoring stack to deploy for the cluster +monitoring: + # monitoring - prometheus + prometheus: true + # monitoring - grafana + grafana: true + +``` + +### Docker images + +The Pulsar Helm chart is designed to enable controlled upgrades. 
So it can configure independent image versions for components. You can customize the images by setting individual component. + +```yaml + +## Images +## +## Control what images to use for each component +images: + zookeeper: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + bookie: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + autorecovery: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + broker: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + proxy: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + functions: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + prometheus: + repository: prom/prometheus + tag: v1.6.3 + pullPolicy: IfNotPresent + grafana: + repository: streamnative/apache-pulsar-grafana-dashboard-k8s + tag: 0.0.4 + pullPolicy: IfNotPresent + pulsar_manager: + repository: apachepulsar/pulsar-manager + tag: v0.1.0 + pullPolicy: IfNotPresent + hasCommand: false + +``` + +### TLS + +The Pulsar Helm chart can be configured to enable TLS (Transport Layer Security) to protect all the traffic between components. Before enabling TLS, you have to provision TLS certificates for the required components. + +#### Provision TLS certificates using cert-manager + +To use the `cert-manager` to provision the TLS certificates, you have to install the [cert-manager](#install-cert-manager) before installing the Pulsar Helm chart. After successfully installing the cert-manager, you can set `certs.internal_issuer.enabled` to `true`. Therefore, the Pulsar Helm chart can use the `cert-manager` to generate `selfsigning` TLS certificates for the configured components. + +```yaml + +certs: + internal_issuer: + enabled: false + component: internal-cert-issuer + type: selfsigning + +``` + +You can also customize the generated TLS certificates by configuring the fields as the following. 
+ +```yaml + +tls: + # common settings for generating certs + common: + # 90d + duration: 2160h + # 15d + renewBefore: 360h + organization: + - pulsar + keySize: 4096 + keyAlgorithm: rsa + keyEncoding: pkcs8 + +``` + +#### Enable TLS + +After installing the `cert-manager`, you can set `tls.enabled` to `true` to enable TLS encryption for the entire cluster. + +```yaml + +tls: + enabled: false + +``` + +You can also configure whether to enable TLS encryption for individual component. + +```yaml + +tls: + # settings for generating certs for proxy + proxy: + enabled: false + cert_name: tls-proxy + # settings for generating certs for broker + broker: + enabled: false + cert_name: tls-broker + # settings for generating certs for bookies + bookie: + enabled: false + cert_name: tls-bookie + # settings for generating certs for zookeeper + zookeeper: + enabled: false + cert_name: tls-zookeeper + # settings for generating certs for recovery + autorecovery: + cert_name: tls-recovery + # settings for generating certs for toolset + toolset: + cert_name: tls-toolset + +``` + +### Authentication + +By default, authentication is disabled. You can set `auth.authentication.enabled` to `true` to enable authentication. +Currently, the Pulsar Helm chart only supports JWT authentication provider. You can set `auth.authentication.provider` to `jwt` to use the JWT authentication provider. + +```yaml + +# Enable or disable broker authentication and authorization. +auth: + authentication: + enabled: false + provider: "jwt" + jwt: + # Enable JWT authentication + # If the token is generated by a secret key, set the usingSecretKey as true. + # If the token is generated by a private key, set the usingSecretKey as false. 
+ + usingSecretKey: false + superUsers: + # broker to broker communication + broker: "broker-admin" + # proxy to broker communication + proxy: "proxy-admin" + # pulsar-admin client to broker/proxy communication + client: "admin" + +``` + +To enable authentication, you can run [prepare helm release](#prepare-helm-release) to generate token secret keys and tokens for three super users specified in the `auth.superUsers` field. The generated token keys and super user tokens are uploaded and stored as Kubernetes secrets prefixed with `-token-`. You can use the following command to find those secrets. + +```bash + +kubectl get secrets -n + +``` + +### Authorization + +By default, authorization is disabled. Authorization can be enabled only when authentication is enabled. + +```yaml + +auth: + authorization: + enabled: false + +``` + +To enable authorization, you can include this option in the `helm install` command. + +```bash + +--set auth.authorization.enabled=true + +``` + +### CPU and RAM resource requirements + +By default, the resource requests and the number of replicas for the Pulsar components in the Pulsar Helm chart are adequate for a small production deployment. If you deploy a non-production instance, you can reduce the defaults to fit into a smaller cluster. + +Once you have all of your configuration options collected, you can install dependent charts before installing the Pulsar Helm chart. + +## Install dependent charts + +### Install local storage provisioner + +To use local persistent volumes as the persistent storage, you need to install a storage provisioner for [local persistent volumes](https://kubernetes.io/blog/2019/04/04/kubernetes-1.14-local-persistent-volumes-ga/). + +One of the easiest ways to get started is to use the local storage provisioner provided along with the Pulsar Helm chart.
+ +``` + +helm repo add streamnative https://charts.streamnative.io +helm repo update +helm install pulsar-storage-provisioner streamnative/local-storage-provisioner + +``` + +### Install cert-manager + +The Pulsar Helm chart uses the [cert-manager](https://github.com/jetstack/cert-manager) to provision and manage TLS certificates automatically. To enable TLS encryption for brokers or proxies, you need to install the cert-manager in advance. + +For details about how to install the cert-manager, follow the [official instructions](https://cert-manager.io/docs/installation/kubernetes/#installing-with-helm). + +Alternatively, we provide a bash script [install-cert-manager.sh](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/cert-manager/install-cert-manager.sh) to install a cert-manager release to the namespace `cert-manager`. + +```bash + +git clone https://github.com/apache/pulsar-helm-chart +cd pulsar-helm-chart +./scripts/cert-manager/install-cert-manager.sh + +``` + +## Prepare Helm release + +Once you have installed all the dependent charts and collected all of your configuration options, you can run [prepare_helm_release.sh](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/prepare_helm_release.sh) to prepare the Helm release. + +```bash + +git clone https://github.com/apache/pulsar-helm-chart +cd pulsar-helm-chart +./scripts/pulsar/prepare_helm_release.sh -n -k + +``` + +The `prepare_helm_release` creates the following resources: + +- A Kubernetes namespace for installing the Pulsar release +- JWT secret keys and tokens for three super users: `broker-admin`, `proxy-admin`, and `admin`. By default, it generates an asymmetric public/private key pair. You can choose to generate a symmetric secret key by specifying `--symmetric`. + - `proxy-admin` role is used for proxies to communicate to brokers. + - `broker-admin` role is used for inter-broker communications. + - `admin` role is used by the admin tools.
+ +## Deploy Pulsar cluster using Helm + +Once you have finished the following three things, you can install a Helm release. + +- Collect all of your configuration options. +- Install dependent charts. +- Prepare the Helm release. + +In this example, the Helm release is named `pulsar`. + +```bash + +helm repo add apache https://pulsar.apache.org/charts +helm repo update +helm install pulsar apache/pulsar \ + --timeout 10m \ + --set initialize=true \ + --set [your configuration options] + +``` + +:::note + +For the first deployment, add `--set initialize=true` option to initialize bookie and Pulsar cluster metadata. + +::: + +You can also use the `--version ` option if you want to install a specific version of Pulsar Helm chart. + +## Monitor deployment + +A list of installed resources is output once the Pulsar cluster is deployed. This may take 5-10 minutes. + +The status of the deployment can be checked by running the `helm status pulsar` command, which can also be done while the deployment is taking place if you run the command in another terminal. + +## Access Pulsar cluster + +The default values will create a `ClusterIP` for the following resources, which you can use to interact with the cluster. + +- Proxy: You can use the IP address to produce and consume messages to the installed Pulsar cluster. +- Pulsar Manager: You can access the Pulsar Manager UI at `http://:9527`. +- Grafana Dashboard: You can access the Grafana dashboard at `http://:3000`.
+ +To find the IP addresses of those components, run the following command: + +```bash + +kubectl get service -n + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/helm-install.md b/site2/website/versioned_docs/version-2.10.x/helm-install.md new file mode 100644 index 0000000000000..9f81f52e0dab1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/helm-install.md @@ -0,0 +1,38 @@ +--- +id: helm-install +title: Install Apache Pulsar using Helm +sidebar_label: "Install" +original_id: helm-install +--- + +Install Apache Pulsar on Kubernetes with the official Pulsar Helm chart. + +## Requirements + +To deploy Apache Pulsar on Kubernetes, the following are required. + +- kubectl 1.14 or higher, compatible with your cluster ([+/- 1 minor release from your cluster](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin)) +- Helm v3 (3.0.2 or higher) +- A Kubernetes cluster, version 1.14 or higher + +## Environment setup + +Before deploying Pulsar, you need to prepare your environment. + +### Tools + +Install [`helm`](helm-tools.md) and [`kubectl`](helm-tools.md) on your computer. + +## Cloud cluster preparation + +To create and connect to the Kubernetes cluster, follow the instructions: + +- [Google Kubernetes Engine](helm-prepare.md#google-kubernetes-engine) + +## Pulsar deployment + +Once the environment is set up and configuration is generated, you can now proceed to the [deployment of Pulsar](helm-deploy.md). + +## Pulsar upgrade + +To upgrade an existing Kubernetes installation, follow the [upgrade documentation](helm-upgrade.md).
diff --git a/site2/website/versioned_docs/version-2.10.x/helm-overview.md b/site2/website/versioned_docs/version-2.10.x/helm-overview.md new file mode 100644 index 0000000000000..125f595cbe68a --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/helm-overview.md @@ -0,0 +1,103 @@ +--- +id: helm-overview +title: Apache Pulsar Helm Chart +sidebar_label: "Overview" +original_id: helm-overview +--- + +[Helm chart](https://github.com/apache/pulsar-helm-chart) supports you to install Apache Pulsar in a cloud-native environment. + +## Introduction + +The Apache Pulsar Helm chart provides one of the most convenient ways to operate Pulsar on Kubernetes. With all the required components, Helm chart is scalable and thus being suitable for large-scale deployments. + +The Apache Pulsar Helm chart contains all components to support the features and functions that Pulsar delivers. You can install and configure these components separately. + +- Pulsar core components: + - ZooKeeper + - Bookies + - Brokers + - Function workers + - Proxies +- Control center: + - Pulsar Manager + - Prometheus + - Grafana + +Moreover, Helm chart supports: + +- Security + - Automatically provisioned TLS certificates, using [Jetstack](https://www.jetstack.io/)'s [cert-manager](https://cert-manager.io/docs/) + - self-signed + - [Let's Encrypt](https://letsencrypt.org/) + - TLS Encryption + - Proxy + - Broker + - Toolset + - Bookie + - ZooKeeper + - Authentication + - JWT + - Authorization +- Storage + - Non-persistence storage + - Persistent volume + - Local persistent volumes +- Functions + - Kubernetes Runtime + - Process Runtime + - Thread Runtime +- Operations + - Independent image versions for all components, enabling controlled upgrades + +## Quick start + +To run with Apache Pulsar Helm chart as fast as possible in a **non-production** use case, we provide a [quick start guide](getting-started-helm.md) for Proof of Concept (PoC) deployments. 
+ +This guide walks you through deploying Apache Pulsar Helm chart with default values and features, but it is *not* suitable for deployments in production-ready environments. To deploy the charts in production under sustained load, you can follow the complete [Installation Guide](helm-install.md). + +## Troubleshooting + +Although we have done our best to make these charts as seamless as possible, troubles do go out of our control occasionally. We have been collecting tips and tricks for troubleshooting common issues. Please check it first before raising an [issue](https://github.com/apache/pulsar/issues/new/choose), and feel free to add your solutions by creating a [Pull Request](https://github.com/apache/pulsar/compare). + +## Installation + +The Apache Pulsar Helm chart contains all required dependencies. + +If you deploy a PoC for testing, we strongly suggest you follow this [Quick Start Guide](getting-started-helm.md) for your first iteration. + +1. [Preparation](helm-prepare.md) +2. [Deployment](helm-deploy.md) + +## Upgrading + +Once the Apache Pulsar Helm chart is installed, you can use `helm upgrade` command to configure and update it. + +```bash + +helm repo add apache https://pulsar.apache.org/charts +helm repo update +helm get values > pulsar.yaml +helm upgrade apache/pulsar -f pulsar.yaml + +``` + +For more detailed information, see [Upgrading](helm-upgrade.md). + +## Uninstallation + +To uninstall the Apache Pulsar Helm chart, run the following command: + +```bash + +helm delete + +``` + +For the purposes of continuity, some Kubernetes objects in these charts cannot be removed by `helm delete` command. It is recommended to *consciously* remove these items, as they affect re-deployment. + +* PVCs for stateful data: remove these items. + - ZooKeeper: This is your metadata. + - BookKeeper: This is your data. + - Prometheus: This is your metrics data, which can be safely removed. 
+* Secrets: if the secrets are generated by the [prepare release script](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/prepare_helm_release.sh), they contain secret keys and tokens. You can use the [cleanup release script](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/cleanup_helm_release.sh) to remove these secrets and tokens as needed. diff --git a/site2/website/versioned_docs/version-2.10.x/helm-prepare.md b/site2/website/versioned_docs/version-2.10.x/helm-prepare.md new file mode 100644 index 0000000000000..e5d56c7e95e34 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/helm-prepare.md @@ -0,0 +1,80 @@ +--- +id: helm-prepare +title: Prepare Kubernetes resources +sidebar_label: "Prepare" +original_id: helm-prepare +--- + +For a fully functional Pulsar cluster, you need a few resources before deploying the Apache Pulsar Helm chart. The following provides instructions to prepare the Kubernetes cluster before deploying the Pulsar Helm chart. + +- [Google Kubernetes Engine](#google-kubernetes-engine) + - [Manual cluster creation](#manual-cluster-creation) + - [Scripted cluster creation](#scripted-cluster-creation) + - [Create cluster with local SSDs](#create-cluster-with-local-ssds) + +## Google Kubernetes Engine + +To get started easier, a script is provided to create the cluster automatically. Alternatively, a cluster can be created manually as well. + +### Manual cluster creation + +To provision a Kubernetes cluster manually, follow the [GKE instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-cluster). + +### Scripted cluster creation + +A [bootstrap script](https://github.com/streamnative/charts/tree/master/scripts/pulsar/gke_bootstrap_script.sh) has been created to automate much of the setup process for users on GCP/GKE. + +The script can: + +1. Create a new GKE cluster. +2. Allow the cluster to modify DNS (Domain Name Server) records. +3. 
Setup `kubectl`, and connect it to the cluster. + +Google Cloud SDK is a dependency of this script, so ensure it is [set up correctly](helm-tools.md#connect-to-a-gke-cluster) for the script to work. + +The script reads various parameters from environment variables and an argument `up` or `down` for bootstrap and clean-up respectively. + +The following table describes all variables. + +| **Variable** | **Description** | **Default value** | +| ------------ | --------------- | ----------------- | +| PROJECT | ID of your GCP project | No default value. It requires to be set. | +| CLUSTER_NAME | Name of the GKE cluster | `pulsar-dev` | +| CONFDIR | Configuration directory to store Kubernetes configuration | ${HOME}/.config/streamnative | +| INT_NETWORK | IP space to use within this cluster | `default` | +| LOCAL_SSD_COUNT | Number of local SSD counts | 4 | +| MACHINE_TYPE | Type of machine to use for nodes | `n1-standard-4` | +| NUM_NODES | Number of nodes to be created in each of the cluster's zones | 4 | +| PREEMPTIBLE | Create nodes using preemptible VM instances in the new cluster. | false | +| REGION | Compute region for the cluster | `us-east1` | +| USE_LOCAL_SSD | Flag to create a cluster with local SSDs | false | +| ZONE | Compute zone for the cluster | `us-east1-b` | +| ZONE_EXTENSION | The extension (`a`, `b`, `c`) of the zone name of the cluster | `b` | +| EXTRA_CREATE_ARGS | Extra arguments passed to create command | | + +Run the script, by passing in your desired parameters. It can work with the default parameters except for `PROJECT` which is required: + +```bash + +PROJECT= scripts/pulsar/gke_bootstrap_script.sh up + +``` + +The script can also be used to clean up the created GKE resources. + +```bash + +PROJECT= scripts/pulsar/gke_bootstrap_script.sh down + +``` + +#### Create cluster with local SSDs + +To install the Pulsar Helm chart using local persistent volumes, you need to create a GKE cluster with local SSDs. 
You can do so by specifying `USE_LOCAL_SSD` to be `true` in the following command to create a Pulsar cluster with local SSDs. + +``` + +PROJECT= USE_LOCAL_SSD=true LOCAL_SSD_COUNT= scripts/pulsar/gke_bootstrap_script.sh up + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/helm-tools.md b/site2/website/versioned_docs/version-2.10.x/helm-tools.md new file mode 100644 index 0000000000000..6ba89006913b6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/helm-tools.md @@ -0,0 +1,43 @@ +--- +id: helm-tools +title: Required tools for deploying Pulsar Helm Chart +sidebar_label: "Required Tools" +original_id: helm-tools +--- + +Before deploying Pulsar to your Kubernetes cluster, there are some tools you must have installed locally. + +## kubectl + +kubectl is the tool that talks to the Kubernetes API. kubectl 1.14 or higher is required and it needs to be compatible with your cluster ([+/- 1 minor release from your cluster](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin)). + +To install kubectl locally, follow the [Kubernetes documentation](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl). + +The server version of kubectl cannot be obtained until we connect to a cluster. + +## Helm + +Helm is the package manager for Kubernetes. The Apache Pulsar Helm Chart is tested and supported with Helm v3. + +### Get Helm + +You can get Helm from the project's [releases page](https://github.com/helm/helm/releases), or follow other options under the official documentation of [installing Helm](https://helm.sh/docs/intro/install/). + +### Next steps + +Once kubectl and Helm are configured, you can configure your [Kubernetes cluster](helm-prepare.md). + +## Additional information + +### Templates + +Templating in Helm is done through Golang's [text/template](https://golang.org/pkg/text/template/) and [sprig](https://godoc.org/github.com/Masterminds/sprig).
+ +For more information about how all the inner workings behave, check these documents: + +- [Functions and Pipelines](https://helm.sh/docs/chart_template_guide/functions_and_pipelines/) +- [Subcharts and Globals](https://helm.sh/docs/chart_template_guide/subcharts_and_globals/) + +### Tips and tricks + +For additional information on developing with Helm, check [tips and tricks section](https://helm.sh/docs/howto/charts_tips_and_tricks/) in the Helm repository. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/helm-upgrade.md b/site2/website/versioned_docs/version-2.10.x/helm-upgrade.md new file mode 100644 index 0000000000000..7d671e6bfb3c1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/helm-upgrade.md @@ -0,0 +1,43 @@ +--- +id: helm-upgrade +title: Upgrade Pulsar Helm release +sidebar_label: "Upgrade" +original_id: helm-upgrade +--- + +Before upgrading your Pulsar installation, you need to check the change log corresponding to the specific release you want to upgrade to and look for any release notes that might pertain to the new Pulsar helm chart version. + +We also recommend that you need to provide all values using the `helm upgrade --set key=value` syntax or the `-f values.yml` instead of using `--reuse-values`, because some of the current values might be deprecated. + +:::note + +You can retrieve your previous `--set` arguments cleanly, with `helm get values `. If you direct this into a file (`helm get values > pulsar.yml`), you can safely pass this file through `-f`, namely `helm upgrade apache/pulsar -f pulsar.yaml`. This safely replaces the behavior of `--reuse-values`. + +::: + +## Steps + +To upgrade Apache Pulsar to a newer version, follow these steps: + +1. Check the change log for the specific version you would like to upgrade to. +2. Go through [deployment documentation](helm-deploy.md) step by step. +3. Extract your previous `--set` arguments with the following command. 
+ + ```bash + + helm get values > pulsar.yaml + + ``` + +4. Decide all the values you need to set. +5. Perform the upgrade, with all `--set` arguments extracted in step 4. + + ```bash + + helm upgrade apache/pulsar \ + --version \ + -f pulsar.yaml \ + --set ... + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-aerospike-sink.md b/site2/website/versioned_docs/version-2.10.x/io-aerospike-sink.md new file mode 100644 index 0000000000000..63d7338a3ba91 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-aerospike-sink.md @@ -0,0 +1,26 @@ +--- +id: io-aerospike-sink +title: Aerospike sink connector +sidebar_label: "Aerospike sink connector" +original_id: io-aerospike-sink +--- + +The Aerospike sink connector pulls messages from Pulsar topics to Aerospike clusters. + +## Configuration + +The configuration of the Aerospike sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `seedHosts` |String| true | No default value| The comma-separated list of one or more Aerospike cluster hosts.

    Each host can be specified as a valid IP address or hostname followed by an optional port number. | +| `keyspace` | String| true |No default value |The Aerospike namespace. | +| `columnName` | String | true| No default value|The Aerospike column name. | +|`userName`|String|false|NULL|The Aerospike username.| +|`password`|String|false|NULL|The Aerospike password.| +| `keySet` | String|false |NULL | The Aerospike set name. | +| `maxConcurrentRequests` |int| false | 100 | The maximum number of concurrent Aerospike transactions that a sink can open. | +| `timeoutMs` | int|false | 100 | This property controls `socketTimeout` and `totalTimeout` for Aerospike transactions. | +| `retries` | int|false | 1 |The maximum number of retries before aborting a write transaction to Aerospike. | diff --git a/site2/website/versioned_docs/version-2.10.x/io-canal-source.md b/site2/website/versioned_docs/version-2.10.x/io-canal-source.md new file mode 100644 index 0000000000000..d1fd43bb0f74e --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-canal-source.md @@ -0,0 +1,235 @@ +--- +id: io-canal-source +title: Canal source connector +sidebar_label: "Canal source connector" +original_id: io-canal-source +--- + +The Canal source connector pulls messages from MySQL to Pulsar topics. + +## Configuration + +The configuration of Canal source connector has the following properties. + +### Property + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `username` | true | None | Canal server account (not MySQL).| +| `password` | true | None | Canal server password (not MySQL). | +|`destination`|true|None|Source destination that Canal source connector connects to. +| `singleHostname` | false | None | Canal server address.| +| `singlePort` | false | None | Canal server port.| +| `cluster` | true | false | Whether to enable cluster mode based on Canal server configuration or not.

  • true: **cluster** mode.
    If set to true, it talks to `zkServers` to figure out the actual database host.

  • false: **standalone** mode.
    If set to false, it connects to the database specified by `singleHostname` and `singlePort`.
  • | +| `zkServers` | true | None | Address and port of the Zookeeper that Canal source connector talks to figure out the actual database host.| +| `batchSize` | false | 1000 | Batch size to fetch from Canal. | + +### Example + +Before using the Canal connector, you can create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "zkServers": "127.0.0.1:2181", + "batchSize": "5120", + "destination": "example", + "username": "", + "password": "", + "cluster": false, + "singleHostname": "127.0.0.1", + "singlePort": "11111", + } + + ``` + +* YAML + + You can create a YAML file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/resources/canal-mysql-source-config.yaml) below to your YAML file. + + ```yaml + + configs: + zkServers: "127.0.0.1:2181" + batchSize: 5120 + destination: "example" + username: "" + password: "" + cluster: false + singleHostname: "127.0.0.1" + singlePort: 11111 + + ``` + +## Usage + +Here is an example of storing MySQL data using the configuration file as above. + +1. Start a MySQL server. + + ```bash + + $ docker pull mysql:5.7 + $ docker run -d -it --rm --name pulsar-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=canal -e MYSQL_USER=mysqluser -e MYSQL_PASSWORD=mysqlpw mysql:5.7 + + ``` + +2. Create a configuration file `mysqld.cnf`. + + ```bash + + [mysqld] + pid-file = /var/run/mysqld/mysqld.pid + socket = /var/run/mysqld/mysqld.sock + datadir = /var/lib/mysql + #log-error = /var/log/mysql/error.log + # By default we only accept connections from localhost + #bind-address = 127.0.0.1 + # Disabling symbolic-links is recommended to prevent assorted security risks + symbolic-links=0 + log-bin=mysql-bin + binlog-format=ROW + server_id=1 + + ``` + +3. Copy the configuration file `mysqld.cnf` to MySQL server. + + ```bash + + $ docker cp mysqld.cnf pulsar-mysql:/etc/mysql/mysql.conf.d/ + + ``` + +4. Restart the MySQL server. 
+ + ```bash + + $ docker restart pulsar-mysql + + ``` + +5. Create a test database in MySQL server. + + ```bash + + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal -e 'create database test;' + + ``` + +6. Start a Canal server and connect to MySQL server. + + ``` + + $ docker pull canal/canal-server:v1.1.2 + $ docker run -d -it --link pulsar-mysql -e canal.auto.scan=false -e canal.destinations=test -e canal.instance.master.address=pulsar-mysql:3306 -e canal.instance.dbUsername=root -e canal.instance.dbPassword=canal -e canal.instance.connectionCharset=UTF-8 -e canal.instance.tsdb.enable=true -e canal.instance.gtidon=false --name=pulsar-canal-server -p 8000:8000 -p 2222:2222 -p 11111:11111 -p 11112:11112 -m 4096m canal/canal-server:v1.1.2 + + ``` + +7. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:2.3.0 + $ docker run -d -it --link pulsar-canal-server -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:2.3.0 bin/pulsar standalone + + ``` + +8. Modify the configuration file `canal-mysql-source-config.yaml`. + + ```yaml + + configs: + zkServers: "" + batchSize: "5120" + destination: "test" + username: "" + password: "" + cluster: false + singleHostname: "pulsar-canal-server" + singlePort: "11111" + + ``` + +9. Create a consumer file `pulsar-client.py`. + + ```python + + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', + subscription_name='my-sub') + + while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + + ``` + +10. Copy the configuration file `canal-mysql-source-config.yaml` and the consumer file `pulsar-client.py` to Pulsar server. + + ```bash + + $ docker cp canal-mysql-source-config.yaml pulsar-standalone:/pulsar/conf/ + $ docker cp pulsar-client.py pulsar-standalone:/pulsar/ + + ``` + +11. 
Download a Canal connector and start it. + + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + $ wget https://archive.apache.org/dist/pulsar/pulsar-2.3.0/connectors/pulsar-io-canal-2.3.0.nar -P connectors + $ ./bin/pulsar-admin source localrun \ + --archive ./connectors/pulsar-io-canal-2.3.0.nar \ + --classname org.apache.pulsar.io.canal.CanalStringSource \ + --tenant public \ + --namespace default \ + --name canal \ + --destination-topic-name my-topic \ + --source-config-file /pulsar/conf/canal-mysql-source-config.yaml \ + --parallelism 1 + + ``` + +12. Consume data from MySQL. + + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + $ python pulsar-client.py + + ``` + +13. Open another window to log in MySQL server. + + ```bash + + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal + + ``` + +14. Create a table, and insert, delete, and update data in MySQL server. + + ```bash + + mysql> use test; + mysql> show tables; + mysql> CREATE TABLE IF NOT EXISTS `test_table`(`test_id` INT UNSIGNED AUTO_INCREMENT,`test_title` VARCHAR(100) NOT NULL, + `test_author` VARCHAR(40) NOT NULL, + `test_date` DATE,PRIMARY KEY ( `test_id` ))ENGINE=InnoDB DEFAULT CHARSET=utf8; + mysql> INSERT INTO test_table (test_title, test_author, test_date) VALUES("a", "b", NOW()); + mysql> UPDATE test_table SET test_title='c' WHERE test_title='a'; + mysql> DELETE FROM test_table WHERE test_title='c'; + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-cassandra-sink.md b/site2/website/versioned_docs/version-2.10.x/io-cassandra-sink.md new file mode 100644 index 0000000000000..d7f0e55abaa31 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-cassandra-sink.md @@ -0,0 +1,59 @@ +--- +id: io-cassandra-sink +title: Cassandra sink connector +sidebar_label: "Cassandra sink connector" +original_id: io-cassandra-sink +--- + +The Cassandra sink connector pulls messages from Pulsar topics to Cassandra clusters. 
+ +## Configuration + +The configuration of the Cassandra sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `roots` | String|true | " " (empty string) | A comma-separated list of Cassandra hosts to connect to.| +| `keyspace` | String|true| " " (empty string)| The key space used for writing pulsar messages.

    **Note: `keyspace` should be created prior to a Cassandra sink.**| +| `keyname` | String|true| " " (empty string)| The key name of the Cassandra column family.

    The column is used for storing Pulsar message keys.

    If a Pulsar message doesn't have any key associated, the message value is used as the key. | +| `columnFamily` | String|true| " " (empty string)| The Cassandra column family name.

    **Note: `columnFamily` should be created before you create a Cassandra sink.**| +| `columnName` | String|true| " " (empty string) | The column name of the Cassandra column family.

    The column is used for storing Pulsar message values. | + +### Example + +Before using the Cassandra sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + } + + ``` + +* YAML + + ``` + + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + + ``` + +## Usage + +For more information about **how to connect Pulsar with Cassandra**, see [here](io-quickstart.md#connect-pulsar-to-apache-cassandra). diff --git a/site2/website/versioned_docs/version-2.10.x/io-cdc-debezium.md b/site2/website/versioned_docs/version-2.10.x/io-cdc-debezium.md new file mode 100644 index 0000000000000..4558ae41d211b --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-cdc-debezium.md @@ -0,0 +1,549 @@ +--- +id: io-cdc-debezium +title: Debezium source connector +sidebar_label: "Debezium source connector" +original_id: io-cdc-debezium +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL +and persists the messages to Pulsar topics. + +## Configuration + +The configuration of Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. 
| +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. | +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server that are monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for the history topic. | +| `pulsar.service.url` | true | null | Pulsar cluster service URL for the offset topic used in Debezium. You can use the `bin/pulsar-admin --admin-url http://pulsar:8080 sources localrun --source-config-file configs/pg-pulsar-config.yaml` command to point to the target Pulsar cluster.| +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | +| `mongodb.hosts` | true | null | The comma-separated list of hostname and port pairs (in the form 'host' or 'host:port') of the MongoDB servers in the replica set. The list can contain a single hostname and port pair. If mongodb.members.auto.discover is set to false, the host and port pair are prefixed with the replica set name (e.g., rs0/localhost:27017). | +| `mongodb.name` | true | null | A unique name that identifies the connector and/or MongoDB replica set or sharded cluster that this connector monitors. Each server should be monitored by at most one Debezium connector, since this server name prefixes all persisted Kafka topics emanating from the MongoDB replica set or cluster. | +| `mongodb.user` | true | null | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.password` | true | null | Password to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.task.id` | true | null | The taskId of the MongoDB connector that attempts to use a separate task for each replica set. | + + + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. 
+ +* JSON + + ```json + + { + "configs": { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "offset.storage.topic": "offset-topic" + } + } + + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## OFFSET_STORAGE_TOPIC_CONFIG + 
offset.storage.topic: "offset-topic" + + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar Debezium connector. + +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. 
+ + ```bash + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MySQL client in docker. + + ```bash + + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. + +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "configs": { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "postgres", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "schema.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + } + + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. 
+ + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for pg, docker image: debezium/example-postgress:0.8 + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "postgres" + database.dbname: "postgres" + database.server.name: "dbserver1" + schema.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker pull debezium/example-postgres:0.8 + $ docker run -d -it --rm --name pulsar-postgresql -p 5432:5432 debezium/example-postgres:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "postgres","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + * Use the **YAML** configuration file as shown previously. 
+ + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + + $ docker exec -it pulsar-postgresql /bin/bash + + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + psql -U postgres postgres + postgres=# \c postgres; + You are now connected to database "postgres" as user "postgres". + postgres=# SET search_path TO inventory; + SET + postgres=# select * from products; + id | name | description | weight + -----+--------------------+---------------------------------------------------------+-------- + 102 | car battery | 12V car battery | 8.1 + 103 | 12-pack drill bits | 12-pack of drill bits with sizes ranging from #40 to #3 | 0.8 + 104 | hammer | 12oz carpenter's hammer | 0.75 + 105 | hammer | 14oz carpenter's hammer | 0.875 + 106 | hammer | 16oz carpenter's hammer | 1 + 107 | rocks | box of assorted rocks | 5.3 + 108 | jacket | water resistent black wind breaker | 0.1 + 109 | spare tire | 24 inch spare tire | 22.2 + 101 | 1111111111 | Small 2-wheel scooter | 3.14 + (9 rows) + + postgres=# UPDATE products SET name='1111111111' WHERE id=107; + UPDATE 1 + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. 
+ + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":107}}�{"schema":{"type":"struct","fields":[{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"before"},{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"after"},{"type":"struct","fields":[{"type":"string","optional":true,"field":"version"},{"type":"string","optional":true,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":false,"field":"db"},{"type":"int64","optional":true,"field":"ts_usec"},{"type":"int64","optional":true,"field":"txId"},{"type":"int64","optional":true,"field":"lsn"},{"type":"string","optional":true,"field":"schema"},{"type":"string","optional":true,"field":"table"},{"type":"boolean","optional":true,"default":false,"field":"snapshot"},{"type":"boolean","optional":true,"field":"last_snapshot_record"}],"optional":false,"name":"io.debezium.connector.postgresql.Source","field":"source"},{"type":"string","optional":false,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"before":{"id":107,"name":"rocks","description":"box of assorted rocks","weight":5.3},"after":{"id":107,"name":"1111111111","description":"box of assorted 
rocks","weight":5.3},"source":{"version":"0.9.2.Final","connector":"postgresql","name":"dbserver1","db":"postgres","ts_usec":1559208957661080,"txId":577,"lsn":23862872,"schema":"inventory","table":"products","snapshot":false,"last_snapshot_record":null},"op":"u","ts_ms":1559208957692}} + + ``` + +## Example of MongoDB + +You need to create a configuration file before using the Pulsar Debezium connector. + +* JSON + + ```json + + { + "configs": { + "mongodb.hosts": "rs0/mongodb:27017", + "mongodb.name": "dbserver1", + "mongodb.user": "debezium", + "mongodb.password": "dbz", + "mongodb.task.id": "1", + "database.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + } + + ``` + +* YAML + + You can create a `debezium-mongodb-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/resources/debezium-mongodb-source-config.yaml) below to the `debezium-mongodb-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mongodb-source" + topicName: "debezium-mongodb-topic" + archive: "connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mongodb, docker image: debezium/example-mongodb:0.10 + mongodb.hosts: "rs0/mongodb:27017" + mongodb.name: "dbserver1" + mongodb.user: "debezium" + mongodb.password: "dbz" + mongodb.task.id: "1" + database.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a MongoDB table using the Pulsar Debezium connector. + + +1. Start a MongoDB server with a database from which Debezium can capture changes. 
+ + ```bash + + $ docker pull debezium/example-mongodb:0.10 + $ docker run -d -it --rm --name pulsar-mongodb -e MONGODB_USER=mongodb -e MONGODB_PASSWORD=mongodb -p 27017:27017 debezium/example-mongodb:0.10 + + ``` + + Use the following commands to initialize the data. + + ```bash + + ./usr/local/bin/init-inventory.sh + + ``` + + If the local host cannot access the container network, you can update the file ```/etc/hosts``` and add a rule such as ```127.0.0.1 6f114527a95f```, where ```6f114527a95f``` is the container ID, which you can get by running ```docker ps -a```. + + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar \ + --name debezium-mongodb-source \ + --destination-topic-name debezium-mongodb-topic \ + --tenant public \ + --namespace default \ + --source-config '{"mongodb.hosts": "rs0/mongodb:27017","mongodb.name": "dbserver1","mongodb.user": "debezium","mongodb.password": "dbz","mongodb.task.id": "1","database.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mongodb-source-config.yaml + + ``` + +4. Subscribe to the topic for the _inventory.products_ table using the subscription name _sub-products_. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MongoDB client in docker. + + ```bash + + $ docker exec -it pulsar-mongodb /bin/bash + + ``` + +6. A MongoDB client pops out. 
+ + ```bash + + mongo -u debezium -p dbz --authenticationDatabase admin localhost:27017/inventory + db.products.update({"_id":NumberLong(104)},{$set:{weight:1.25}}) + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. + + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":"104"}}, value = {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"int64","optional":true,"field":"h"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"after":"{\"_id\": {\"$numberLong\": \"104\"},\"name\": \"hammer\",\"description\": \"12oz carpenter's hammer\",\"weight\": 1.25,\"quantity\": 4}","patch":null,"source":{"version":"0.10.0.Final","connector":"mongodb","name":"dbserver1","ts_ms":1573541905000,"snapshot":"true","db":"inventory","rs":"rs0","collection":"products","ord":1,"h":4983083486544392763},"op":"r","ts_ms":1573541909761}}. 
+ + ``` + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt + +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$Lambda$203/385424085.accept(Unknown Source) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at 
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) + +``` + +If you encounter the above problems in synchronizing data, please refer to [this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + +```$xslt + +max.queue.size= + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-cdc.md b/site2/website/versioned_docs/version-2.10.x/io-cdc.md new file mode 100644 index 0000000000000..e6e662884826d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-cdc.md @@ -0,0 +1,26 @@ +--- +id: io-cdc +title: CDC connector +sidebar_label: "CDC connector" +original_id: io-cdc +--- + +CDC source connectors capture log changes of databases (such as MySQL, MongoDB, and PostgreSQL) into Pulsar. + +> CDC source connectors are built on top of [Canal](https://github.com/alibaba/canal) and [Debezium](https://debezium.io/) and store all data into Pulsar cluster in a persistent, replicated, and partitioned way. + +Currently, Pulsar has the following CDC connectors. 
+ +Name|Java Class +|---|--- +[Canal source connector](io-canal-source.md)|[org.apache.pulsar.io.canal.CanalStringSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) +[Debezium source connector](io-cdc-debezium.md)|
  • [org.apache.pulsar.io.debezium.DebeziumSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/core/src/main/java/org/apache/pulsar/io/debezium/DebeziumSource.java)
  • [org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java)
  • [org.apache.pulsar.io.debezium.postgres.DebeziumPostgresSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java)
  • + +For more information about Canal and Debezium, see the information below. + +Subject | Reference +|---|--- +How to use Canal source connector with MySQL|[Canal guide](https://github.com/alibaba/canal/wiki) +How does Canal work | [Canal tutorial](https://github.com/alibaba/canal/wiki) +How to use Debezium source connector with MySQL | [Debezium guide](https://debezium.io/docs/connectors/mysql/) +How does Debezium work | [Debezium tutorial](https://debezium.io/docs/tutorial/) diff --git a/site2/website/versioned_docs/version-2.10.x/io-cli.md b/site2/website/versioned_docs/version-2.10.x/io-cli.md new file mode 100644 index 0000000000000..f79d301c30b3f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-cli.md @@ -0,0 +1,666 @@ +--- +id: io-cli +title: Connector Admin CLI +sidebar_label: "CLI" +original_id: io-cli +--- + +:::note + +**Important** + +This page is deprecated and not updated anymore. For the latest and complete information about `Pulsar-admin`, including commands, flags, descriptions, and more, see [Pulsar admin docs](/tools/pulsar-admin/). + +::: + +The `pulsar-admin` tool helps you manage Pulsar connectors. + +## `sources` + +An interface for managing Pulsar IO sources (ingress data into Pulsar). + +```bash + +$ pulsar-admin sources subcommands + +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sources` + +* `reload` + + +### `create` + +Submit a Pulsar IO source connector to run in a Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources create options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| `--parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from an external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. +| `--tenant` | The source's tenant. +|`--producer-config`| The custom producer configuration (as a JSON string). + +### `update` + +Update an already submitted Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources update options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| `--parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from an external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. The `source-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. +| `--tenant` | The source's tenant. +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + + +### `delete` + +Delete a Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources delete options + +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `get` + +Get the information about a Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources get options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `status` + +Check the current status of a Pulsar Source. + +#### Usage + +```bash + +$ pulsar-admin sources status options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source ID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `list` + +List all running Pulsar IO source connectors. + +#### Usage + +```bash + +$ pulsar-admin sources list options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `stop` + +Stop a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources stop options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `start` + +Start a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources start options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `restart` + +Restart a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources restart options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `localrun` + +Run a Pulsar IO source connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources localrun options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the Source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The source's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--deserialization-classname`|The SerDe classname for the source. +|`--destination-topic-name`|The Pulsar topic to which data is sent. +|`--disk`|The disk (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +|`--name`|The source’s name.| +|`--namespace`|The source’s namespace.| +|`--parallelism`|The source’s parallelism factor, that is, the number of source instances to run.| +|`--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +|`--source-config`|Source config key/values. +|`--source-config-file`|The path to a YAML config file specifying the source’s configuration. +|`--source-type`|The source's connector provider. +|`--tenant`|The source’s tenant. +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +|`--use-tls`|Use tls connection.
    **Default value: false**. +|`--producer-config`| The custom producer configuration (as a JSON string). + +### `available-sources` + +Get the list of Pulsar IO connector sources supported by Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources available-sources + +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash + +$ pulsar-admin sources reload + +``` + +## `sinks` + +An interface for managing Pulsar IO sinks (egress data from Pulsar). + +```bash + +$ pulsar-admin sinks subcommands + +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sinks` + +* `reload` + + +### `create` + +Submit a Pulsar IO sink connector to run in a Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks create options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. The `sink-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). + +### `update` + +Update a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks update options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + +### `delete` + +Delete a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks delete options + +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `get` + +Get the information about a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks get options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `status` + +Check the current status of a Pulsar sink. + +#### Usage + +```bash + +$ pulsar-admin sinks status options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink ID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `list` + +List all running Pulsar IO sink connectors. + +#### Usage + +```bash + +$ pulsar-admin sinks list options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `stop` + +Stop a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks stop options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `start` + +Start a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks start options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `restart` + +Restart a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks restart options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `localrun` + +Run a Pulsar IO sink connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks localrun options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The sink's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per sink instance (applicable only to the Docker runtime). +| `--custom-schema-inputs` | The map of input topics to Schema types or class names (as a JSON string). +| `--max-redeliver-count` | Maximum number of times that a message is redelivered before being sent to the dead letter queue. +| `--dead-letter-topic` | Name of the dead letter topic where the failing messages are sent. +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +|`--disk`|The disk (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +| `-i`, `--inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name`|The sink’s name.| +|`--namespace`|The sink’s namespace.| +|`--parallelism`|The sink’s parallelism factor, that is, the number of sink instances to run.| +|`--processing-guarantees`|The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--retain-ordering` | Sink consumes and sinks messages in order. +|`--sink-config`|sink config key/values. +|`--sink-config-file`|The path to a YAML config file specifying the sink’s configuration. +|`--sink-type`|The sink's connector provider. +|`--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +|`--tenant`|The sink’s tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--negative-ack-redelivery-delay-ms` | The negatively-acknowledged message redelivery delay in milliseconds. | +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +|`--use-tls`|Use TLS connection.
    **Default value: false**. + +### `available-sinks` + +Get the list of Pulsar IO connector sinks supported by Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks available-sinks + +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash + +$ pulsar-admin sinks reload + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-connectors.md b/site2/website/versioned_docs/version-2.10.x/io-connectors.md new file mode 100644 index 0000000000000..957a02a5a1964 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-connectors.md @@ -0,0 +1,249 @@ +--- +id: io-connectors +title: Built-in connector +sidebar_label: "Built-in connector" +original_id: io-connectors +--- + +Pulsar distribution includes a set of common connectors that have been packaged and tested with the rest of Apache Pulsar. These connectors import and export data from some of the most commonly used data systems. + +Using any of these connectors is as easy as writing a simple connector and running the connector locally or submitting the connector to a Pulsar Functions cluster. + +## Source connector + +Pulsar has various source connectors, which are sorted alphabetically as below. 
+ +### Canal + +* [Configuration](io-canal-source.md#configuration) + +* [Example](io-canal-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) + + +### Debezium MySQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-mysql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java) + +### Debezium PostgreSQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-postgresql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java) + +### Debezium MongoDB + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-mongodb) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/java/org/apache/pulsar/io/debezium/mongodb/DebeziumMongoDbSource.java) + +### Debezium Oracle + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-oracle) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/oracle/src/main/java/org/apache/pulsar/io/debezium/oracle/DebeziumOracleSource.java) + +### Debezium Microsoft SQL Server + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-microsoft-sql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mssql/src/main/java/org/apache/pulsar/io/debezium/mssql/DebeziumMsSqlSource.java) + + +### DynamoDB + +* [Configuration](io-dynamodb-source.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/dynamodb/src/main/java/org/apache/pulsar/io/dynamodb/DynamoDBSource.java) + +### File + +* [Configuration](io-file-source.md#configuration) + +* [Example](io-file-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/file/src/main/java/org/apache/pulsar/io/file/FileSource.java) + +### Flume + +* [Configuration](io-flume-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/FlumeConnector.java) + +### Twitter firehose + +* [Configuration](io-twitter-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/twitter/src/main/java/org/apache/pulsar/io/twitter/TwitterFireHose.java) + +### Kafka + +* [Configuration](io-kafka-source.md#configuration) + +* [Example](io-kafka-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java) + +### Kinesis + +* [Configuration](io-kinesis-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSource.java) + +### Netty + +* [Configuration](io-netty-source.md#configuration) + +* [Example of TCP](io-netty-source.md#tcp) + +* [Example of HTTP](io-netty-source.md#http) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/netty/src/main/java/org/apache/pulsar/io/netty/NettySource.java) + +### NSQ + +* [Configuration](io-nsq-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/nsq/src/main/java/org/apache/pulsar/io/nsq/NSQSource.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-source.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSource.java) + +## Sink connector + +Pulsar has various sink connectors, which are sorted alphabetically as below. + +### Aerospike + +* [Configuration](io-aerospike-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/aerospike/src/main/java/org/apache/pulsar/io/aerospike/AerospikeStringSink.java) + +### Cassandra + +* [Configuration](io-cassandra-sink.md#configuration) + +* [Example](io-cassandra-sink.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/cassandra/src/main/java/org/apache/pulsar/io/cassandra/CassandraStringSink.java) + +### ElasticSearch + +* [Configuration](io-elasticsearch-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/elastic-search/src/main/java/org/apache/pulsar/io/elasticsearch/ElasticSearchSink.java) + +### Flume + +* [Configuration](io-flume-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/sink/StringSink.java) + +### HBase + +* [Configuration](io-hbase-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hbase/src/main/java/org/apache/pulsar/io/hbase/HbaseAbstractConfig.java) + +### HDFS2 + +* [Configuration](io-hdfs2-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs2/src/main/java/org/apache/pulsar/io/hdfs2/AbstractHdfsConnector.java) + +### HDFS3 + +* [Configuration](io-hdfs3-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs3/src/main/java/org/apache/pulsar/io/hdfs3/AbstractHdfsConnector.java) + +### InfluxDB + +* [Configuration](io-influxdb-sink.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/influxdb/src/main/java/org/apache/pulsar/io/influxdb/InfluxDBGenericRecordSink.java) + +### JDBC ClickHouse + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-clickhouse) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/clickhouse/src/main/java/org/apache/pulsar/io/jdbc/ClickHouseJdbcAutoSchemaSink.java) + +### JDBC MariaDB + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-mariadb) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/mariadb/src/main/java/org/apache/pulsar/io/jdbc/MariadbJdbcAutoSchemaSink.java) + +### JDBC PostgreSQL + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-postgresql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/postgres/src/main/java/org/apache/pulsar/io/jdbc/PostgresJdbcAutoSchemaSink.java) + +### JDBC SQLite + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-sqlite) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/sqlite/src/main/java/org/apache/pulsar/io/jdbc/SqliteJdbcAutoSchemaSink.java) + +### Kafka + +* [Configuration](io-kafka-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java) + +### Kinesis + +* [Configuration](io-kinesis-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSink.java) + +### MongoDB + +* [Configuration](io-mongo-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/mongo/src/main/java/org/apache/pulsar/io/mongodb/MongoSink.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-sink.md#configuration) + +* 
[Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSink.java) + +### Redis + +* [Configuration](io-redis-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/redis/src/main/java/org/apache/pulsar/io/redis/RedisAbstractConfig.java) + +### Solr + +* [Configuration](io-solr-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/solr/src/main/java/org/apache/pulsar/io/solr/SolrSinkConfig.java) + diff --git a/site2/website/versioned_docs/version-2.10.x/io-debezium-source.md b/site2/website/versioned_docs/version-2.10.x/io-debezium-source.md new file mode 100644 index 0000000000000..aedbd18dce421 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-debezium-source.md @@ -0,0 +1,800 @@ +--- +id: io-debezium-source +title: Debezium source connector +sidebar_label: "Debezium source connector" +original_id: io-debezium-source +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL +and persists the messages to Pulsar topics. + +## Configuration + +The configuration of Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. | +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. 
| +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and it is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server which is monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for history topic. | +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | +| `json-with-envelope` | false | false | Present the message that only consists of payload. | +| `database.history.pulsar.reader.config` | false | null | The configs of the reader for the database schema history topic, in the form of a JSON string with key-value pairs.
    **Note:** This property is only available in 2.10.2 and later versions. | +| `offset.storage.reader.config` | false | null | The configs of the reader for the Kafka connector offsets topic, in the form of a JSON string with key-value pairs.
    **Note:** This property is only available in 2.10.2 and later versions.| + +### Converter Options + +1. org.apache.kafka.connect.json.JsonConverter + +This config `json-with-envelope` is valid only for the JsonConverter. Its default value is false, in which case the consumer uses the schema ` +Schema.KeyValue(Schema.AUTO_CONSUME(), Schema.AUTO_CONSUME(), KeyValueEncodingType.SEPARATED)`, +and the message only consists of payload. + +If the config `json-with-envelope` value is true, the consumer uses the schema +`Schema.KeyValue(Schema.BYTES, Schema.BYTES)`, and the message consists of schema and payload. + +2. org.apache.pulsar.kafka.shade.io.confluent.connect.avro.AvroConverter + +If users select the AvroConverter, then the Pulsar consumer should use the schema `Schema.KeyValue(Schema.AUTO_CONSUME(), +Schema.AUTO_CONSUME(), KeyValueEncodingType.SEPARATED)`, and the message consists of payload. + +### MongoDB Configuration +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `mongodb.hosts` | true | null | The comma-separated list of hostname and port pairs (in the form 'host' or 'host:port') of the MongoDB servers in the replica set. The list contains a single hostname and a port pair. If mongodb.members.auto.discover is set to false, the host and port pair are prefixed with the replica set name (e.g., rs0/localhost:27017). | +| `mongodb.name` | true | null | A unique name that identifies the connector and/or MongoDB replica set or sharded cluster that this connector monitors. Each server should be monitored by at most one Debezium connector, since this server name prefixes all persisted Kafka topics emanating from the MongoDB replica set or cluster. | +| `mongodb.user` | true | null | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.password` | true | null | Password to be used when connecting to MongoDB.
This is required only when MongoDB is configured to use authentication. | +| `mongodb.task.id` | true | null | The taskId of the MongoDB connector that attempts to use a separate task for each replica set. | + +### Customize the Reader config for the metadata topics + +:::note + +The customization is only available in 2.10.2 and later versions. + +::: + +The Debezium Connector exposes `database.history.pulsar.reader.config` and `offset.storage.reader.config` to configure the reader of the database schema history topic and the Kafka connector offsets topic. For example, it can be used to configure the subscription name and other reader configurations. You can find the available configurations at [ReaderConfigurationData](https://github.com/apache/pulsar/blob/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/conf/ReaderConfigurationData.java). + +For example, to configure the subscription name for both Readers, you can add the following configuration: +* JSON + + ```json + { + "configs": { + "database.history.pulsar.reader.config": "{\"subscriptionName\":\"history-reader\"}", + "offset.storage.reader.config": "{\"subscriptionName\":\"offset-reader\"}" + } + } + ``` + +* YAML + + ```yaml + configs: + database.history.pulsar.reader.config: "{\"subscriptionName\":\"history-reader\"}" + offset.storage.reader.config: "{\"subscriptionName\":\"offset-reader\"}" + ``` + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file.
+ +* JSON + + ```json + + { + "configs": { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "offset.storage.topic": "offset-topic" + } + } + + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: "offset-topic" + + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar 
Debezium connector. + +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`. 
+ - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. + + ```bash + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MySQL client in docker. + + ```bash + + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. + +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. 
+ +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "changeme", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "plugin.name": "pgoutput", + "schema.whitelist": "public", + "table.whitelist": "public.users", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for postgres version 10+, official docker image: postgres:<10+> + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "changeme" + database.dbname: "postgres" + database.server.name: "dbserver1" + plugin.name: "pgoutput" + schema.whitelist: "public" + table.whitelist: "public.users" + + ## PULSAR_SERVICE_URL_CONFIG + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +Notice that `pgoutput` is a standard plugin of Postgres introduced in version 10 - [see Postgres architecture docu](https://www.postgresql.org/docs/10/logical-replication-architecture.html). You don't need to install anything, just make sure the WAL level is set to `logical` (see docker command below and [Postgres docu](https://www.postgresql.org/docs/current/runtime-config-wal.html)). + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. 
+ + ```bash + + $ docker run -d -it --rm \ + --name pulsar-postgres \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=changeme \ + postgres:13.3 -c wal_level=logical + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "changeme","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "public","table.whitelist": "public.users","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`. + - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. 
The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + + ``` + +4. Subscribe to the topic of the _public.users_ table with the subscription name _sub-users_. + + ``` + + $ bin/pulsar-client consume -s "sub-users" public/default/dbserver1.public.users -n 0 + + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + + $ docker exec -it pulsar-postgres /bin/bash + + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to create sample data in the table _users_. + + ``` + + psql -U postgres -h localhost -p 5432 + Password for user postgres: + + CREATE TABLE users( + id BIGINT GENERATED ALWAYS AS IDENTITY, PRIMARY KEY(id), + hash_firstname TEXT NOT NULL, + hash_lastname TEXT NOT NULL, + gender VARCHAR(6) NOT NULL CHECK (gender IN ('male', 'female')) + ); + + INSERT INTO users(hash_firstname, hash_lastname, gender) + SELECT md5(RANDOM()::TEXT), md5(RANDOM()::TEXT), CASE WHEN RANDOM() < 0.5 THEN 'male' ELSE 'female' END FROM generate_series(1, 100); + + postgres=# select * from users; + + id | hash_firstname | hash_lastname | gender + -------+----------------------------------+----------------------------------+-------- + 1 | 02bf7880eb489edc624ba637f5ab42bd | 3e742c2cc4217d8e3382cc251415b2fb | female + 2 | dd07064326bb9119189032316158f064 | 9c0e938f9eddbd5200ba348965afbc61 | male + 3 | 2c5316fdd9d6595c1cceb70eed12e80c | 8a93d7d8f9d76acfaaa625c82a03ea8b | female + 4 | 3dfa3b4f70d8cd2155567210e5043d2b | 32c156bc28f7f03ab5d28e2588a3dc19 | female + + + postgres=# UPDATE users SET hash_firstname='maxim' WHERE id=1; + UPDATE 1 + + ``` + + In the terminal window of subscribing
topic, you can receive the following messages. + + ```bash + + ----- got message ----- + {"before":null,"after":{"id":1,"hash_firstname":"maxim","hash_lastname":"292113d30a3ccee0e19733dd7f88b258","gender":"male"},"source":{"version":"1.0.0.Final","connector":"postgresql","name":"foobar","ts_ms":1624045862644,"snapshot":"false","db":"postgres","schema":"public","table":"users","txId":595,"lsn":24419784,"xmin":null},"op":"u","ts_ms":1624045862648} + ...many more + + ``` + +## Example of MongoDB + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "mongodb.hosts": "rs0/mongodb:27017", + "mongodb.name": "dbserver1", + "mongodb.user": "debezium", + "mongodb.password": "dbz", + "mongodb.task.id": "1", + "database.whitelist": "inventory", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-mongodb-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/resources/debezium-mongodb-source-config.yaml) below to the `debezium-mongodb-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mongodb-source" + topicName: "debezium-mongodb-topic" + archive: "connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mongodb, docker image: debezium/example-mongodb:0.10 + mongodb.hosts: "rs0/mongodb:27017" + mongodb.name: "dbserver1" + mongodb.user: "debezium" + mongodb.password: "dbz" + mongodb.task.id: "1" + database.whitelist: "inventory" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a MongoDB collection using the Pulsar Debezium connector. + + +1.
Start a MongoDB server with a database from which Debezium can capture changes. + + ```bash + + $ docker pull debezium/example-mongodb:0.10 + $ docker run -d -it --rm --name pulsar-mongodb -e MONGODB_USER=mongodb -e MONGODB_PASSWORD=mongodb -p 27017:27017 debezium/example-mongodb:0.10 + + ``` + + Use the following commands to initialize the data. + + ``` bash + + ./usr/local/bin/init-inventory.sh + + ``` + + If the local host cannot access the container network, you can update the file ```/etc/hosts``` and add a rule ```127.0.0.1 6 f114527a95f```. Here, `f114527a95f` is the container ID, which you can get by running ```docker ps -a```. + + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar \ + --name debezium-mongodb-source \ + --destination-topic-name debezium-mongodb-topic \ + --tenant public \ + --namespace default \ + --source-config '{"mongodb.hosts": "rs0/mongodb:27017","mongodb.name": "dbserver1","mongodb.user": "debezium","mongodb.password": "dbz","mongodb.task.id": "1","database.whitelist": "inventory","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name (`database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`.
+ - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mongodb-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MongoDB client in docker. + + ```bash + + $ docker exec -it pulsar-mongodb /bin/bash + + ``` + +6. A MongoDB client pops out. + + ```bash + + mongo -u debezium -p dbz --authenticationDatabase admin localhost:27017/inventory + db.products.update({"_id":NumberLong(104)},{$set:{weight:1.25}}) + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. 
+ + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":"104"}}, value = {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"int64","optional":true,"field":"h"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"after":"{\"_id\": {\"$numberLong\": \"104\"},\"name\": \"hammer\",\"description\": \"12oz carpenter's hammer\",\"weight\": 1.25,\"quantity\": 4}","patch":null,"source":{"version":"0.10.0.Final","connector":"mongodb","name":"dbserver1","ts_ms":1573541905000,"snapshot":"true","db":"inventory","rs":"rs0","collection":"products","ord":1,"h":4983083486544392763},"op":"r","ts_ms":1573541909761}}. + + ``` + +## Example of Oracle + +### Packaging + +Oracle connector does not include Oracle JDBC driver and you need to package it with the connector. +Major reasons for not including the drivers are the variety of versions and Oracle licensing. 
It is recommended to use the driver provided with your Oracle DB installation, or you can [download](https://www.oracle.com/database/technologies/appdev/jdbc.html) one. +The integration tests have an [example](https://github.com/apache/pulsar/blob/e2bc52d40450fa00af258c4432a5b71d50a5c6e0/tests/docker-images/latest-version-image/Dockerfile#L110-L122) of packaging the driver into the connector nar file. + +### Configuration + +Debezium [requires](https://debezium.io/documentation/reference/1.5/connectors/oracle.html#oracle-overview) Oracle DB with LogMiner or XStream API enabled. +Supported options and steps for enabling them vary from version to version of Oracle DB. +Steps outlined in the [documentation](https://debezium.io/documentation/reference/1.5/connectors/oracle.html#oracle-overview) and used in the [integration test](https://github.com/apache/pulsar/blob/master/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io/sources/debezium/DebeziumOracleDbSourceTester.java) may or may not work for the version and edition of Oracle DB you are using. +Please refer to the [documentation for Oracle DB](https://docs.oracle.com/en/database/oracle/oracle-database/) as needed. + +Similarly to other connectors, you can use JSON or YAML to configure the connector.
+Using yaml as an example, you can create a debezium-oracle-source-config.yaml file like: + +* JSON + +```json + +{ + "database.hostname": "localhost", + "database.port": "1521", + "database.user": "dbzuser", + "database.password": "dbz", + "database.dbname": "XE", + "database.server.name": "XE", + "schema.exclude.list": "system,dbzuser", + "snapshot.mode": "initial", + "topic.namespace": "public/default", + "task.class": "io.debezium.connector.oracle.OracleConnectorTask", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "typeClassName": "org.apache.pulsar.common.schema.KeyValue", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.tcpKeepAlive": "true", + "decimal.handling.mode": "double", + "database.history.pulsar.topic": "debezium-oracle-source-history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" +} + +``` + +* YAML + +```yaml + +tenant: "public" +namespace: "default" +name: "debezium-oracle-source" +topicName: "debezium-oracle-topic" +parallelism: 1 + +className: "org.apache.pulsar.io.debezium.oracle.DebeziumOracleSource" +database.dbname: "XE" + +configs: + database.hostname: "localhost" + database.port: "1521" + database.user: "dbzuser" + database.password: "dbz" + database.dbname: "XE" + database.server.name: "XE" + schema.exclude.list: "system,dbzuser" + snapshot.mode: "initial" + topic.namespace: "public/default" + task.class: "io.debezium.connector.oracle.OracleConnectorTask" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + key.converter: "org.apache.kafka.connect.json.JsonConverter" + typeClassName: "org.apache.pulsar.common.schema.KeyValue" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.tcpKeepAlive: "true" + decimal.handling.mode: "double" + database.history.pulsar.topic: "debezium-oracle-source-history-topic" + 
database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + +``` + +For the full list of configuration properties supported by Debezium, see [Debezium Connector for Oracle](https://debezium.io/documentation/reference/1.5/connectors/oracle.html#oracle-connector-properties). + +## Example of Microsoft SQL + +### Configuration + +Debezium [requires](https://debezium.io/documentation/reference/1.5/connectors/sqlserver.html#sqlserver-overview) SQL Server with CDC enabled. +The steps for enabling CDC are outlined in the [documentation](https://debezium.io/documentation/reference/1.5/connectors/sqlserver.html#setting-up-sqlserver) and used in the [integration test](https://github.com/apache/pulsar/blob/master/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io/sources/debezium/DebeziumMsSqlSourceTester.java). +For more information, see [Enable and disable change data capture in Microsoft SQL Server](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/enable-and-disable-change-data-capture-sql-server). + +Similarly to other connectors, you can use JSON or YAML to configure the connector.
+ +* JSON + +```json + +{ + "database.hostname": "localhost", + "database.port": "1433", + "database.user": "sa", + "database.password": "MyP@ssw0rd!", + "database.dbname": "MyTestDB", + "database.server.name": "mssql", + "snapshot.mode": "schema_only", + "topic.namespace": "public/default", + "task.class": "io.debezium.connector.sqlserver.SqlServerConnectorTask", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "typeClassName": "org.apache.pulsar.common.schema.KeyValue", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.tcpKeepAlive": "true", + "decimal.handling.mode": "double", + "database.history.pulsar.topic": "debezium-mssql-source-history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" +} + +``` + +* YAML + +```yaml + +tenant: "public" +namespace: "default" +name: "debezium-mssql-source" +topicName: "debezium-mssql-topic" +parallelism: 1 + +className: "org.apache.pulsar.io.debezium.mssql.DebeziumMsSqlSource" +database.dbname: "mssql" + +configs: + database.hostname: "localhost" + database.port: "1433" + database.user: "sa" + database.password: "MyP@ssw0rd!" 
+ database.dbname: "MyTestDB" + database.server.name: "mssql" + snapshot.mode: "schema_only" + topic.namespace: "public/default" + task.class: "io.debezium.connector.sqlserver.SqlServerConnectorTask" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + key.converter: "org.apache.kafka.connect.json.JsonConverter" + typeClassName: "org.apache.pulsar.common.schema.KeyValue" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.tcpKeepAlive: "true" + decimal.handling.mode: "double" + database.history.pulsar.topic: "debezium-mssql-source-history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + +``` + +For the full list of configuration properties supported by Debezium, see [Debezium Connector for MS SQL](https://debezium.io/documentation/reference/1.5/connectors/sqlserver.html#sqlserver-connector-properties). + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt + +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$Lambda$203/385424085.accept(Unknown Source) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at 
io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) + +``` + +If you encounter the above problems in synchronizing data, please refer to [this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + 
+```$xslt + +max.queue.size= + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-debug.md b/site2/website/versioned_docs/version-2.10.x/io-debug.md new file mode 100644 index 0000000000000..890d5f692f7b1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-debug.md @@ -0,0 +1,407 @@ +--- +id: io-debug +title: How to debug Pulsar connectors +sidebar_label: "Debug" +original_id: io-debug +--- +This guide explains how to debug connectors in localrun or cluster mode and gives a debugging checklist. +To better demonstrate how to debug Pulsar connectors, here takes a Mongo sink connector as an example. + +**Deploy a Mongo sink environment** +1. Start a Mongo service. + + ```bash + + docker pull mongo:4 + docker run -d -p 27017:27017 --name pulsar-mongo -v $PWD/data:/data/db mongo:4 + + ``` + +2. Create a DB and a collection. + + ```bash + + docker exec -it pulsar-mongo /bin/bash + mongo + > use pulsar + > db.createCollection('messages') + > exit + + ``` + +3. Start Pulsar standalone. + + ```bash + + docker pull apachepulsar/pulsar:2.4.0 + docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --link pulsar-mongo --name pulsar-mongo-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone + + ``` + +4. Configure the Mongo sink with the `mongo-sink-config.yaml` file. + + ```bash + + configs: + mongoUri: "mongodb://pulsar-mongo:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + + ``` + + ```bash + + docker cp mongo-sink-config.yaml pulsar-mongo-standalone:/pulsar/ + + ``` + +5. Download the Mongo sink nar package. + + ```bash + + docker exec -it pulsar-mongo-standalone /bin/bash + curl -O http://apache.01link.hk/pulsar/pulsar-2.4.0/connectors/pulsar-io-mongo-2.4.0.nar + + ``` + +## Debug in localrun mode +Start the Mongo sink in localrun mode using the `localrun` command. 
+:::tip + +For more information about the `localrun` command, see [`localrun`](reference-connector-admin.md/#localrun-1). + +::: + +```bash + +./bin/pulsar-admin sinks localrun \ +--archive connectors/pulsar-io-mongo-@pulsar:version@.nar \ +--tenant public --namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 + +``` + +### Use connector log +Use one of the following methods to get a connector log in localrun mode: +* After executing the `localrun` command, the **log is automatically printed on the console**. +* The log is located at: + + ```bash + + logs/functions/tenant/namespace/function-name/function-name-instance-id.log + + ``` + + **Example** + + The path of the Mongo sink connector is: + + ```bash + + logs/functions/public/default/pulsar-mongo-sink/pulsar-mongo-sink-0.log + + ``` + +To clearly explain the log information, here breaks down the large block of information into small blocks and add descriptions for each block. +* This piece of log information shows the storage path of the nar package after decompression. + + ``` + + 08:21:54.132 [main] INFO org.apache.pulsar.common.nar.NarClassLoader - Created class loader with paths: [file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/, file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/, + + ``` + + :::tip + + If `class cannot be found` exception is thrown, check whether the nar file is decompressed in the folder `file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/` or not. + + ::: + +* This piece of log information illustrates the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, resources, and so on, which can be used to **check whether the Mongo sink connector is configured correctly or not**. 
+ + ```bash + + 08:21:55.390 [main] INFO org.apache.pulsar.functions.runtime.ThreadRuntime - ThreadContainer starting function with instance config InstanceConfig(instanceId=0, functionId=853d60a1-0c48-44d5-9a5c-6917386476b2, functionVersion=c2ce1458-b69e-4175-88c0-a0a856a2be8c, functionDetails=tenant: "public" + namespace: "default" + name: "pulsar-mongo-sink" + className: "org.apache.pulsar.functions.api.utils.IdentityFunction" + autoAck: true + parallelism: 1 + source { + typeClassName: "[B" + inputSpecs { + key: "test-mongo" + value { + } + } + cleanupSubscription: true + } + sink { + className: "org.apache.pulsar.io.mongodb.MongoSink" + configs: "{\"mongoUri\":\"mongodb://pulsar-mongo:27017\",\"database\":\"pulsar\",\"collection\":\"messages\",\"batchSize\":2,\"batchTimeMs\":500}" + typeClassName: "[B" + } + resources { + cpu: 1.0 + ram: 1073741824 + disk: 10737418240 + } + componentType: SINK + , maxBufferedTuples=1024, functionAuthenticationSpec=null, port=38459, clusterName=local) + + ``` + +* This piece of log information demonstrates the status of the connections to Mongo and configuration information. 
+ + ```bash + + 08:21:56.231 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.connection - Opened connection [connectionId{localValue:1, serverValue:8}] to pulsar-mongo:27017 + 08:21:56.326 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.cluster - Monitor thread successfully connected to server with description ServerDescription{address=pulsar-mongo:27017, type=STANDALONE, state=CONNECTED, ok=true, version=ServerVersion{versionList=[4, 2, 0]}, minWireVersion=0, maxWireVersion=8, maxDocumentSize=16777216, logicalSessionTimeoutMinutes=30, roundTripTimeNanos=89058800} + + ``` + +* This piece of log information explains the configuration of consumers and clients, including the topic name, subscription name, subscription type, and so on. + + ```bash + + 08:21:56.719 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Starting Pulsar consumer status recorder with config: { + "topicNames" : [ "test-mongo" ], + "topicsPattern" : null, + "subscriptionName" : "public/default/pulsar-mongo-sink", + "subscriptionType" : "Shared", + "receiverQueueSize" : 1000, + "acknowledgementsGroupTimeMicros" : 100000, + "negativeAckRedeliveryDelayMicros" : 60000000, + "maxTotalReceiverQueueSizeAcrossPartitions" : 50000, + "consumerName" : null, + "ackTimeoutMillis" : 0, + "tickDurationMillis" : 1000, + "priorityLevel" : 0, + "cryptoFailureAction" : "CONSUME", + "properties" : { + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink", + "instance_id" : "0" + }, + "readCompacted" : false, + "subscriptionInitialPosition" : "Latest", + "patternAutoDiscoveryPeriod" : 1, + "regexSubscriptionMode" : "PersistentOnly", + "deadLetterPolicy" : null, + "autoUpdatePartitions" : true, + "replicateSubscriptionState" : false, + "resetIncludeHead" : false + } + 08:21:56.726 [pulsar-client-io-1-1] INFO 
org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Pulsar client config: { + "serviceUrl" : "pulsar://localhost:6650", + "authPluginClassName" : null, + "authParams" : null, + "operationTimeoutMs" : 30000, + "statsIntervalSeconds" : 60, + "numIoThreads" : 1, + "numListenerThreads" : 1, + "connectionsPerBroker" : 1, + "useTcpNoDelay" : true, + "useTls" : false, + "tlsTrustCertsFilePath" : null, + "tlsAllowInsecureConnection" : false, + "tlsHostnameVerificationEnable" : false, + "concurrentLookupRequest" : 5000, + "maxLookupRequest" : 50000, + "maxNumberOfRejectedRequestPerConnection" : 50, + "keepAliveIntervalSeconds" : 30, + "connectionTimeoutMs" : 10000, + "requestTimeoutMs" : 60000, + "defaultBackoffIntervalNanos" : 100000000, + "maxBackoffIntervalNanos" : 30000000000 + } + + ``` + +## Debug in cluster mode +You can use the following methods to debug a connector in cluster mode: +* [Use connector log](#use-connector-log) +* [Use admin CLI](#use-admin-cli) +### Use connector log +In cluster mode, multiple connectors can run on a worker. To find the log path of a specified connector, use the `workerId` to locate the connector log. +### Use admin CLI +Pulsar admin CLI helps you debug Pulsar connectors with the following subcommands: +* [`get`](#get) + +* [`status`](#status) +* [`topics stats`](#topics-stats) + +**Create a Mongo sink** + +```bash + +./bin/pulsar-admin sinks create \ +--archive pulsar-io-mongo-2.4.0.nar \ +--tenant public \ +--namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 + +``` + +### `get` +Use the `get` command to get the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, and so on. 
+ +```bash + +./bin/pulsar-admin sinks get --tenant public --namespace default --name pulsar-mongo-sink +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-mongo-sink", + "className": "org.apache.pulsar.io.mongodb.MongoSink", + "inputSpecs": { + "test-mongo": { + "isRegexPattern": false + } + }, + "configs": { + "mongoUri": "mongodb://pulsar-mongo:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": 2.0, + "batchTimeMs": 500.0 + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +:::tip + +For more information about the `get` command, see [`get`](reference-connector-admin.md/#get-1). + +::: + +### `status` +Use the `status` command to get the current status about the Mongo sink connector, such as the number of instance, the number of running instance, instanceId, workerId and so on. + +```bash + +./bin/pulsar-admin sinks status +--tenant public \ +--namespace default \ +--name pulsar-mongo-sink +{ +"numInstances" : 1, +"numRunning" : 1, +"instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-5d202832fd18-8080" + } +} ] +} + +``` + +:::tip + +For more information about the `status` command, see [`status`](reference-connector-admin.md/#stauts-1). +If there are multiple connectors running on a worker, `workerId` can locate the worker on which the specified connector is running. + +::: + +### `topics stats` +Use the `topics stats` command to get the stats for a topic and its connected producer and consumer, such as whether the topic has received messages or not, whether there is a backlog of messages or not, the available permits and other key information. 
All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +```bash + +./bin/pulsar-admin topics stats test-mongo +{ + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "averageMsgSize" : 0.0, + "storageSize" : 1, + "publishers" : [ ], + "subscriptions" : { + "public/default/pulsar-mongo-sink" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "msgBacklog" : 0, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "type" : "Shared", + "msgRateExpired" : 0.0, + "consumers" : [ { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "consumerName" : "dffdd", + "availablePermits" : 999, + "unackedMessages" : 0, + "blockedConsumerOnUnackedMsgs" : false, + "metadata" : { + "instance_id" : "0", + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink" + }, + "connectedSince" : "2019-08-26T08:48:07.582Z", + "clientVersion" : "2.4.0", + "address" : "/172.17.0.3:57790" + } ], + "isReplicated" : false + } + }, + "replication" : { }, + "deduplicationStatus" : "Disabled" +} + +``` + +:::tip + +For more information about the `topic stats` command, see [`topic stats`](/tools/pulsar-admin/). + +::: + +## Checklist +This checklist indicates the major areas to check when you debug connectors. It is a reminder of what to look for to ensure a thorough review and an evaluation tool to get the status of connectors. +* Does Pulsar start successfully? + +* Does the external service run normally? + +* Is the nar package complete? + +* Is the connector configuration file correct? + +* In localrun mode, run a connector and check the printed information (connector log) on the console. + +* In cluster mode: + + * Use the `get` command to get the basic information. + + * Use the `status` command to get the current status. 
+ * Use the `topics stats` command to get the stats for a specified topic and its connected producers and consumers. + + * Check the connector log. +* Enter into the external system and verify the result. diff --git a/site2/website/versioned_docs/version-2.10.x/io-develop.md b/site2/website/versioned_docs/version-2.10.x/io-develop.md new file mode 100644 index 0000000000000..d6f4f8261ac82 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-develop.md @@ -0,0 +1,421 @@ +--- +id: io-develop +title: How to develop Pulsar connectors +sidebar_label: "Develop" +original_id: io-develop +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide describes how to develop Pulsar connectors to move data +between Pulsar and other systems. + +Pulsar connectors are special [Pulsar Functions](functions-overview.md), so creating +a Pulsar connector is similar to creating a Pulsar function. + +Pulsar connectors come in two types: + +| Type | Description | Example +|---|---|--- +{@inject: github:Source:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java}|Import data from another system to Pulsar.|[RabbitMQ source connector](io-rabbitmq.md) imports the messages of a RabbitMQ queue to a Pulsar topic. +{@inject: github:Sink:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java}|Export data from Pulsar to another system.|[Kinesis sink connector](io-kinesis.md) exports the messages of a Pulsar topic to a Kinesis stream. + +## Develop + +You can develop Pulsar source connectors and sink connectors. 
+ +### Source + +Developing a source connector is to implement the {@inject: github:Source:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} +interface, which means you need to implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method and the {@inject: github:read:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + +1. Implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + + ```java + + /** + * Open connector with configuration + * + * @param config initialization config + * @param sourceContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SourceContext sourceContext) throws Exception; + + ``` + + This method is called when the source connector is initialized. + + In this method, you can retrieve all connector specific settings through the passed-in `config` parameter and initialize all necessary resources. + + For example, a Kafka connector can create a Kafka client in this `open` method. + + Besides, Pulsar runtime also provides a `SourceContext` for the + connector to access runtime resources for tasks like collecting metrics. The implementation can save the `SourceContext` for future use. + +2. Implement the {@inject: github:read:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + + ```java + + /** + * Reads the next message from source. + * If source does not have any new messages, this call should block. + * @return next message from source. The return result should never be null + * @throws Exception + */ + Record read() throws Exception; + + ``` + + If nothing to return, the implementation should be blocking rather than returning `null`. 
+ + The returned {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should encapsulate the following information, which is needed by Pulsar IO runtime. + + * {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should provide the following variables: + + |Variable|Required|Description + |---|---|--- + `TopicName`|No|Pulsar topic name from which the record is originated from. + `Key`|No| Messages can optionally be tagged with keys.

    For more information, see [Routing modes](concepts-messaging.md#routing-modes).| + `Value`|Yes|Actual data of the record. + `EventTime`|No|Event time of the record from the source. + `PartitionId`|No| If the record is originated from a partitioned source, it returns its `PartitionId`.

    `PartitionId` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve exactly-once processing guarantee. + `RecordSequence`|No|If the record is originated from a sequential source, it returns its `RecordSequence`.

    `RecordSequence` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve exactly-once processing guarantee. + `Properties` |No| If the record carries user-defined properties, it returns those properties. + `DestinationTopic`|No|Topic to which message should be written. + `Message`|No|A class which carries data sent by users.

    For more information, see [Message.java](https://github.com/apache/pulsar/blob/master/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/Message.java).| + + * {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should provide the following methods: + + Method|Description + |---|--- + `ack` |Acknowledge that the record is fully processed. + `fail`|Indicate that the record fails to be processed. + +## Handle schema information + +Pulsar IO automatically handles the schema and provides a strongly typed API based on Java generics. +If you know the schema type that you are producing, you can declare the Java class relative to that type in your sink declaration. + +``` + +public class MySource implements Source { + public Record read() {} +} + +``` + +If you want to implement a source that works with any schema, you can go with `byte[]` (of `ByteBuffer`) and use Schema.AUTO_PRODUCE_BYTES(). + +``` + +public class MySource implements Source { + public Record read() { + + Schema wantedSchema = .... + Record myRecord = new MyRecordImplementation(); + .... 
+ } + class MyRecordImplementation implements Record { + public byte[] getValue() { + return ....encoded byte[]...that represents the value + } + public Schema getSchema() { + return Schema.AUTO_PRODUCE_BYTES(wantedSchema); + } + } +} + +``` + +To handle the `KeyValue` type properly, follow the guidelines for your record implementation: +- It must implement {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/KVRecord.java} interface and implement `getKeySchema`,`getValueSchema`, and `getKeyValueEncodingType` +- It must return a `KeyValue` object as `Record.getValue()` +- It may return null in `Record.getSchema()` + +When Pulsar IO runtime encounters a `KVRecord`, it brings the following changes automatically: +- Set properly the `KeyValueSchema` +- Encode the Message Key and the Message Value according to the `KeyValueEncoding` (SEPARATED or INLINE) + +:::tip + +For more information about **how to create a source connector**, see {@inject: github:KafkaSource:/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java}. + +::: + +### Sink + +Developing a sink connector **is similar to** developing a source connector, that is, you need to implement the {@inject: github:Sink:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} interface, which means implementing the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method and the {@inject: github:write:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + +1. Implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + + ```java + + /** + * Open connector with configuration + * + * @param config initialization config + * @param sinkContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SinkContext sinkContext) throws Exception; + + ``` + +2. 
Implement the {@inject: github:write:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + + ```java + + /** + * Write a message to Sink + * @param record record to write to sink + * @throws Exception + */ + void write(Record record) throws Exception; + + ``` + + During the implementation, you can decide how to write the `Value` and + the `Key` to the actual source, and leverage all the provided information such as + `PartitionId` and `RecordSequence` to achieve different processing guarantees. + + You also need to ack records (if messages are sent successfully) or fail records (if messages fail to send). + +## Handling Schema information + +Pulsar IO handles automatically the Schema and provides a strongly typed API based on Java generics. +If you know the Schema type that you are consuming from you can declare the Java class relative to that type in your Sink declaration. + +``` + +public class MySink implements Sink { + public void write(Record record) {} +} + +``` + +If you want to implement a sink that works with any schema, you can you go with the special GenericObject interface. + +``` + +public class MySink implements Sink { + public void write(Record record) { + Schema schema = record.getSchema(); + GenericObject genericObject = record.getValue(); + if (genericObject != null) { + SchemaType type = genericObject.getSchemaType(); + Object nativeObject = genericObject.getNativeObject(); + ... + } + .... + } +} + +``` + +In the case of AVRO, JSON, and Protobuf records (schemaType=AVRO,JSON,PROTOBUF_NATIVE), you can cast the +`genericObject` variable to `GenericRecord` and use `getFields()` and `getField()` API. +You are able to access the native AVRO record using `genericObject.getNativeObject()`. + +In the case of KeyValue type, you can access both the schema for the key and the schema for the value using this code. 
+ +``` + +public class MySink implements Sink { + public void write(Record record) { + Schema schema = record.getSchema(); + GenericObject genericObject = record.getValue(); + SchemaType type = genericObject.getSchemaType(); + Object nativeObject = genericObject.getNativeObject(); + if (type == SchemaType.KEY_VALUE) { + KeyValue keyValue = (KeyValue) nativeObject; + Object key = keyValue.getKey(); + Object value = keyValue.getValue(); + + KeyValueSchema keyValueSchema = (KeyValueSchema) schema; + Schema keySchema = keyValueSchema.getKeySchema(); + Schema valueSchema = keyValueSchema.getValueSchema(); + } + .... + } +} + +``` + +## Test + +Testing connectors can be challenging because Pulsar IO connectors interact with two systems +that may be difficult to mock—Pulsar and the system to which the connector is connecting. + +It is +recommended writing special tests to test the connector functionalities as below +while mocking the external service. + +### Unit test + +You can create unit tests for your connector. + +### Integration test + +Once you have written sufficient unit tests, you can add +separate integration tests to verify end-to-end functionality. + +Pulsar uses [testcontainers](https://www.testcontainers.org/) **for all integration tests**. + +:::tip + +For more information about **how to create integration tests for Pulsar connectors**, see {@inject: github:IntegrationTests:/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io}. + +::: + +## Package + +Once you've developed and tested your connector, you need to package it so that it can be submitted +to a [Pulsar Functions](functions-overview.md) cluster. + +There are two methods to +work with Pulsar Functions' runtime, that is, [NAR](#nar) and [uber JAR](#uber-jar). + +:::note + +If you plan to package and distribute your connector for others to use, you are obligated to + +::: + +license and copyright your own code properly. 
Remember to add the license and copyright to +all libraries your code uses and to your distribution. +> +> If you use the [NAR](#nar) method, the NAR plugin +automatically creates a `DEPENDENCIES` file in the generated NAR package, including the proper +licensing and copyrights of all libraries of your connector. + +### NAR + +**NAR** stands for NiFi Archive, which is a custom packaging mechanism used by Apache NiFi, to provide +a bit of Java ClassLoader isolation. + +:::tip + +For more information about **how NAR works**, see [here](https://medium.com/hashmapinc/nifi-nar-files-explained-14113f7796fd). + +::: + +Pulsar uses the same mechanism for packaging **all** [built-in connectors](io-connectors.md). + +The easiest approach to package a Pulsar connector is to create a NAR package using [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin). + +Include this [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin) in your maven project for your connector as below. + +```xml + + + + org.apache.nifi + nifi-nar-maven-plugin + 1.2.0 + + + +``` + +You must also create a `resources/META-INF/services/pulsar-io.yaml` file with the following contents: + +```yaml + +name: connector name +description: connector description +sourceClass: fully qualified class name (only if source connector) +sinkClass: fully qualified class name (only if sink connector) + +``` + +For Gradle users, there is a [Gradle Nar plugin available on the Gradle Plugin Portal](https://plugins.gradle.org/plugin/io.github.lhotari.gradle-nar-plugin). + +:::tip + +For more information about an **how to use NAR for Pulsar connectors**, see {@inject: github:TwitterFirehose:/pulsar-io/twitter/pom.xml}. + +::: + +### Uber JAR + +An alternative approach is to create an **uber JAR** that contains all of the connector's JAR files +and other resource files. No directory internal structure is necessary. 
+ +You can use [maven-shade-plugin](https://maven.apache.org/plugins/maven-shade-plugin/examples/includes-excludes.html) to create a uber JAR as below: + +```xml + + + org.apache.maven.plugins + maven-shade-plugin + 3.1.1 + + + package + + shade + + + + + *:* + + + + + + + +``` + +## Monitor + +Pulsar connectors enable you to move data in and out of Pulsar easily. It is important to ensure that the running connectors are healthy at any time. You can monitor Pulsar connectors that have been deployed with the following methods: + +- Check the metrics provided by Pulsar. + + Pulsar connectors expose the metrics that can be collected and used for monitoring the health of **Java** connectors. You can check the metrics by following the [monitoring](deploy-monitoring.md) guide. + +- Set and check your customized metrics. + + In addition to the metrics provided by Pulsar, Pulsar allows you to customize metrics for **Java** connectors. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. + +Here is an example of how to customize metrics for a Java connector. 
+ +````mdx-code-block + + + +``` + +public class TestMetricSink implements Sink { + + @Override + public void open(Map config, SinkContext sinkContext) throws Exception { + sinkContext.recordMetric("foo", 1); + } + + @Override + public void write(Record record) throws Exception { + + } + + @Override + public void close() throws Exception { + + } + } + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/io-dynamodb-source.md b/site2/website/versioned_docs/version-2.10.x/io-dynamodb-source.md new file mode 100644 index 0000000000000..0314be2529b4c --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-dynamodb-source.md @@ -0,0 +1,82 @@ +--- +id: io-dynamodb-source +title: AWS DynamoDB source connector +sidebar_label: "AWS DynamoDB source connector" +original_id: io-dynamodb-source +--- + +The DynamoDB source connector pulls data from DynamoDB table streams and persists data into Pulsar. + +This connector uses the [DynamoDB Streams Kinesis Adapter](https://github.com/awslabs/dynamodb-streams-kinesis-adapter), +which uses the [Kinesis Consumer Library](https://github.com/awslabs/amazon-kinesis-client) (KCL) to do the actual +consuming of messages. The KCL uses DynamoDB to track state for consumers and requires cloudwatch access to log metrics. + + +## Configuration + +The configuration of the DynamoDB source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`initialPositionInStream`|InitialPositionInStream|false|LATEST|The position where the connector starts from.

    Below are the available options:

  • `AT_TIMESTAMP`: start from the record at or after the specified timestamp.

  • `LATEST`: start after the most recent data record.

  • `TRIM_HORIZON`: start from the oldest available data record.
  • +`startAtTime`|Date|false|" " (empty string)|If set to `AT_TIMESTAMP`, it specifies the point in time to start consumption. +`applicationName`|String|false|Pulsar IO connector|The name of the KCL application. Must be unique, as it is used to define the table name for the dynamo table used for state tracking.

    By default, the application name is included in the user agent string used to make AWS requests. This can assist with troubleshooting, for example, to distinguish requests made by separate connector instances. +`checkpointInterval`|long|false|60000|The frequency of the KCL checkpoint in milliseconds. +`backoffTime`|long|false|3000|The amount of time to delay between requests when the connector encounters a throttling exception from AWS Kinesis in milliseconds. +`numRetries`|int|false|3|The number of re-attempts when the connector encounters an exception while trying to set a checkpoint. +`receiveQueueSize`|int|false|1000|The maximum number of AWS records that can be buffered inside the connector.

    Once the `receiveQueueSize` is reached, the connector does not consume any messages from Kinesis until some messages in the queue are successfully consumed. +`dynamoEndpoint`|String|false|" " (empty string)|The Dynamo end-point URL, which can be found [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`cloudwatchEndpoint`|String|false|" " (empty string)|The Cloudwatch end-point URL, which can be found [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsEndpoint`|String|false|" " (empty string)|The DynamoDB Streams end-point URL, which can be found [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsDynamodbStreamArn`|String|true|" " (empty string)|The DynamoDB stream arn. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    `awsCredentialProviderPlugin` has the following built-in plugins:

  • `org.apache.pulsar.io.kinesis.AwsDefaultProviderChainPlugin`:
    this plugin uses the default AWS provider chain.
    For more information, see [using the default credential provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default).

  • `org.apache.pulsar.io.kinesis.STSAssumeRoleProviderPlugin`:
    this plugin takes a configuration via the `awsCredentialPluginParam` that describes a role to assume when running the KCL.
    **JSON configuration example**
    `{"roleArn": "arn...", "roleSessionName": "name"}`

    `awsCredentialPluginName` is a factory class which creates an AWSCredentialsProvider that is used by the connector.

    If `awsCredentialPluginName` is set to empty, the connector creates a default AWSCredentialsProvider which accepts a JSON map of credentials in `awsCredentialPluginParam`.
  • +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Example + +Before using the DynamoDB source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsDynamodbStreamArn": "arn:aws:dynamodb:us-west-2:111122223333:table/TestTable/stream/2015-05-11T21:21:33.291", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "applicationName": "My test application", + "checkpointInterval": "30000", + "backoffTime": "4000", + "numRetries": "3", + "receiveQueueSize": 2000, + "initialPositionInStream": "TRIM_HORIZON", + "startAtTime": "2019-03-05T19:28:58.000Z" + } + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsDynamodbStreamArn: "arn:aws:dynamodb:us-west-2:111122223333:table/TestTable/stream/2015-05-11T21:21:33.291" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + applicationName: "My test application" + checkpointInterval: 30000 + backoffTime: 4000 + numRetries: 3 + receiveQueueSize: 2000 + initialPositionInStream: "TRIM_HORIZON" + startAtTime: "2019-03-05T19:28:58.000Z" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-elasticsearch-sink.md b/site2/website/versioned_docs/version-2.10.x/io-elasticsearch-sink.md new file mode 100644 index 0000000000000..4a5e349413814 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-elasticsearch-sink.md @@ -0,0 +1,244 @@ +--- +id: io-elasticsearch-sink +title: Elasticsearch sink connector +sidebar_label: "Elasticsearch sink connector" +original_id: io-elasticsearch-sink +--- + +The Elasticsearch sink connector pulls messages from Pulsar topics and persists the messages to indexes. 
+ + +## Feature + +### Handle data + +Since Pulsar 2.9.0, the Elasticsearch sink connector has the following ways of +working. You can choose one of them. + +Name | Description +---|---| +Raw processing | The sink reads from topics and passes the raw content to Elasticsearch.

    This is the **default** behavior.

    Raw processing was already available **in Pulsar 2.8.x**. +Schema aware | The sink uses the schema and handles AVRO, JSON, and KeyValue schema types while mapping the content to the Elasticsearch document.

    If you set `schemaEnable` to `true`, the sink interprets the contents of the message and you can define a **primary key** that is in turn used as the special `_id` field on Elasticsearch. +

    This allows you to perform `UPDATE`, `INSERT`, and `DELETE` operations +to Elasticsearch driven by the logical primary key of the message.

    This +is very useful in a typical Change Data Capture scenario in which you follow the +changes on your database, write them to Pulsar (using the Debezium adapter for +instance), and then you write to Elasticsearch.

    You configure the +mapping of the primary key using the `primaryFields` configuration +entry.

    The `DELETE` operation can be performed when the primary key is +not empty and the remaining value is empty. Use the `nullValueAction` to +configure this behaviour. The default configuration simply ignores such empty +values. + +### Map multiple indexes + +Since Pulsar 2.9.0, the `indexName` property is no longer required. If you omit it, the sink writes to an index named after the Pulsar topic name. + +### Enable bulk writes + +Since Pulsar 2.9.0, you can use bulk writes by setting the `bulkEnabled` property to `true`. + +### Enable secure connections via TLS + +Since Pulsar 2.9.0, you can enable secure connections with TLS. + +## Configuration + +The configuration of the Elasticsearch sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `elasticSearchUrl` | String| true |" " (empty string)| The URL of elastic search cluster to which the connector connects. | +| `indexName` | String| false |" " (empty string)| The index name to which the connector writes messages. The default value is the topic name. It accepts date formats in the name to support event time based index with the pattern `%{+}`. For example, suppose the event time of the record is 1645182000000L, the indexName is `logs-%{+yyyy-MM-dd}`, then the formatted index name would be `logs-2022-02-18`. | +| `schemaEnable` | Boolean | false | false | Turn on the Schema Aware mode. | +| `createIndexIfNeeded` | Boolean | false | false | Manage index if missing. | +| `maxRetries` | Integer | false | 1 | The maximum number of retries for elasticsearch requests. Use -1 to disable it. | +| `retryBackoffInMs` | Integer | false | 100 | The base time to wait when retrying an Elasticsearch request (in milliseconds). | +| `maxRetryTimeInSec` | Integer| false | 86400 | The maximum retry time interval in seconds for retrying an elasticsearch request.
| +| `bulkEnabled` | Boolean | false | false | Enable the elasticsearch bulk processor to flush write requests based on the number or size of requests, or after a given period. | +| `bulkActions` | Integer | false | 1000 | The maximum number of actions per elasticsearch bulk request. Use -1 to disable it. | +| `bulkSizeInMb` | Integer | false |5 | The maximum size in megabytes of elasticsearch bulk requests. Use -1 to disable it. | +| `bulkConcurrentRequests` | Integer | false | 0 | The maximum number of in flight elasticsearch bulk requests. The default 0 allows the execution of a single request. A value of 1 means 1 concurrent request is allowed to be executed while accumulating new bulk requests. | +| `bulkFlushIntervalInMs` | Integer | false | -1 | The maximum period of time to wait for flushing pending writes when bulk writes are enabled. Default is -1 meaning not set. | +| `compressionEnabled` | Boolean | false |false | Enable elasticsearch request compression. | +| `connectTimeoutInMs` | Integer | false |5000 | The elasticsearch client connection timeout in milliseconds. | +| `connectionRequestTimeoutInMs` | Integer | false |1000 | The time in milliseconds for getting a connection from the elasticsearch connection pool. | +| `connectionIdleTimeoutInMs` | Integer | false |5 | Idle connection timeout to prevent a read timeout. | +| `keyIgnore` | Boolean | false |true | Whether to ignore the record key to build the Elasticsearch document `_id`. If primaryFields is defined, the connector extract the primary fields from the payload to build the document `_id` If no primaryFields are provided, elasticsearch auto generates a random document `_id`. | +| `primaryFields` | String | false | "id" | The comma separated ordered list of field names used to build the Elasticsearch document `_id` from the record value. If this list is a singleton, the field is converted as a string. 
If this list has 2 or more fields, the generated `_id` is a string representation of a JSON array of the field values. | +| `nullValueAction` | enum (IGNORE,DELETE,FAIL) | false | IGNORE | How to handle records with null values, possible options are IGNORE, DELETE or FAIL. Default is IGNORE the message. | +| `malformedDocAction` | enum (IGNORE,WARN,FAIL) | false | FAIL | How to handle elasticsearch rejected documents due to some malformation. Possible options are IGNORE, WARN or FAIL. Default is FAIL the Elasticsearch document. | +| `stripNulls` | Boolean | false |true | If stripNulls is false, elasticsearch _source includes 'null' for empty fields (for example {"foo": null}), otherwise null fields are stripped. | +| `socketTimeoutInMs` | Integer | false |60000 | The socket timeout in milliseconds waiting to read the elasticsearch response. | +| `typeName` | String | false | "_doc" | The type name to which the connector writes messages.

    The value should be set explicitly to a valid type name other than "_doc" for Elasticsearch versions before 6.2, and left at the default otherwise. | +| `indexNumberOfShards` | int| false |1| The number of shards of the index. | +| `indexNumberOfReplicas` | int| false |1 | The number of replicas of the index. | +| `username` | String| false |" " (empty string)| The username used by the connector to connect to the elastic search cluster.

    If `username` is set, then `password` should also be provided. | +| `password` | String| false | " " (empty string)|The password used by the connector to connect to the elastic search cluster.

    If `username` is set, then `password` should also be provided. | +| `ssl` | ElasticSearchSslConfig | false | | Configuration for TLS encrypted communication | + +### Definition of ElasticSearchSslConfig structure: + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `enabled` | Boolean| false | false | Enable SSL/TLS. | +| `hostnameVerification` | Boolean| false | true | Whether or not to validate node hostnames when using SSL. | +| `truststorePath` | String| false |" " (empty string)| The path to the truststore file. | +| `truststorePassword` | String| false |" " (empty string)| Truststore password. | +| `keystorePath` | String| false |" " (empty string)| The path to the keystore file. | +| `keystorePassword` | String| false |" " (empty string)| Keystore password. | +| `cipherSuites` | String| false |" " (empty string)| SSL/TLS cipher suites. | +| `protocols` | String| false |"TLSv1.2" | Comma separated list of enabled SSL/TLS protocols. | + +## Example + +Before using the Elasticsearch sink connector, you need to create a configuration file through one of the following methods. + +### Configuration + +#### For Elasticsearch After 6.2 + +* JSON + + ```json + + { + "configs": { + "elasticSearchUrl": "http://localhost:9200", + "indexName": "my_index", + "username": "scooby", + "password": "doobie" + } + } + + ``` + +* YAML + + ```yaml + + configs: + elasticSearchUrl: "http://localhost:9200" + indexName: "my_index" + username: "scooby" + password: "doobie" + + ``` + +#### For Elasticsearch Before 6.2 + +* JSON + + ```json + + { + "elasticSearchUrl": "http://localhost:9200", + "indexName": "my_index", + "typeName": "doc", + "username": "scooby", + "password": "doobie" + } + + ``` + +* YAML + + ```yaml + + configs: + elasticSearchUrl: "http://localhost:9200" + indexName: "my_index" + typeName: "doc" + username: "scooby" + password: "doobie" + + ``` + +### Usage + +1. Start a single node Elasticsearch cluster. 
+ + ```bash + + $ docker run -p 9200:9200 -p 9300:9300 \ + -e "discovery.type=single-node" \ + docker.elastic.co/elasticsearch/elasticsearch:7.13.3 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + + Make sure the NAR file is available at `connectors/pulsar-io-elastic-search-@pulsar:version@.nar`. + +3. Start the Pulsar Elasticsearch connector in local run mode using one of the following methods. + * Use the **JSON** configuration as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-elastic-search-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name elasticsearch-test-sink \ + --sink-config '{"elasticSearchUrl":"http://localhost:9200","indexName": "my_index","username": "scooby","password": "doobie"}' \ + --inputs elasticsearch_test + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-elastic-search-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name elasticsearch-test-sink \ + --sink-config-file elasticsearch-sink.yml \ + --inputs elasticsearch_test + + ``` + +4. Publish records to the topic. + + ```bash + + $ bin/pulsar-client produce elasticsearch_test --messages "{\"a\":1}" + + ``` + +5. Check documents in Elasticsearch. + + * refresh the index + + ```bash + + $ curl -s http://localhost:9200/my_index/_refresh + + ``` + + + * search documents + + ```bash + + $ curl -s http://localhost:9200/my_index/_search + + ``` + + You can see the record that published earlier has been successfully written into Elasticsearch. 
+ + ```json + + {"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":1,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"my_index","_type":"_doc","_id":"FSxemm8BLjG_iC0EeTYJ","_score":1.0,"_source":{"a":1}}]}} + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-file-source.md b/site2/website/versioned_docs/version-2.10.x/io-file-source.md new file mode 100644 index 0000000000000..ba0f467a44314 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-file-source.md @@ -0,0 +1,173 @@ +--- +id: io-file-source +title: File source connector +sidebar_label: "File source connector" +original_id: io-file-source +--- + +The File source connector pulls messages from files in directories and persists the messages to Pulsar topics. + +## Configuration + +The configuration of the File source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `inputDirectory` | String|true | No default value|The input directory to pull files. | +| `recurse` | Boolean|false | true | Whether to pull files from subdirectory or not.| +| `keepFile` |Boolean|false | false | If set to true, the file is not deleted after it is processed, which means the file can be picked up continually. | +| `fileFilter` | String|false| [^\\.].* | The file whose name matches the given regular expression is picked up. | +| `pathFilter` | String |false | NULL | If `recurse` is set to true, the subdirectory whose path matches the given regular expression is scanned. | +| `minimumFileAge` | Integer|false | 0 | The minimum age that a file can be processed.

    Any file younger than `minimumFileAge` (according to the last modification date) is ignored. | +| `maximumFileAge` | Long|false |Long.MAX_VALUE | The maximum age that a file can be processed.

    Any file older than `maximumFileAge` (according to last modification date) is ignored. | +| `minimumSize` |Integer| false |1 | The minimum size (in bytes) that a file can be processed. | +| `maximumSize` | Double|false |Double.MAX_VALUE| The maximum size (in bytes) that a file can be processed. | +| `ignoreHiddenFiles` |Boolean| false | true| Whether the hidden files should be ignored or not. | +| `pollingInterval`|Long | false | 10000L | Indicates how long to wait before performing a directory listing. | +| `numWorkers` | Integer | false | 1 | The number of worker threads that process files.

    This allows you to process a larger number of files concurrently.

    However, setting this to a value greater than 1 makes the data from multiple files mixed in the target topic. | +| `processedFileSuffix` | String | false | NULL | If set, do not delete but only rename file that has been processed.

    This config only works when the 'keepFile' property is false. | + +### Example + +Before using the File source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "inputDirectory": "/Users/david", + "recurse": true, + "keepFile": true, + "fileFilter": "[^\\.].*", + "pathFilter": "*", + "minimumFileAge": 0, + "maximumFileAge": 9999999999, + "minimumSize": 1, + "maximumSize": 5000000, + "ignoreHiddenFiles": true, + "pollingInterval": 5000, + "numWorkers": 1, + "processedFileSuffix": ".processed_done" + } + } + + ``` + +* YAML + + ```yaml + + configs: + inputDirectory: "/Users/david" + recurse: true + keepFile: true + fileFilter: "[^\\.].*" + pathFilter: "*" + minimumFileAge: 0 + maximumFileAge: 9999999999 + minimumSize: 1 + maximumSize: 5000000 + ignoreHiddenFiles: true + pollingInterval: 5000 + numWorkers: 1 + processedFileSuffix: ".processed_done" + + ``` + +## Usage + +Here is an example of using the File source connector. + +1. Pull a Pulsar image. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + ``` + +2. Start Pulsar standalone. + + ```bash + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +3. Create a configuration file _file-connector.yaml_. + + ```yaml + + configs: + inputDirectory: "/opt" + + ``` + +4. Copy the configuration file _file-connector.yaml_ to the container. + + ```bash + + $ docker cp connectors/file-connector.yaml pulsar-standalone:/pulsar/ + + ``` + +5. Download the File source connector. + + ```bash + + $ curl -O https://mirrors.tuna.tsinghua.edu.cn/apache/pulsar/pulsar-{version}/connectors/pulsar-io-file-{version}.nar + + ``` + +6. Copy it to the `connectors` folder, then restart the container.
+ + ```bash + + $ docker cp pulsar-io-file-{version}.nar pulsar-standalone:/pulsar/connectors/ + $ docker restart pulsar-standalone + + ``` + +7. Start the File source connector. + + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + + $ ./bin/pulsar-admin sources localrun \ + --archive /pulsar/connectors/pulsar-io-file-{version}.nar \ + --name file-test \ + --destination-topic-name pulsar-file-test \ + --source-config-file /pulsar/file-connector.yaml + + ``` + +8. Start a consumer. + + ```bash + + ./bin/pulsar-client consume -s file-test -n 0 pulsar-file-test + + ``` + +9. Write the message to the file _test.txt_. + + ```bash + + echo "hello world!" > /opt/test.txt + + ``` + + The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello world! + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-flume-sink.md b/site2/website/versioned_docs/version-2.10.x/io-flume-sink.md new file mode 100644 index 0000000000000..591681315bc26 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-flume-sink.md @@ -0,0 +1,58 @@ +--- +id: io-flume-sink +title: Flume sink connector +sidebar_label: "Flume sink connector" +original_id: io-flume-sink +--- + +The Flume sink connector pulls messages from Pulsar topics to logs. + +## Configuration + +The configuration of the Flume sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. 
+ +### Example + +Before using the Flume sink connector, you need to create a configuration file through one of the following methods. + +> For more information about the `sink.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/sink.conf). + +* JSON + + ```json + + { + "configs": { + "name": "a1", + "confFile": "sink.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + } + + ``` + +* YAML + + ```yaml + + configs: + name: a1 + confFile: sink.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-flume-source.md b/site2/website/versioned_docs/version-2.10.x/io-flume-source.md new file mode 100644 index 0000000000000..ba384560111fd --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-flume-source.md @@ -0,0 +1,58 @@ +--- +id: io-flume-source +title: Flume source connector +sidebar_label: "Flume source connector" +original_id: io-flume-source +--- + +The Flume source connector pulls messages from logs to Pulsar topics. + +## Configuration + +The configuration of the Flume source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. + +### Example + +Before using the Flume source connector, you need to create a configuration file through one of the following methods. 
+ +> For more information about the `source.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/source.conf). + +* JSON + + ```json + + { + "configs": { + "name": "a1", + "confFile": "source.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + } + + ``` + +* YAML + + ```yaml + + configs: + name: a1 + confFile: source.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-hbase-sink.md b/site2/website/versioned_docs/version-2.10.x/io-hbase-sink.md new file mode 100644 index 0000000000000..4fcd59a2c2750 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-hbase-sink.md @@ -0,0 +1,69 @@ +--- +id: io-hbase-sink +title: HBase sink connector +sidebar_label: "HBase sink connector" +original_id: io-hbase-sink +--- + +The HBase sink connector pulls the messages from Pulsar topics +and persists the messages to HBase tables. + +## Configuration + +The configuration of the HBase sink connector has the following properties. + +### Property + +| Name | Type|Default | Required | Description | +|------|---------|----------|-------------|--- +| `hbaseConfigResources` | String|None | false | HBase system configuration `hbase-site.xml` file. | +| `zookeeperQuorum` | String|None | true | HBase system configuration about `hbase.zookeeper.quorum` value. | +| `zookeeperClientPort` | String|2181 | false | HBase system configuration about `hbase.zookeeper.property.clientPort` value. | +| `zookeeperZnodeParent` | String|/hbase | false | HBase system configuration about `zookeeper.znode.parent` value. | +| `tableName` | String|None | true | HBase table, the value is `namespace:tableName`. | +| `rowKeyName` | String|None | true | HBase table rowkey name. | +| `familyName` | String|None | true | HBase table column family name.
| +| `qualifierNames` |String| None | true | HBase table column qualifier names. | +| `batchTimeMs` | Long|1000l| false | HBase table operation timeout in milliseconds. | +| `batchSize` | int|200| false | Batch size of updates made to the HBase table. | + +### Example + +Before using the HBase sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "hbaseConfigResources": "hbase-site.xml", + "zookeeperQuorum": "localhost", + "zookeeperClientPort": "2181", + "zookeeperZnodeParent": "/hbase", + "tableName": "pulsar_hbase", + "rowKeyName": "rowKey", + "familyName": "info", + "qualifierNames": [ 'name', 'address', 'age'] + } + } + + ``` + +* YAML + + ```yaml + + configs: + hbaseConfigResources: "hbase-site.xml" + zookeeperQuorum: "localhost" + zookeeperClientPort: "2181" + zookeeperZnodeParent: "/hbase" + tableName: "pulsar_hbase" + rowKeyName: "rowKey" + familyName: "info" + qualifierNames: [ 'name', 'address', 'age'] + + ``` + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/io-hdfs2-sink.md b/site2/website/versioned_docs/version-2.10.x/io-hdfs2-sink.md new file mode 100644 index 0000000000000..54ab3f918bb55 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-hdfs2-sink.md @@ -0,0 +1,66 @@ +--- +id: io-hdfs2-sink +title: HDFS2 sink connector +sidebar_label: "HDFS2 sink connector" +original_id: io-hdfs2-sink +--- + +The HDFS2 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS2 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.

    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY
  • | +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| true, if `compression` is set to `None`. | None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| true | None | The extension added to the files written to HDFS.

    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.

    Setting this property to 1 causes every record to be sent to disk before the record is acked.

    Setting this property to a higher value allows buffering records before flushing them to disk. +| `subdirectoryPattern` | String | false | None | A subdirectory associated with the created time of the sink.
    The pattern is the formatted pattern of `directory`'s subdirectory.

    See [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html) for pattern's syntax. | + +### Example + +Before using the HDFS2 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "fileExtension": ".log", + "compression": "SNAPPY", + "subdirectoryPattern": "yyyy-MM-dd" + } + } + + ``` + +* YAML + + ```yaml + + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + fileExtension: ".log" + compression: "SNAPPY" + subdirectoryPattern: "yyyy-MM-dd" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-hdfs3-sink.md b/site2/website/versioned_docs/version-2.10.x/io-hdfs3-sink.md new file mode 100644 index 0000000000000..91f06153d5d77 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-hdfs3-sink.md @@ -0,0 +1,61 @@ +--- +id: io-hdfs3-sink +title: HDFS3 sink connector +sidebar_label: "HDFS3 sink connector" +original_id: io-hdfs3-sink +--- + +The HDFS3 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS3 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.

    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY
  • | +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| false |None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| false | None| The extension added to the files written to HDFS.

    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.

    Setting this property to 1 causes every record to be sent to disk before the record is acked.

    Setting this property to a higher value allows buffering records before flushing them to disk. + +### Example + +Before using the HDFS3 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "compression": "SNAPPY" + } + } + + ``` + +* YAML + + ```yaml + + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + compression: "SNAPPY" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-influxdb-sink.md b/site2/website/versioned_docs/version-2.10.x/io-influxdb-sink.md new file mode 100644 index 0000000000000..8492aa482b50a --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-influxdb-sink.md @@ -0,0 +1,122 @@ +--- +id: io-influxdb-sink +title: InfluxDB sink connector +sidebar_label: "InfluxDB sink connector" +original_id: io-influxdb-sink +--- + +The InfluxDB sink connector pulls messages from Pulsar topics +and persists the messages to InfluxDB. + +The InfluxDB sink provides different configurations for InfluxDBv1 and v2 respectively. + +## Configuration + +The configuration of the InfluxDB sink connector has the following properties. + +### Property +#### InfluxDBv2 +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `influxdbUrl` |String| true|" " (empty string) | The URL of the InfluxDB instance. | +| `token` | String|true| " " (empty string) |The authentication token used to authenticate to InfluxDB. | +| `organization` | String| true|" " (empty string) | The InfluxDB organization to write to. | +| `bucket` |String| true | " " (empty string)| The InfluxDB bucket to write to. | +| `precision` | String|false| ns | The timestamp precision for writing data to InfluxDB.

    Below are the available options:
  • ns
  • us
  • ms
  • s
  • | +| `logLevel` | String|false| NONE|The log level for InfluxDB request and response.

    Below are the available options:
  • NONE
  • BASIC
  • HEADERS
  • FULL
  • | +| `gzipEnable` | boolean|false | false | Whether to enable gzip or not. | +| `batchTimeMs` |long|false| 1000L | The InfluxDB operation time in milliseconds. | +| `batchSize` | int|false|200| The batch size of writing to InfluxDB. | + +#### InfluxDBv1 +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `influxdbUrl` |String| true|" " (empty string) | The URL of the InfluxDB instance. | +| `username` | String|false| " " (empty string) |The username used to authenticate to InfluxDB. | +| `password` | String| false|" " (empty string) | The password used to authenticate to InfluxDB. | +| `database` |String| true | " " (empty string)| The InfluxDB to which write messages. | +| `consistencyLevel` | String|false|ONE | The consistency level for writing data to InfluxDB.

    Below are the available options:
  • ALL
  • ANY
  • ONE
  • QUORUM
  • | +| `logLevel` | String|false| NONE|The log level for InfluxDB request and response.

    Below are the available options:
  • NONE
  • BASIC
  • HEADERS
  • FULL
  • | +| `retentionPolicy` | String|false| autogen| The retention policy for InfluxDB. | +| `gzipEnable` | boolean|false | false | Whether to enable gzip or not. | +| `batchTimeMs` |long|false| 1000L | The InfluxDB operation time in milliseconds. | +| `batchSize` | int|false|200| The batch size of writing to InfluxDB. | + +### Example +Before using the InfluxDB sink connector, you need to create a configuration file through one of the following methods. +#### InfluxDBv2 + +* JSON + + ```json + + { + "configs": { + "influxdbUrl": "http://localhost:9999", + "organization": "example-org", + "bucket": "example-bucket", + "token": "xxxx", + "precision": "ns", + "logLevel": "NONE", + "gzipEnable": false, + "batchTimeMs": 1000, + "batchSize": 100 + } + } + + ``` + +* YAML + + ```yaml + + configs: + influxdbUrl: "http://localhost:9999" + organization: "example-org" + bucket: "example-bucket" + token: "xxxx" + precision: "ns" + logLevel: "NONE" + gzipEnable: false + batchTimeMs: 1000 + batchSize: 100 + + ``` + +#### InfluxDBv1 + +* JSON + + ```json + + { + "configs": { + "influxdbUrl": "http://localhost:8086", + "database": "test_db", + "consistencyLevel": "ONE", + "logLevel": "NONE", + "retentionPolicy": "autogen", + "gzipEnable": false, + "batchTimeMs": 1000, + "batchSize": 100 + } + } + + ``` + +* YAML + + ```yaml + + configs: + influxdbUrl: "http://localhost:8086" + database: "test_db" + consistencyLevel: "ONE" + logLevel: "NONE" + retentionPolicy: "autogen" + gzipEnable: false + batchTimeMs: 1000 + batchSize: 100 + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-jdbc-sink.md b/site2/website/versioned_docs/version-2.10.x/io-jdbc-sink.md new file mode 100644 index 0000000000000..fe03d4a1e441e --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-jdbc-sink.md @@ -0,0 +1,165 @@ +--- +id: io-jdbc-sink +title: JDBC sink connector +sidebar_label: "JDBC sink connector" +original_id: io-jdbc-sink +--- + +The JDBC sink connectors allow pulling 
 messages from Pulsar topics +and persisting the messages to ClickHouse, MariaDB, PostgreSQL, and SQLite. + +> Currently, INSERT, DELETE and UPDATE operations are supported. + +## Configuration + +The configuration of all JDBC sink connectors has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `userName` | String|false | " " (empty string) | The username used to connect to the database specified by `jdbcUrl`.

    **Note: `userName` is case-sensitive.**| +| `password` | String|false | " " (empty string)| The password used to connect to the database specified by `jdbcUrl`.

    **Note: `password` is case-sensitive.**| +| `jdbcUrl` | String|true | " " (empty string) | The JDBC URL of the database to which the connector connects. | +| `tableName` | String|true | " " (empty string) | The name of the table to which the connector writes. | +| `nonKey` | String|false | " " (empty string) | A comma-separated list contains the fields used in updating events. | +| `key` | String|false | " " (empty string) | A comma-separated list contains the fields used in `where` condition of updating and deleting events. | +| `timeoutMs` | int| false|500 | The JDBC operation timeout in milliseconds. | +| `batchSize` | int|false | 200 | The batch size of updates made to the database. | + +### Example for ClickHouse + +* JSON + + ```json + + { + "configs": { + "userName": "clickhouse", + "password": "password", + "jdbcUrl": "jdbc:clickhouse://localhost:8123/pulsar_clickhouse_jdbc_sink", + "tableName": "pulsar_clickhouse_jdbc_sink" + } + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-clickhouse-sink" + topicName: "persistent://public/default/jdbc-clickhouse-topic" + sinkType: "jdbc-clickhouse" + configs: + userName: "clickhouse" + password: "password" + jdbcUrl: "jdbc:clickhouse://localhost:8123/pulsar_clickhouse_jdbc_sink" + tableName: "pulsar_clickhouse_jdbc_sink" + + ``` + +### Example for MariaDB + +* JSON + + ```json + + { + "configs": { + "userName": "mariadb", + "password": "password", + "jdbcUrl": "jdbc:mariadb://localhost:3306/pulsar_mariadb_jdbc_sink", + "tableName": "pulsar_mariadb_jdbc_sink" + } + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-mariadb-sink" + topicName: "persistent://public/default/jdbc-mariadb-topic" + sinkType: "jdbc-mariadb" + configs: + userName: "mariadb" + password: "password" + jdbcUrl: "jdbc:mariadb://localhost:3306/pulsar_mariadb_jdbc_sink" + tableName: "pulsar_mariadb_jdbc_sink" + + ``` + +### Example for PostgreSQL + +Before using the 
JDBC PostgreSQL sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "userName": "postgres", + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "tableName": "pulsar_postgres_jdbc_sink" + } + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-postgres-sink" + topicName: "persistent://public/default/jdbc-postgres-topic" + sinkType: "jdbc-postgres" + configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink" + tableName: "pulsar_postgres_jdbc_sink" + + ``` + +For more information on **how to use this JDBC sink connector**, see [connect Pulsar to PostgreSQL](io-quickstart.md#connect-pulsar-to-postgresql). + +### Example for SQLite + +* JSON + + ```json + + { + "configs": { + "jdbcUrl": "jdbc:sqlite:db.sqlite", + "tableName": "pulsar_sqlite_jdbc_sink" + } + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-sqlite-sink" + topicName: "persistent://public/default/jdbc-sqlite-topic" + sinkType: "jdbc-sqlite" + configs: + jdbcUrl: "jdbc:sqlite:db.sqlite" + tableName: "pulsar_sqlite_jdbc_sink" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-kafka-sink.md b/site2/website/versioned_docs/version-2.10.x/io-kafka-sink.md new file mode 100644 index 0000000000000..ce8967e046107 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-kafka-sink.md @@ -0,0 +1,73 @@ +--- +id: io-kafka-sink +title: Kafka sink connector +sidebar_label: "Kafka sink connector" +original_id: io-kafka-sink +--- + +The Kafka sink connector pulls messages from Pulsar topics and persists the messages +to Kafka topics. + +This guide explains how to configure and use the Kafka sink connector. + +## Configuration + +The configuration of the Kafka sink connector has the following parameters. 
+ +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. | +|`acks`|String|true|" " (empty string) |The number of acknowledgments that the producer requires the leader to receive before a request completes.
    This controls the durability of the sent records. +|`batchSize`|long|false|16384L|The size up to which a Kafka producer attempts to batch records together before sending them to brokers. +|`maxRequestSize`|long|false|1048576L|The maximum size of a Kafka request in bytes. +|`topic`|String|true|" " (empty string) |The Kafka topic which receives messages from Pulsar. +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringSerializer | The serializer class for Kafka producers to serialize keys. +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArraySerializer | The serializer class for Kafka producers to serialize values.

    The serializer is set by a specific implementation of [`KafkaAbstractSink`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java). +|`producerConfigProperties`|Map|false|" " (empty string)|The producer configuration properties to be passed to producers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. + + +### Example + +Before using the Kafka sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "bootstrapServers": "localhost:6667", + "topic": "test", + "acks": "1", + "batchSize": "16384", + "maxRequestSize": "1048576", + "producerConfigProperties": { + "client.id": "test-pulsar-producer", + "security.protocol": "SASL_PLAINTEXT", + "sasl.mechanism": "GSSAPI", + "sasl.kerberos.service.name": "kafka", + "acks": "all" + } + } + } + + ``` + +* YAML + + ```yaml + configs: + bootstrapServers: "localhost:6667" + topic: "test" + acks: "1" + batchSize: "16384" + maxRequestSize: "1048576" + producerConfigProperties: + client.id: "test-pulsar-producer" + security.protocol: "SASL_PLAINTEXT" + sasl.mechanism: "GSSAPI" + sasl.kerberos.service.name: "kafka" + acks: "all" + ``` diff --git a/site2/website/versioned_docs/version-2.10.x/io-kafka-source.md b/site2/website/versioned_docs/version-2.10.x/io-kafka-source.md new file mode 100644 index 0000000000000..dd6000aa0bd35 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-kafka-source.md @@ -0,0 +1,240 @@ +--- +id: io-kafka-source +title: Kafka source connector +sidebar_label: "Kafka source connector" +original_id: io-kafka-source +--- + +The Kafka source connector pulls messages from Kafka topics and persists the messages +to Pulsar topics. + +This guide explains how to configure and use the Kafka source connector. + +## Configuration + +The configuration of the Kafka source connector has the following properties. + +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. 
| +| `groupId` |String| true | " " (empty string) | A unique string that identifies the group of consumer processes to which this consumer belongs. | +| `fetchMinBytes` | long|false | 1 | The minimum byte expected for each fetch response. | +| `autoCommitEnabled` | boolean |false | true | If set to true, the consumer's offset is periodically committed in the background.

    This committed offset is used when the process fails as the position from which a new consumer begins. | +| `autoCommitIntervalMs` | long|false | 5000 | The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if `autoCommitEnabled` is set to true. | +| `heartbeatIntervalMs` | long| false | 3000 | The interval between heartbeats to the consumer when using Kafka's group management facilities.

    **Note: `heartbeatIntervalMs` must be smaller than `sessionTimeoutMs`**.| +| `sessionTimeoutMs` | long|false | 30000 | The timeout used to detect consumer failures when using Kafka's group management facility. | +| `topic` | String|true | " " (empty string)| The Kafka topic that sends messages to Pulsar. | +| `consumerConfigProperties` | Map| false | " " (empty string) | The consumer configuration properties to be passed to consumers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. | +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringDeserializer | The deserializer class for Kafka consumers to deserialize keys.
    The deserializer is set by a specific implementation of [`KafkaAbstractSource`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java). +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArrayDeserializer | The deserializer class for Kafka consumers to deserialize values. +| `autoOffsetReset` | String | false | earliest | The default offset reset policy. | + +### Schema Management + +This Kafka source connector applies the schema to the topic depending on the data type that is present on the Kafka topic. +You can detect the data type from the `keyDeserializationClass` and `valueDeserializationClass` configuration parameters. + +If the `valueDeserializationClass` is `org.apache.kafka.common.serialization.StringDeserializer`, you can set Schema.STRING() as schema type on the Pulsar topic. + +If `valueDeserializationClass` is `io.confluent.kafka.serializers.KafkaAvroDeserializer`, Pulsar downloads the AVRO schema from the Confluent Schema Registry® +and sets it properly on the Pulsar topic. + +In this case, you need to set `schema.registry.url` inside of the `consumerConfigProperties` configuration entry +of the source. + +If `keyDeserializationClass` is not `org.apache.kafka.common.serialization.StringDeserializer`, it means +that you do not have a String as key and the Kafka Source uses the KeyValue schema type with the SEPARATED encoding. + +Pulsar supports AVRO format for keys. 
+ +In this case, you can have a Pulsar topic with the following properties: +- Schema: KeyValue schema with SEPARATED encoding +- Key: the content of key of the Kafka message (base64 encoded) +- Value: the content of value of the Kafka message +- KeySchema: the schema detected from `keyDeserializationClass` +- ValueSchema: the schema detected from `valueDeserializationClass` + +Topic compaction and partition routing use the Pulsar key, that contains the Kafka key, and so they are driven by the same value that you have on Kafka. + +When you consume data from Pulsar topics, you can use the `KeyValue` schema. In this way, you can decode the data properly. +If you want to access the raw key, you can use the `Message#getKeyBytes()` API. + +### Example + +Before using the Kafka source connector, you need to create a configuration file through one of the following methods. + +- JSON + + ```json + + { + "bootstrapServers": "pulsar-kafka:9092", + "groupId": "test-pulsar-io", + "topic": "my-topic", + "sessionTimeoutMs": "10000", + "autoCommitEnabled": false + } + + ``` + +- YAML + + ```yaml + + configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: false + + ``` + +## Usage + +You can make the Kafka source connector as a Pulsar built-in connector and use it on a standalone cluster or an on-premises cluster. + +### Standalone cluster + +This example describes how to use the Kafka source connector to feed data from Kafka and write data to Pulsar topics in the standalone mode. + +#### Prerequisites + +- Install [Docker](https://docs.docker.com/get-docker/)(Community Edition). + +#### Steps + +1. Download and start the Confluent Platform. + +For details, see the [documentation](https://docs.confluent.io/platform/current/quickstart/ce-docker-quickstart.html#step-1-download-and-start-cp) to install the Kafka service locally. + +2. Pull a Pulsar image and start Pulsar in standalone mode. 
+ + ```bash + + docker pull apachepulsar/pulsar:latest + + docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-kafka-standalone apachepulsar/pulsar:latest bin/pulsar standalone + + ``` + +3. Create a producer file _kafka-producer.py_. + + ```python + + from kafka import KafkaProducer + producer = KafkaProducer(bootstrap_servers='localhost:9092') + future = producer.send('my-topic', b'hello world') + future.get() + + ``` + +4. Create a consumer file _pulsar-client.py_. + + ```python + + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', subscription_name='my-aa') + + while True: + msg = consumer.receive() + print msg + print dir(msg) + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + + ``` + +5. Copy the following files to Pulsar. + + ```bash + + docker cp pulsar-io-kafka.nar pulsar-kafka-standalone:/pulsar + docker cp kafkaSourceConfig.yaml pulsar-kafka-standalone:/pulsar/conf + + ``` + +6. Open a new terminal window and start the Kafka source connector in local run mode. + + ```bash + + docker exec -it pulsar-kafka-standalone /bin/bash + + ./bin/pulsar-admin source localrun \ + --archive ./pulsar-io-kafka.nar \ + --tenant public \ + --namespace default \ + --name kafka \ + --destination-topic-name my-topic \ + --source-config-file ./conf/kafkaSourceConfig.yaml \ + --parallelism 1 + + ``` + +7. Open a new terminal window and run the Kafka producer locally. + + ```bash + + python3 kafka-producer.py + + ``` + +8. Open a new terminal window and run the Pulsar consumer locally. + + ```bash + + python3 pulsar-client.py + + ``` + +The following information appears on the consumer terminal window. + + ```bash + + Received message: 'hello world' + + ``` + +### On-premises cluster + +This example explains how to create a Kafka source connector in an on-premises cluster. + +1. 
Copy the NAR package of the Kafka connector to the Pulsar connectors directory. + + ``` + + cp pulsar-io-kafka-{{connector:version}}.nar $PULSAR_HOME/connectors/pulsar-io-kafka-{{connector:version}}.nar + + ``` + +2. Reload all [built-in connectors](io-connectors.md). + + ``` + + PULSAR_HOME/bin/pulsar-admin sources reload + + ``` + +3. Check whether the Kafka source connector is available on the list or not. + + ``` + + PULSAR_HOME/bin/pulsar-admin sources available-sources + + ``` + +4. Create a Kafka source connector on a Pulsar cluster using the [`pulsar-admin sources create`](/tools/pulsar-admin/) command. + + ``` + + PULSAR_HOME/bin/pulsar-admin sources create \ + --source-config-file + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-kinesis-sink.md b/site2/website/versioned_docs/version-2.10.x/io-kinesis-sink.md new file mode 100644 index 0000000000000..810068958d13f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-kinesis-sink.md @@ -0,0 +1,82 @@ +--- +id: io-kinesis-sink +title: Kinesis sink connector +sidebar_label: "Kinesis sink connector" +original_id: io-kinesis-sink +--- + +The Kinesis sink connector pulls data from Pulsar and persists data into Amazon Kinesis. + +## Configuration + +The configuration of the Kinesis sink connector has the following property. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`messageFormat`|MessageFormat|true|ONLY_RAW_PAYLOAD|Message format in which Kinesis sink converts Pulsar messages and publishes to Kinesis streams.

    Below are the available options:

  • `ONLY_RAW_PAYLOAD`: Kinesis sink directly publishes Pulsar message payload as a message into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_JSON`: Kinesis sink creates a JSON payload with Pulsar message payload, properties and encryptionCtx, and publishes JSON payload into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_FB`: Kinesis sink creates a flatbuffer serialized payload with Pulsar message payload, properties and encryptionCtx, and publishes flatbuffer payload into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_JSON_EXPAND_VALUE`: Kinesis sink sends a JSON structure containing the record topic name, key, payload, properties and event time. The record schema is used to convert the value to JSON.
  • +`retainOrdering`|boolean|false|false|Whether Pulsar connectors retain ordering when moving messages from Pulsar to Kinesis. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    It is a factory class which creates an AWSCredentialsProvider that is used by Kinesis sink.

    If it is empty, the Kinesis sink creates a default AWSCredentialsProvider which accepts json-map of credentials in `awsCredentialPluginParam`. +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Built-in plugins + +The following are built-in `AwsCredentialProviderPlugin` plugins: + +* `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin` + + This plugin takes no configuration, it uses the default AWS provider chain. + + For more information, see [AWS documentation](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default). + +* `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin` + + This plugin takes a configuration (via the `awsCredentialPluginParam`) that describes a role to assume when running the KCL. + + This configuration takes the form of a small json document like: + + ```json + + {"roleArn": "arn...", "roleSessionName": "name"} + + ``` + +### Example + +Before using the Kinesis sink connector, you need to create a configuration file through one of the following methods. 
+ +* JSON + + ```json + + { + "configs": { + "awsEndpoint": "some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "messageFormat": "ONLY_RAW_PAYLOAD", + "retainOrdering": "true" + } + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + messageFormat: "ONLY_RAW_PAYLOAD" + retainOrdering: "true" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-kinesis-source.md b/site2/website/versioned_docs/version-2.10.x/io-kinesis-source.md new file mode 100644 index 0000000000000..1b45e264680e6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-kinesis-source.md @@ -0,0 +1,83 @@ +--- +id: io-kinesis-source +title: Kinesis source connector +sidebar_label: "Kinesis source connector" +original_id: io-kinesis-source +--- + +The Kinesis source connector pulls data from Amazon Kinesis and persists data into Pulsar. + +This connector uses the [Kinesis Consumer Library](https://github.com/awslabs/amazon-kinesis-client) (KCL) to do the actual consuming of messages. The KCL uses DynamoDB to track state for consumers. + +> Note: currently, the Kinesis source connector only supports raw messages. If you use KMS encrypted messages, the encrypted messages are sent to downstream. This connector will support decrypting messages in the future release. + + +## Configuration + +The configuration of the Kinesis source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`initialPositionInStream`|InitialPositionInStream|false|LATEST|The position where the connector starts from.

    Below are the available options:

  • `AT_TIMESTAMP`: start from the record at or after the specified timestamp.

  • `LATEST`: start after the most recent data record.

  • `TRIM_HORIZON`: start from the oldest available data record.
  • +`startAtTime`|Date|false|" " (empty string)|If `initialPositionInStream` is set to `AT_TIMESTAMP`, `startAtTime` specifies the point in time to start consumption. +`applicationName`|String|false|Pulsar IO connector|The name of the Amazon Kinesis application.

    By default, the application name is included in the user agent string used to make AWS requests. This can assist with troubleshooting, for example, by distinguishing requests made by separate connector instances. +`checkpointInterval`|long|false|60000|The frequency of the Kinesis stream checkpoint in milliseconds. +`backoffTime`|long|false|3000|The amount of time, in milliseconds, to delay between requests when the connector encounters a throttling exception from AWS Kinesis. +`numRetries`|int|false|3|The number of re-attempts when the connector encounters an exception while trying to set a checkpoint. +`receiveQueueSize`|int|false|1000|The maximum number of AWS records that can be buffered inside the connector.

    Once the `receiveQueueSize` is reached, the connector does not consume any messages from Kinesis until some messages in the queue are successfully consumed. +`dynamoEndpoint`|String|false|" " (empty string)|The Dynamo end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`cloudwatchEndpoint`|String|false|" " (empty string)|The Cloudwatch end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`useEnhancedFanOut`|boolean|false|true|If set to true, it uses Kinesis enhanced fan-out.

    If set to false, it uses polling. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    `awsCredentialProviderPlugin` has the following built-in plugins:

  • `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin`:
    this plugin uses the default AWS provider chain.
    For more information, see [using the default credential provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default).

  • `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin`:
    this plugin takes a configuration via the `awsCredentialPluginParam` that describes a role to assume when running the KCL.
    **JSON configuration example**
    `{"roleArn": "arn...", "roleSessionName": "name"}`

    `awsCredentialPluginName` is a factory class which creates an AWSCredentialsProvider that is used by the Kinesis source.

    If `awsCredentialPluginName` is empty, the Kinesis source creates a default AWSCredentialsProvider, which accepts a JSON map of credentials in `awsCredentialPluginParam`.
  • +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Example + +Before using the Kinesis source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "applicationName": "My test application", + "checkpointInterval": "30000", + "backoffTime": "4000", + "numRetries": "3", + "receiveQueueSize": 2000, + "initialPositionInStream": "TRIM_HORIZON", + "startAtTime": "2019-03-05T19:28:58.000Z" + } + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + applicationName: "My test application" + checkpointInterval: 30000 + backoffTime: 4000 + numRetries: 3 + receiveQueueSize: 2000 + initialPositionInStream: "TRIM_HORIZON" + startAtTime: "2019-03-05T19:28:58.000Z" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-mongo-sink.md b/site2/website/versioned_docs/version-2.10.x/io-mongo-sink.md new file mode 100644 index 0000000000000..7fc77ec80cc68 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-mongo-sink.md @@ -0,0 +1,58 @@ +--- +id: io-mongo-sink +title: MongoDB sink connector +sidebar_label: "MongoDB sink connector" +original_id: io-mongo-sink +--- + +The MongoDB sink connector pulls messages from Pulsar topics +and persists the messages to collections. + +## Configuration + +The configuration of the MongoDB sink connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `mongoUri` | String| true| " " (empty string) | The MongoDB URI to which the connector connects.

    For more information, see [connection string URI format](https://docs.mongodb.com/manual/reference/connection-string/). | +| `database` | String| true| " " (empty string)| The database name to which the collection belongs. | +| `collection` | String| true| " " (empty string)| The collection name to which the connector writes messages. | +| `batchSize` | int|false|100 | The batch size of writing messages to collections. | +| `batchTimeMs` |long|false|1000| The batch operation interval in milliseconds. | + + +### Example + +Before using the Mongo sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "mongoUri": "mongodb://localhost:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": "2", + "batchTimeMs": "500" + } + } + + ``` + +* YAML + + ```yaml + + configs: + mongoUri: "mongodb://localhost:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-netty-source.md b/site2/website/versioned_docs/version-2.10.x/io-netty-source.md new file mode 100644 index 0000000000000..2caedf2bce69b --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-netty-source.md @@ -0,0 +1,243 @@ +--- +id: io-netty-source +title: Netty source connector +sidebar_label: "Netty source connector" +original_id: io-netty-source +--- + +The Netty source connector opens a port that accepts incoming data via the configured network protocol +and publish it to user-defined Pulsar topics. + +This connector can be used in a containerized (for example, k8s) deployment. Otherwise, if the connector is running in process or thread mode, the instance may be conflicting on listening to ports. + +## Configuration + +The configuration of the Netty source connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `type` |String| true |tcp | The network protocol over which data is transmitted to netty.

    Below are the available options:
  • tcp
  • http
  • udp
  • | +| `host` | String|true | 127.0.0.1 | The host name or address on which the source instance listen. | +| `port` | int|true | 10999 | The port on which the source instance listen. | +| `numberOfThreads` |int| true |1 | The number of threads of Netty TCP server to accept incoming connections and handle the traffic of accepted connections. | + + +### Example + +Before using the Netty source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "type": "tcp", + "host": "127.0.0.1", + "port": "10911", + "numberOfThreads": "1" + } + } + + ``` + +* YAML + + ```yaml + + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +## Usage + +The following examples show how to use the Netty source connector with TCP and HTTP. + +### TCP + +1. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + + ``` + +4. Download the Netty source connector. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + + ``` + +5. Start the Netty source connector. 
+ + ```bash + + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + + ``` + +6. Consume data. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + + ``` + +7. Open another terminal window to send data to the Netty source. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ apt-get update + + $ apt-get -y install telnet + + $ root@1d19327b2c67:/pulsar# telnet 127.0.0.1 10999 + Trying 127.0.0.1... + Connected to 127.0.0.1. + Escape character is '^]'. + hello + world + + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello + + ----- got message ----- + world + + ``` + +### HTTP + +1. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + + configs: + type: "http" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + + ``` + +4. Download the Netty source connector. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + + ``` + +5. Start the Netty source connector. 
+ + ```bash + + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + + ``` + +6. Consume data. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + + ``` + +7. Open another terminal window to send data to the Netty source. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ curl -X POST --data 'hello, world!' http://127.0.0.1:10999/ + + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello, world! + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-nsq-source.md b/site2/website/versioned_docs/version-2.10.x/io-nsq-source.md new file mode 100644 index 0000000000000..b61e7e100c22e --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-nsq-source.md @@ -0,0 +1,21 @@ +--- +id: io-nsq-source +title: NSQ source connector +sidebar_label: "NSQ source connector" +original_id: io-nsq-source +--- + +The NSQ source connector receives messages from NSQ topics +and writes messages to Pulsar topics. + +## Configuration + +The configuration of the NSQ source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `lookupds` |String| true | " " (empty string) | A comma-separated list of nsqlookupds to connect to. | +| `topic` | String|true | " " (empty string) | The NSQ topic to transport. | +| `channel` | String |false | pulsar-transport-{$topic} | The channel to consume from on the provided NSQ topic. 
| \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/io-overview.md b/site2/website/versioned_docs/version-2.10.x/io-overview.md new file mode 100644 index 0000000000000..82d0cd04a31d7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-overview.md @@ -0,0 +1,163 @@ +--- +id: io-overview +title: Pulsar connector overview +sidebar_label: "Overview" +original_id: io-overview +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Messaging systems are most powerful when you can easily use them with external systems like databases and other messaging systems. + +**Pulsar IO connectors** enable you to easily create, deploy, and manage connectors that interact with external systems, such as [Apache Cassandra](https://cassandra.apache.org), [Aerospike](https://www.aerospike.com), and many others. + + +## Concept + +Pulsar IO connectors come in two types: **source** and **sink**. + +This diagram illustrates the relationship between source, Pulsar, and sink: + +![Pulsar IO diagram](/assets/pulsar-io.png "Pulsar IO connectors (sources and sinks)") + + +### Source + +> Sources **feed data from external systems into Pulsar**. + +Common sources include other messaging systems and firehose-style data pipeline APIs. + +For the complete list of Pulsar built-in source connectors, see [source connector](io-connectors.md#source-connector). + +### Sink + +> Sinks **feed data from Pulsar into external systems**. + +Common sinks include other messaging systems and SQL and NoSQL databases. + +For the complete list of Pulsar built-in sink connectors, see [sink connector](io-connectors.md#sink-connector). + +## Processing guarantee + +Processing guarantees are used to handle errors when writing messages to Pulsar topics. + +> Pulsar connectors and Functions use the **same** processing guarantees as below. 
+ +Delivery semantic | Description +:------------------|:------- +`at-most-once` | Each message sent to a connector is to be **processed once** or **not to be processed**. +`at-least-once` | Each message sent to a connector is to be **processed once** or **more than once**. +`effectively-once` | Each message sent to a connector has **one output associated** with it. + +> Processing guarantees for connectors not just rely on Pulsar guarantee but also **relate to external systems**, that is, **the implementation of source and sink**. + +* Source: Pulsar ensures that writing messages to Pulsar topics respects to the processing guarantees. It is within Pulsar's control. + +* Sink: the processing guarantees rely on the sink implementation. If the sink implementation does not handle retries in an idempotent way, the sink does not respect to the processing guarantees. + +### Set + +When creating a connector, you can set the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +> If `--processing-guarantees` is not specified when creating a connector, the default semantic is `ATLEAST_ONCE`. + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + +````mdx-code-block + + + + +```bash + +$ bin/pulsar-admin sources create \ + --processing-guarantees ATMOST_ONCE \ + # Other source configs + +``` + +For more information about the options of `pulsar-admin sources create`, see [here](reference-connector-admin.md#create). + + + + +```bash + +$ bin/pulsar-admin sinks create \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other sink configs + +``` + +For more information about the options of `pulsar-admin sinks create`, see [here](reference-connector-admin.md#create-1). 
+ + + + +```` + +### Update + +After creating a connector, you can update the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + +````mdx-code-block + + + + +```bash + +$ bin/pulsar-admin sources update \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other source configs + +``` + +For more information about the options of `pulsar-admin sources update`, see [here](reference-connector-admin.md#update). + + + + +```bash + +$ bin/pulsar-admin sinks update \ + --processing-guarantees ATMOST_ONCE \ + # Other sink configs + +``` + +For more information about the options of `pulsar-admin sinks update`, see [here](reference-connector-admin.md#update-1). + + + + +```` + + +## Work with connector + +You can manage Pulsar connectors (for example, create, update, start, stop, restart, reload, delete and perform other operations on connectors) via the `Connector Admin CLI` with sources and sinks subcommands. For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + +Connectors (sources and sinks) and Functions are components of instances, and they all run on Functions workers. When managing a source, sink or function via the `Connector Admin CLI` or [Functions Admin CLI](functions-cli.md), an instance is started on a worker. For more information, see [Functions worker](functions-worker.md#run-functions-worker-separately). 
diff --git a/site2/website/versioned_docs/version-2.10.x/io-quickstart.md b/site2/website/versioned_docs/version-2.10.x/io-quickstart.md new file mode 100644 index 0000000000000..1b6528d49541b --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-quickstart.md @@ -0,0 +1,963 @@ +--- +id: io-quickstart +title: How to connect Pulsar to database +sidebar_label: "Get started" +original_id: io-quickstart +--- + +This tutorial provides a hands-on look at how you can move data out of Pulsar without writing a single line of code. + +It is helpful to review the [concepts](io-overview.md) for Pulsar I/O while running the steps in this guide to gain a deeper understanding. + +At the end of this tutorial, you are able to: + +- [Connect Pulsar to Cassandra](#connect-pulsar-to-cassandra) + +- [Connect Pulsar to PostgreSQL](#connect-pulsar-to-postgresql) + +:::tip + +* These instructions assume you are running Pulsar in [standalone mode](getting-started-standalone.md). However, all +the commands used in this tutorial can be used in a multi-node Pulsar cluster without any changes. +* All the instructions are assumed to run at the root directory of a Pulsar binary distribution. + +::: + +## Install Pulsar and built-in connector + +Before connecting Pulsar to a database, you need to install Pulsar and the desired built-in connector. + +For more information about **how to install a standalone Pulsar and built-in connectors**, see [here](getting-started-standalone.md/#installing-pulsar). + +## Start Pulsar standalone + +1. Start Pulsar locally. + + ```bash + + bin/pulsar standalone + + ``` + + All the components of a Pulsar service are started in order. + + You can curl those pulsar service endpoints to make sure Pulsar service is up and running correctly. + +2. Check Pulsar binary protocol port. + + ```bash + + telnet localhost 6650 + + ``` + +3. Check Pulsar Function cluster. 
+ + ```bash + + curl -s http://localhost:8080/admin/v2/worker/cluster + + ``` + + **Example output** + + ```json + + [{"workerId":"c-standalone-fw-localhost-6750","workerHostname":"localhost","port":6750}] + + ``` + +4. Make sure a public tenant and a default namespace exist. + + ```bash + + curl -s http://localhost:8080/admin/v2/namespaces/public + + ``` + + **Example output** + + ```json + + ["public/default","public/functions"] + + ``` + +5. All built-in connectors should be listed as available. + + ```bash + + curl -s http://localhost:8080/admin/v2/functions/connectors + + ``` + + **Example output** + + ```json + + [{"name":"aerospike","description":"Aerospike database sink","sinkClass":"org.apache.pulsar.io.aerospike.AerospikeStringSink"},{"name":"cassandra","description":"Writes data into Cassandra","sinkClass":"org.apache.pulsar.io.cassandra.CassandraStringSink"},{"name":"kafka","description":"Kafka source and sink connector","sourceClass":"org.apache.pulsar.io.kafka.KafkaStringSource","sinkClass":"org.apache.pulsar.io.kafka.KafkaBytesSink"},{"name":"kinesis","description":"Kinesis sink connector","sinkClass":"org.apache.pulsar.io.kinesis.KinesisSink"},{"name":"rabbitmq","description":"RabbitMQ source connector","sourceClass":"org.apache.pulsar.io.rabbitmq.RabbitMQSource"},{"name":"twitter","description":"Ingest data from Twitter firehose","sourceClass":"org.apache.pulsar.io.twitter.TwitterFireHose"}] + + ``` + + If an error occurs when starting Pulsar service, you may see an exception at the terminal running `pulsar/standalone`, + or you can navigate to the `logs` directory under the Pulsar directory to view the logs. + +## Connect Pulsar to Cassandra + +This section demonstrates how to connect Pulsar to Cassandra. + +:::tip + +* Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). 
+* The Cassandra sink connector reads messages from Pulsar topics and writes the messages into Cassandra tables. For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +::: + +### Setup a Cassandra cluster + +This example uses `cassandra` Docker image to start a single-node Cassandra cluster in Docker. + +1. Start a Cassandra cluster. + + ```bash + + docker run -d --rm --name=cassandra -p 9042:9042 cassandra + + ``` + + :::note + + Before moving to the next steps, make sure the Cassandra cluster is running. + + ::: + +2. Make sure the Docker process is running. + + ```bash + + docker ps + + ``` + +3. Check the Cassandra logs to make sure the Cassandra process is running as expected. + + ```bash + + docker logs cassandra + + ``` + +4. Check the status of the Cassandra cluster. + + ```bash + + docker exec cassandra nodetool status + + ``` + + **Example output** + + ``` + + Datacenter: datacenter1 + ======================= + Status=Up/Down + |/ State=Normal/Leaving/Joining/Moving + -- Address Load Tokens Owns (effective) Host ID Rack + UN 172.17.0.2 103.67 KiB 256 100.0% af0e4b2f-84e0-4f0b-bb14-bd5f9070ff26 rack1 + + ``` + +5. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + + $ docker exec -ti cassandra cqlsh localhost + Connected to Test Cluster at localhost:9042. + [cqlsh 5.0.1 | Cassandra 3.11.2 | CQL spec 3.4.4 | Native protocol v4] + Use HELP for help. + cqlsh> + + ``` + +6. Create a keyspace `pulsar_test_keyspace`. + + ```bash + + cqlsh> CREATE KEYSPACE pulsar_test_keyspace WITH replication = {'class':'SimpleStrategy', 'replication_factor':1}; + + ``` + +7. Create a table `pulsar_test_table`. + + ```bash + + cqlsh> USE pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> CREATE TABLE pulsar_test_table (key text PRIMARY KEY, col text); + + ``` + +### Configure a Cassandra sink + +Now that we have a Cassandra cluster running locally. + +In this section, you need to configure a Cassandra sink connector. 
+ +To run a Cassandra sink connector, you need to prepare a configuration file including the information that Pulsar connector runtime needs to know. + +For example, how Pulsar connector can find the Cassandra cluster, what is the keyspace and the table that Pulsar connector uses for writing Pulsar messages to, and so on. + +You can create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + + ``` + +* YAML + + ```yaml + + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + + ``` + +For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +### Create a Cassandra sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to create a sink connector and perform other operations on them. + +Run the following command to create a Cassandra sink connector with sink type _cassandra_ and the config file _examples/cassandra-sink.yml_ created previously. + +#### Note +> The `sink-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. + +```bash + +bin/pulsar-admin sinks create \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink \ + --sink-type cassandra \ + --sink-config-file examples/cassandra-sink.yml \ + --inputs test_cassandra + +``` + +Once the command is executed, Pulsar creates the sink connector _cassandra-test-sink_. + +This sink connector runs +as a Pulsar Function and writes the messages produced in the topic _test_cassandra_ to the Cassandra table _pulsar_test_table_. + +### Inspect a Cassandra sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to monitor a connector and perform other operations on it. 
+ +* Get the information of a Cassandra sink. + + ```bash + + bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + **Example output** + + ```json + + { + "tenant": "public", + "namespace": "default", + "name": "cassandra-test-sink", + "className": "org.apache.pulsar.io.cassandra.CassandraStringSink", + "inputSpecs": { + "test_cassandra": { + "isRegexPattern": false + } + }, + "configs": { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true, + "archive": "builtin://cassandra" + } + + ``` + +* Check the status of a Cassandra sink. + + ```bash + + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + **Example output** + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +### Verify a Cassandra sink + +1. Produce some messages to the input topic of the Cassandra sink _test_cassandra_. + + ```bash + + for i in {0..9}; do bin/pulsar-client produce -m "key-$i" -n 1 test_cassandra; done + + ``` + +2. Inspect the status of the Cassandra sink _test_cassandra_. + + ```bash + + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + You can see 10 messages are processed by the Cassandra sink _test_cassandra_. 
+ + **Example output** + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 10, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 10, + "lastReceivedTime" : 1551685489136, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +3. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + + docker exec -ti cassandra cqlsh localhost + + ``` + +4. Check the data of the Cassandra table _pulsar_test_table_. + + ```bash + + cqlsh> use pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> select * from pulsar_test_table; + + key | col + --------+-------- + key-5 | key-5 + key-0 | key-0 + key-9 | key-9 + key-2 | key-2 + key-1 | key-1 + key-3 | key-3 + key-6 | key-6 + key-7 | key-7 + key-4 | key-4 + key-8 | key-8 + + ``` + +### Delete a Cassandra Sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to delete a connector and perform other operations on it. + +```bash + +bin/pulsar-admin sinks delete \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + +``` + +## Connect Pulsar to PostgreSQL + +This section demonstrates how to connect Pulsar to PostgreSQL. + +:::tip + +* Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). +* The JDBC sink connector pulls messages from Pulsar topics and persists the messages to ClickHouse, MariaDB, PostgreSQL, or SQlite. + +::: + +>For more information, see [JDBC sink connector](io-jdbc-sink.md). + + +### Setup a PostgreSQL cluster + +This example uses the PostgreSQL 12 docker image to start a single-node PostgreSQL cluster in Docker. + +1. Pull the PostgreSQL 12 image from Docker. + + ```bash + + $ docker pull postgres:12 + + ``` + +2. Start PostgreSQL. 
+ + ```bash + + $ docker run -d -it --rm \ + --name pulsar-postgres \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=password \ + -e POSTGRES_USER=postgres \ + postgres:12 + + ``` + + #### Tip + + Flag | Description | This example + ---|---|---| + `-d` | To start a container in detached mode. | / + `-it` | Keep STDIN open even if not attached and allocate a terminal. | / + `--rm` | Remove the container automatically when it exits. | / + `--name` | Assign a name to the container. | This example specifies _pulsar-postgres_ for the container. + `-p` | Publish the port of the container to the host. | This example publishes the port _5432_ of the container to the host. + `-e` | Set environment variables. | This example sets the following variables:
    - The password for the user is _password_.
    - The name for the user is _postgres_. + + :::tip + + For more information about Docker commands, see [Docker CLI](https://docs.docker.com/engine/reference/commandline/run/). + + ::: + +3. Check if PostgreSQL has been started successfully. + + ```bash + + $ docker logs -f pulsar-postgres + + ``` + + PostgreSQL has been started successfully if the following message appears. + + ```text + + 2020-05-11 20:09:24.492 UTC [1] LOG: starting PostgreSQL 12.2 (Debian 12.2-2.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit + 2020-05-11 20:09:24.492 UTC [1] LOG: listening on IPv4 address "0.0.0.0", port 5432 + 2020-05-11 20:09:24.492 UTC [1] LOG: listening on IPv6 address "::", port 5432 + 2020-05-11 20:09:24.499 UTC [1] LOG: listening on Unix socket "/var/run/postgresql/.s.PGSQL.5432" + 2020-05-11 20:09:24.523 UTC [55] LOG: database system was shut down at 2020-05-11 20:09:24 UTC + 2020-05-11 20:09:24.533 UTC [1] LOG: database system is ready to accept connections + + ``` + +4. Access to PostgreSQL. + + ```bash + + $ docker exec -it pulsar-postgres /bin/bash + + ``` + +5. Create a PostgreSQL table _pulsar_postgres_jdbc_sink_. + + ```bash + + $ psql -U postgres postgres + + postgres=# create table if not exists pulsar_postgres_jdbc_sink + ( + id serial PRIMARY KEY, + name VARCHAR(255) NOT NULL + ); + + ``` + +### Configure a JDBC sink + +Now we have a PostgreSQL running locally. + +In this section, you need to configure a JDBC sink connector. + +1. Add a configuration file. + + To run a JDBC sink connector, you need to prepare a YAML configuration file including the information that Pulsar connector runtime needs to know. + + For example, how Pulsar connector can find the PostgreSQL cluster, what is the JDBC URL and the table that Pulsar connector uses for writing messages. + + Create a _pulsar-postgres-jdbc-sink.yaml_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. 
+ + ```yaml + + configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/postgres" + tableName: "pulsar_postgres_jdbc_sink" + + ``` + +2. Create a schema. + + Create a _avro-schema_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. + + ```json + + { + "type": "AVRO", + "schema": "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}", + "properties": {} + } + + ``` + + :::tip + + For more information about AVRO, see [Apache Avro](https://avro.apache.org/docs/1.9.1/). + + ::: + +3. Upload a schema to a topic. + + This example uploads the _avro-schema_ schema to the _pulsar-postgres-jdbc-sink-topic_ topic. + + ```bash + + $ bin/pulsar-admin schemas upload pulsar-postgres-jdbc-sink-topic -f ./connectors/avro-schema + + ``` + +4. Check if the schema has been uploaded successfully. + + ```bash + + $ bin/pulsar-admin schemas get pulsar-postgres-jdbc-sink-topic + + ``` + + The schema has been uploaded successfully if the following message appears. + + ```json + + {"name":"pulsar-postgres-jdbc-sink-topic","schema":"{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}","type":"AVRO","properties":{}} + + ``` + +### Create a JDBC sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to create a sink connector and perform other operations on it. + +This example creates a sink connector and specifies the desired information. 
+ +```bash + +$ bin/pulsar-admin sinks create \ +--archive ./connectors/pulsar-io-jdbc-postgres-@pulsar:version@.nar \ +--inputs pulsar-postgres-jdbc-sink-topic \ +--name pulsar-postgres-jdbc-sink \ +--sink-config-file ./connectors/pulsar-postgres-jdbc-sink.yaml \ +--parallelism 1 + +``` + +Once the command is executed, Pulsar creates a sink connector _pulsar-postgres-jdbc-sink_. + +This sink connector runs as a Pulsar Function and writes the messages produced in the topic _pulsar-postgres-jdbc-sink-topic_ to the PostgreSQL table _pulsar_postgres_jdbc_sink_. + + #### Tip + + Flag | Description | This example + ---|---|---| + `--archive` | The path to the archive file for the sink. | _pulsar-io-jdbc-postgres-@pulsar:version@.nar_ | + `--inputs` | The input topic(s) of the sink.

    Multiple topics can be specified as a comma-separated list.|| + `--name` | The name of the sink. | _pulsar-postgres-jdbc-sink_ | + `--sink-config-file` | The path to a YAML config file specifying the configuration of the sink. | _pulsar-postgres-jdbc-sink.yaml_ | + `--parallelism` | The parallelism factor of the sink.

    For example, the number of sink instances to run. | _1_ | + +:::tip + +For more information about `pulsar-admin sinks create options`, see [Pulsar admin docs](/tools/pulsar-admin/). + +::: + +The sink has been created successfully if the following message appears. + +```bash + +Created successfully + +``` + +### Inspect a JDBC sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to monitor a connector and perform other operations on it. + +* List all running JDBC sink(s). + + ```bash + + $ bin/pulsar-admin sinks list \ + --tenant public \ + --namespace default + + ``` + + :::tip + + For more information about `pulsar-admin sinks list options`, see [Pulsar admin docs](/tools/pulsar-admin/). + + ::: + + The result shows that only the _postgres-jdbc-sink_ sink is running. + + ```json + + [ + "pulsar-postgres-jdbc-sink" + ] + + ``` + +* Get the information of a JDBC sink. + + ```bash + + $ bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name pulsar-postgres-jdbc-sink + + ``` + + :::tip + + For more information about `pulsar-admin sinks get options`, see [Pulsar admin docs](/tools/pulsar-admin/). + + ::: + + The result shows the information of the sink connector, including tenant, namespace, topic and so on. 
+ + ```json + + { + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true + } + + ``` + +* Get the status of a JDBC sink + + ```bash + + $ bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name pulsar-postgres-jdbc-sink + + ``` + + :::tip + + For more information about `pulsar-admin sinks status options`, see [Pulsar admin docs](/tools/pulsar-admin/). + + ::: + + The result shows the current status of sink connector, including the number of instances, running status, worker ID and so on. + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-192.168.2.52-8080" + } + } ] + } + + ``` + +### Stop a JDBC sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to stop a connector and perform other operations on it. + +```bash + +$ bin/pulsar-admin sinks stop \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks stop options`, see [Pulsar admin docs](/tools/pulsar-admin/). + +::: + +The sink instance has been stopped successfully if the following message appears. 
+ +```bash + +Stopped successfully + +``` + +### Restart a JDBC sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to restart a connector and perform other operations on it. + +```bash + +$ bin/pulsar-admin sinks restart \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks restart options`, see [Pulsar admin docs](/tools/pulsar-admin/). + +::: + +The sink instance has been started successfully if the following message appears. + +```bash + +Started successfully + +``` + +:::tip + +* Optionally, you can run a standalone sink connector using `pulsar-admin sinks localrun options`. +Note that `pulsar-admin sinks localrun options` **runs a sink connector locally**, while `pulsar-admin sinks start options` **starts a sink connector in a cluster**. +* For more information about `pulsar-admin sinks localrun options`, see [Pulsar admin docs](/tools/pulsar-admin/). + +::: + +### Update a JDBC sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to update a connector and perform other operations on it. + +This example updates the parallelism of the _pulsar-postgres-jdbc-sink_ sink connector to 2. + +```bash + +$ bin/pulsar-admin sinks update \ +--name pulsar-postgres-jdbc-sink \ +--parallelism 2 + +``` + +:::tip + +For more information about `pulsar-admin sinks update options`, see [Pulsar admin docs](/tools/pulsar-admin/). + +::: + +The sink connector has been updated successfully if the following message appears. + +```bash + +Updated successfully + +``` + +This example double-checks the information. + +```bash + +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +The result shows that the parallelism is 2. 
+ +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 2, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +### Delete a JDBC sink + +You can use the [Connector Admin CLI](/tools/pulsar-admin/) +to delete a connector and perform other operations on it. + +This example deletes the _pulsar-postgres-jdbc-sink_ sink connector. + +```bash + +$ bin/pulsar-admin sinks delete \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks delete options`, see [Pulsar admin docs](/tools/pulsar-admin/). + +::: + +The sink connector has been deleted successfully if the following message appears. + +```text + +Deleted successfully + +``` + +This example double-checks the status of the sink connector. + +```bash + +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +The result shows that the sink connector does not exist. 
+ +```text + +HTTP 404 Not Found + +Reason: Sink pulsar-postgres-jdbc-sink doesn't exist + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-rabbitmq-sink.md b/site2/website/versioned_docs/version-2.10.x/io-rabbitmq-sink.md new file mode 100644 index 0000000000000..1bf8b7bd5c83a --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-rabbitmq-sink.md @@ -0,0 +1,87 @@ +--- +id: io-rabbitmq-sink +title: RabbitMQ sink connector +sidebar_label: "RabbitMQ sink connector" +original_id: io-rabbitmq-sink +--- + +The RabbitMQ sink connector pulls messages from Pulsar topics +and persists the messages to RabbitMQ queues. + + +## Configuration + +The configuration of the RabbitMQ sink connector has the following properties. + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name. | +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `exchangeName` | String|true | " " (empty string) | The exchange to publish messages. | +| `routingKey` |String|true | " " (empty string) |The routing key used to publish messages. | + + +### Example + +Before using the RabbitMQ sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "exchangeName": "test-exchange", + "routingKey": "test-key" + } + } + + ``` + +* YAML + + ```yaml + + configs: + host: "localhost" + port: 5672 + virtualHost: "/" + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + exchangeName: "test-exchange" + routingKey: "test-key" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-rabbitmq-source.md b/site2/website/versioned_docs/version-2.10.x/io-rabbitmq-source.md new file mode 100644 index 0000000000000..0dbf51e15856e --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-rabbitmq-source.md @@ -0,0 +1,87 @@ +--- +id: io-rabbitmq-source +title: RabbitMQ source connector +sidebar_label: "RabbitMQ source connector" +original_id: io-rabbitmq-source +--- + +The RabbitMQ source connector receives messages from RabbitMQ clusters +and writes messages to Pulsar topics. + +## Configuration + +The configuration of the RabbitMQ source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name. 
| +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `prefetchCount` | int|false | 0 | The maximum number of messages that the server delivers.

    0 means unlimited. | +| `prefetchGlobal` | boolean|false | false |Whether the setting should be applied to the entire channel rather than each consumer. | +| `passive` | boolean|false | false | Whether the rabbitmq consumer should create its own queue or bind to an existing one. | + +### Example + +Before using the RabbitMQ source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "prefetchCount": "0", + "prefetchGlobal": "false", + "passive": "false" + } + } + + ``` + +* YAML + + ```yaml + + configs: + host: "localhost" + port: 5672 + virtualHost: "/" + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + prefetchCount: 0 + prefetchGlobal: "false" + passive: "false" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-redis-sink.md b/site2/website/versioned_docs/version-2.10.x/io-redis-sink.md new file mode 100644 index 0000000000000..9efd6ed863769 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-redis-sink.md @@ -0,0 +1,158 @@ +--- +id: io-redis-sink +title: Redis sink connector +sidebar_label: "Redis sink connector" +original_id: io-redis-sink +--- + +The Redis sink connector pulls messages from Pulsar topics +and persists the messages to a Redis database. + + + +## Configuration + +The configuration of the Redis sink connector has the following properties. 
+ + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `redisHosts` |String|true|" " (empty string) | A comma-separated list of Redis hosts to connect to. | +| `redisPassword` |String|false|" " (empty string) | The password used to connect to Redis. | +| `redisDatabase` | int|true|0 | The Redis database to connect to. | +| `clientMode` |String| false|Standalone | The client mode when interacting with Redis cluster.

    Below are the available options:
  • Standalone
  • Cluster
  • | +| `autoReconnect` | boolean|false|true | Whether the Redis client automatically reconnect or not. | +| `requestQueue` | int|false|2147483647 | The maximum number of queued requests to Redis. | +| `tcpNoDelay` |boolean| false| false | Whether to enable TCP with no delay or not. | +| `keepAlive` | boolean|false | false |Whether to enable a keepalive to Redis or not. | +| `connectTimeout` |long| false|10000 | The time to wait before timing out when connecting in milliseconds. | +| `operationTimeout` | long|false|10000 | The time before an operation is marked as timed out in milliseconds . | +| `batchTimeMs` | int|false|1000 | The Redis operation time in milliseconds. | +| `batchSize` | int|false|200 | The batch size of writing to Redis database. | + + +### Example + +Before using the Redis sink connector, you need to create a configuration file in the path you will start Pulsar service (i.e. `PULSAR_HOME`) through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "redisHosts": "localhost:6379", + "redisPassword": "mypassword", + "redisDatabase": "0", + "clientMode": "Standalone", + "operationTimeout": "2000", + "batchSize": "1", + "batchTimeMs": "1000", + "connectTimeout": "3000" + } + } + + ``` + +* YAML + + ```yaml + + configs: + redisHosts: "localhost:6379" + redisPassword: "mypassword" + redisDatabase: 0 + clientMode: "Standalone" + operationTimeout: 2000 + batchSize: 1 + batchTimeMs: 1000 + connectTimeout: 3000 + + ``` + +### Usage + +This example shows how to write records to a Redis database using the Pulsar Redis connector. + +1. Start a Redis server. + + ```bash + + $ docker pull redis:5.0.5 + $ docker run -d -p 6379:6379 --name my-redis redis:5.0.5 --requirepass "mypassword" + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + + Make sure the NAR file is available at `connectors/pulsar-io-redis-@pulsar:version@.nar`. + +3. 
Start the Pulsar Redis connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-redis-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name my-redis-sink \ + --sink-config '{"redisHosts": "localhost:6379","redisPassword": "mypassword","redisDatabase": "0","clientMode": "Standalone","operationTimeout": "3000","batchSize": "1"}' \ + --inputs my-redis-topic + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-redis-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name my-redis-sink \ + --sink-config-file redis-sink-config.yaml \ + --inputs my-redis-topic + + ``` + +4. Publish records to the topic. + + ```bash + + $ bin/pulsar-client produce \ + persistent://public/default/my-redis-topic \ + -k "streaming" \ + -m "Pulsar" + + ``` + +5. Start a Redis client in Docker. + + ```bash + + $ docker exec -it my-redis redis-cli -a "mypassword" + + ``` + +6. Check the key/value in Redis. + + ``` + + 127.0.0.1:6379> keys * + 1) "streaming" + 127.0.0.1:6379> get "streaming" + "Pulsar" + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-solr-sink.md b/site2/website/versioned_docs/version-2.10.x/io-solr-sink.md new file mode 100644 index 0000000000000..d8b09db61faef --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-solr-sink.md @@ -0,0 +1,67 @@ +--- +id: io-solr-sink +title: Solr sink connector +sidebar_label: "Solr sink connector" +original_id: io-solr-sink +--- + +The Solr sink connector pulls messages from Pulsar topics +and persists the messages to Solr collections. + + + +## Configuration + +The configuration of the Solr sink connector has the following properties. 
+ + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `solrUrl` | String|true|" " (empty string) |
  • Comma-separated zookeeper hosts with chroot used in the SolrCloud mode.
    **Example**
    `localhost:2181,localhost:2182/chroot`

  • URL to connect to Solr used in standalone mode.
    **Example**
    `localhost:8983/solr`
  • | +| `solrMode` | String|true|SolrCloud| The client mode when interacting with the Solr cluster.

    Below are the available options:
  • Standalone
  • SolrCloud
  • | +| `solrCollection` |String|true| " " (empty string) | Solr collection name to which records need to be written. | +| `solrCommitWithinMs` |int| false|10 | The time, in milliseconds, within which Solr commits updates.| +| `username` |String|false| " " (empty string) | The username for basic authentication.

    **Note: `username` is case-sensitive.** | +| `password` | String|false| " " (empty string) | The password for basic authentication.

    **Note: `password` is case-sensitive.** | + + + +### Example + +Before using the Solr sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "configs": { + "solrUrl": "localhost:2181,localhost:2182/chroot", + "solrMode": "SolrCloud", + "solrCollection": "techproducts", + "solrCommitWithinMs": 100, + "username": "fakeuser", + "password": "fake@123" + } + } + + ``` + +* YAML + + ```yaml + + { + solrUrl: "localhost:2181,localhost:2182/chroot" + solrMode: "SolrCloud" + solrCollection: "techproducts" + solrCommitWithinMs: 100 + username: "fakeuser" + password: "fake@123" + } + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/io-twitter-source.md b/site2/website/versioned_docs/version-2.10.x/io-twitter-source.md new file mode 100644 index 0000000000000..8de3504dd0fef --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-twitter-source.md @@ -0,0 +1,28 @@ +--- +id: io-twitter-source +title: Twitter Firehose source connector +sidebar_label: "Twitter Firehose source connector" +original_id: io-twitter-source +--- + +The Twitter Firehose source connector receives tweets from Twitter Firehose and +writes the tweets to Pulsar topics. + +## Configuration + +The configuration of the Twitter Firehose source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `consumerKey` | String|true | " " (empty string) | The twitter OAuth consumer key.

    For more information, see [Access tokens](https://developer.twitter.com/en/docs/basics/authentication/guides/access-tokens). | +| `consumerSecret` | String |true | " " (empty string) | The twitter OAuth consumer secret. | +| `token` | String|true | " " (empty string) | The twitter OAuth token. | +| `tokenSecret` | String|true | " " (empty string) | The twitter OAuth secret. | +| `guestimateTweetTime`|Boolean|false|false|Most firehose events have null createdAt time.

    If `guestimateTweetTime` set to true, the connector estimates the createdTime of each firehose event to be current time. +| `clientName` | String |false | openconnector-twitter-source| The twitter firehose client name. | +| `clientHosts` |String| false | Constants.STREAM_HOST | The twitter firehose hosts to which client connects. | +| `clientBufferSize` | int|false | 50000 | The buffer size for buffering tweets fetched from twitter firehose. | + +> For more information about OAuth credentials, see [Twitter developers portal](https://developer.twitter.com/en.html). diff --git a/site2/website/versioned_docs/version-2.10.x/io-twitter.md b/site2/website/versioned_docs/version-2.10.x/io-twitter.md new file mode 100644 index 0000000000000..3b2f6325453c3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-twitter.md @@ -0,0 +1,7 @@ +--- +id: io-twitter +title: Twitter Firehose Connector +sidebar_label: "Twitter Firehose Connector" +original_id: io-twitter +--- + diff --git a/site2/website/versioned_docs/version-2.10.x/io-use.md b/site2/website/versioned_docs/version-2.10.x/io-use.md new file mode 100644 index 0000000000000..5746faea4eaff --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/io-use.md @@ -0,0 +1,1787 @@ +--- +id: io-use +title: How to use Pulsar connectors +sidebar_label: "Use" +original_id: io-use +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide describes how to use Pulsar connectors. + +## Install a connector + +Pulsar bundles several [builtin connectors](io-connectors.md) used to move data in and out of commonly used systems (such as database and messaging system). Optionally, you can create and use your desired non-builtin connectors. + +:::note + +When using a non-builtin connector, you need to specify the path of an archive file for the connector. 
+ +::: + +To set up a builtin connector, follow +the instructions [here](getting-started-standalone.md#installing-builtin-connectors). + +After the setup, the builtin connector is automatically discovered by Pulsar brokers (or function-workers), so no additional installation steps are required. + +## Configure a connector + +You can configure the following information: + +* [Configure a default storage location for a connector](#configure-a-default-storage-location-for-a-connector) + +* [Configure a connector with a YAML file](#configure-a-connector-with-yaml-file) + +### Configure a default storage location for a connector + +To configure a default folder for builtin connectors, set the `connectorsDirectory` parameter in the `./conf/functions_worker.yml` configuration file. + +**Example** + +Set the `./connectors` folder as the default storage location for builtin connectors. + +``` + +######################## +# Connectors +######################## + +connectorsDirectory: ./connectors + +``` + +### Configure a connector with a YAML file + +To configure a connector, you need to provide a YAML configuration file when creating a connector. + +The YAML configuration file tells Pulsar where to locate connectors and how to connect connectors with Pulsar topics. + +**Example 1** + +Below is a YAML configuration file of a Cassandra sink, which tells Pulsar: + +* Which Cassandra cluster to connect + +* What is the `keyspace` and `columnFamily` to be used in Cassandra for collecting data + +* How to map Pulsar messages into Cassandra table key and columns + +```shell + +tenant: public +namespace: default +name: cassandra-test-sink +... +# cassandra specific config +configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + +``` + +**Example 2** + +Below is a YAML configuration file of a Kafka source. 
+ +```shell + +configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: "false" + +``` + +**Example 3** + +Below is a YAML configuration file of a PostgreSQL JDBC sink. + +```shell + +configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/test_jdbc" + tableName: "test_jdbc" + +``` + +## Get available connectors + +Before starting using connectors, you can perform the following operations: + +* [Reload connectors](#reload) + +* [Get a list of available connectors](#get-available-connectors) + +### `reload` + +If you add or delete a nar file in a connector folder, reload the available builtin connector before using it. + +#### Source + +Use the `reload` subcommand. + +```shell + +$ pulsar-admin sources reload + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + +#### Sink + +Use the `reload` subcommand. + +```shell + +$ pulsar-admin sinks reload + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + +### `available` + +After reloading connectors (optional), you can get a list of available connectors. + +#### Source + +Use the `available-sources` subcommand. + +```shell + +$ pulsar-admin sources available-sources + +``` + +#### Sink + +Use the `available-sinks` subcommand. + +```shell + +$ pulsar-admin sinks available-sinks + +``` + +## Run a connector + +To run a connector, you can perform the following operations: + +* [Create a connector](#create) + +* [Start a connector](#start) + +* [Run a connector locally](#localrun) + +### `create` + +You can create a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Create a source connector. + +````mdx-code-block + + + + +Use the `create` subcommand. 
+ +``` + +$ pulsar-admin sources create options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/registerSource?version=@pulsar:version_number@} + + + + +* Create a source connector with a **local file**. + + ```java + + void createSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + |Name|Description + |---|--- + `sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#createSource-SourceConfig-java.lang.String-). + +* Create a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void createSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sourceConfig` | The source configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSourceWithUrl`](/api/admin/org/apache/pulsar/client/admin/Source.html#createSourceWithUrl-SourceConfig-java.lang.String-). + + + + +```` + +#### Sink + +Create a sink connector. + +````mdx-code-block + + + + +Use the `create` subcommand. + +``` + +$ pulsar-admin sinks create options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). 
+ + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/registerSink?version=@pulsar:version_number@} + + + + +* Create a sink connector with a **local file**. + + ```java + + void createSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + |Name|Description + |---|--- + `sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#createSink-SinkConfig-java.lang.String-). + +* Create a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void createSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sinkConfig` | The sink configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSinkWithUrl`](/api/admin/org/apache/pulsar/client/admin/Sink.html#createSinkWithUrl-SinkConfig-java.lang.String-). + + + + +```` + +### `start` + +You can start a connector using **Admin CLI** or **REST API**. + +#### Source + +Start a source connector. + +````mdx-code-block + + + + +Use the `start` subcommand. + +``` + +$ pulsar-admin sources start options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Start **all** source connectors. 
+ + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/start|operation/startSource?version=@pulsar:version_number@} + +* Start a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/start|operation/startSource?version=@pulsar:version_number@} + + + + +```` + +#### Sink + +Start a sink connector. + +````mdx-code-block + + + + +Use the `start` subcommand. + +``` + +$ pulsar-admin sinks start options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Start **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/start|operation/startSink?version=@pulsar:version_number@} + +* Start a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/start|operation/startSink?version=@pulsar:version_number@} + + + + +```` + +### `localrun` + +You can run a connector locally rather than deploying it on a Pulsar cluster using **Admin CLI**. + +#### Source + +Run a source connector locally. + +````mdx-code-block + + + + +Use the `localrun` subcommand. + +``` + +$ pulsar-admin sources localrun options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +```` + +#### Sink + +Run a sink connector locally. + +````mdx-code-block + + + + +Use the `localrun` subcommand. + +``` + +$ pulsar-admin sinks localrun options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/).
+ + + + +```` + +## Monitor a connector + +To monitor a connector, you can perform the following operations: + +* [Get the information of a connector](#get) + +* [Get the list of all running connectors](#list) + +* [Get the current status of a connector](#status) + +### `get` + +You can get the information of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the information of a source connector. + +````mdx-code-block + + + + +Use the `get` subcommand. + +``` + +$ pulsar-admin sources get options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/getSourceInfo?version=@pulsar:version_number@} + + + + +```java + +SourceConfig getSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + +``` + +**Example** + +This is a sourceConfig. + +```java + +{ + "tenant": "tenantName", + "namespace": "namespaceName", + "name": "sourceName", + "className": "className", + "topicName": "topicName", + "configs": {}, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} + +``` + +This is a sourceConfig example. 
+ +``` + +{ + "tenant": "public", + "namespace": "default", + "name": "debezium-mysql-source", + "className": "org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource", + "topicName": "debezium-mysql-topic", + "configs": { + "database.user": "debezium", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.port": "3306", + "database.hostname": "localhost", + "database.password": "dbz", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.whitelist": "inventory", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "database.history.pulsar.topic": "history-topic2" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException.NotFoundException` | Cluster doesn't exist +`PulsarAdminException` | Unexpected error + +For more information, see [`getSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#getSource-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +#### Sink + +Get the information of a sink connector. + +````mdx-code-block + + + + +Use the `get` subcommand. + +``` + +$ pulsar-admin sinks get options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). 
+ + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/getSinkInfo?version=@pulsar:version_number@} + + + + +```java + +SinkConfig getSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + +``` + +**Example** + +This is a sinkConfig. + +```json + +{ +"tenant": "tenantName", +"namespace": "namespaceName", +"name": "sinkName", +"className": "className", +"inputSpecs": { +"topicName": { + "isRegexPattern": false +} +}, +"configs": {}, +"parallelism": 1, +"processingGuarantees": "ATLEAST_ONCE", +"retainOrdering": false, +"autoAck": true +} + +``` + +This is a sinkConfig example. + +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +**Parameter description** + +Name| Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +For more information, see [`getSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#getSink-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +### `list` + +You can get the list of all running connectors using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the list of all running source connectors. + +````mdx-code-block + + + + +Use the `list` subcommand. + +``` + +$ pulsar-admin sources list options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). 
+ + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace|operation/listSources?version=@pulsar:version_number@} + + + + +```java + +List<String> listSources(String tenant, + String namespace) + throws PulsarAdminException + +``` + +**Response example** + +```java + +["f1", "f2", "f3"] + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSources`](/api/admin/org/apache/pulsar/client/admin/Source.html#listSources-java.lang.String-java.lang.String-). + + + + +```` + +#### Sink + +Get the list of all running sink connectors. + +````mdx-code-block + + + + +Use the `list` subcommand. + +``` + +$ pulsar-admin sinks list options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace|operation/listSinks?version=@pulsar:version_number@} + + + + +```java + +List<String> listSinks(String tenant, + String namespace) + throws PulsarAdminException + +``` + +**Response example** + +```java + +["f1", "f2", "f3"] + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSinks`](/api/admin/org/apache/pulsar/client/admin/Sink.html#listSinks-java.lang.String-java.lang.String-). + + + + +```` + +### `status` + +You can get the current status of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the current status of a source connector. + +````mdx-code-block + + + + +Use the `status` subcommand.
+ +``` + +$ pulsar-admin sources status options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Get the current status of **all** source connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/status|operation/getSourceStatus?version=@pulsar:version_number@} + +* Get the current status of a **specified** source connector. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/status|operation/getSourceStatus?version=@pulsar:version_number@} + + + + +* Get the current status of **all** source connectors. + + ```java + + SourceStatus getSourceStatus(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + Name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](/api/admin/org/apache/pulsar/client/admin/Source.html#getSourceStatus-java.lang.String-java.lang.String-java.lang.String-). + +* Get the current status of a **specified** source connector. + + ```java + + SourceStatus.SourceInstanceStatus.SourceInstanceStatusData getSourceStatus(String tenant, + String namespace, + String source, + int id) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `id` | Source instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](/api/admin/org/apache/pulsar/client/admin/Source.html#getSourceStatus-java.lang.String-java.lang.String-java.lang.String-int-).
+ + + + +```` + +#### Sink + +Get the current status of a Pulsar sink connector. + +````mdx-code-block + + + + +Use the `status` subcommand. + +``` + +$ pulsar-admin sinks status options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Get the current status of **all** sink connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName/status|operation/getSinkStatus?version=@pulsar:version_number@} + +* Get the current status of a **specified** sink connector. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/status|operation/getSinkInstanceStatus?version=@pulsar:version_number@} + + + + +* Get the current status of **all** sink connectors. + + ```java + + SinkStatus getSinkStatus(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatus`](/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-). + +* Get the current status of a **specified** sink connector.
+ + ```java + + SinkStatus.SinkInstanceStatus.SinkInstanceStatusData getSinkStatus(String tenant, + String namespace, + String sink, + int id) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `id` | Sink instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatusWithInstanceID`](/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Update a connector + +### `update` + +You can update a running connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Update a running Pulsar source connector. + +````mdx-code-block + + + + +Use the `update` subcommand. + +``` + +$ pulsar-admin sources update options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/updateSource?version=@pulsar:version_number@} + + + + +* Update a running source connector with a **local file**. + + ```java + + void updateSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#updateSource-SourceConfig-java.lang.String-).
+ +* Update a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void updateSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sourceConfig` | The source configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + +For more information, see [`createSourceWithUrl`](/api/admin/org/apache/pulsar/client/admin/Source.html#updateSourceWithUrl-SourceConfig-java.lang.String-). + + + + +```` + +#### Sink + +Update a running Pulsar sink connector. + +````mdx-code-block + + + + +Use the `update` subcommand. + +``` + +$ pulsar-admin sinks update options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/updateSink?version=@pulsar:version_number@} + + + + +* Update a running sink connector with a **local file**. 
+ + ```java + + void updateSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSink-SinkConfig-java.lang.String-). + +* Update a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void updateSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sinkConfig` | The sink configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + |`PulsarAdminException.NotFoundException` | Cluster doesn't exist + |`PulsarAdminException` | Unexpected error + +For more information, see [`updateSinkWithUrl`](/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSinkWithUrl-SinkConfig-java.lang.String-). + + + + +```` + +## Stop a connector + +### `stop` + +You can stop a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Stop a source connector. + +````mdx-code-block + + + + +Use the `stop` subcommand. + +``` + +$ pulsar-admin sources stop options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Stop **all** source connectors. 
+ + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/stopSource?version=@pulsar:version_number@} + +* Stop a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId|operation/stopSource?version=@pulsar:version_number@} + + + + +* Stop **all** source connectors. + + ```java + + void stopSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** source connector. + + ```java + + void stopSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Stop a sink connector. + +````mdx-code-block + + + + +Use the `stop` subcommand. + +``` + +$ pulsar-admin sinks stop options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Stop **all** sink connectors. 
+ + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/stop|operation/stopSink?version=@pulsar:version_number@} + +* Stop a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/stop|operation/stopSink?version=@pulsar:version_number@} + + + + +* Stop **all** sink connectors. + + ```java + + void stopSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** sink connector. + + ```java + + void stopSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `instanceId` | Sink instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Restart a connector + +### `restart` + +You can restart a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Restart a source connector. + +````mdx-code-block + + + + +Use the `restart` subcommand. + +``` + +$ pulsar-admin sources restart options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Restart **all** source connectors.
+ + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/restart|operation/restartSource?version=@pulsar:version_number@} + +* Restart a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/restart|operation/restartSource?version=@pulsar:version_number@} + + + + +* Restart **all** source connectors. + + ```java + + void restartSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** source connector. + + ```java + + void restartSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Restart a sink connector. + +````mdx-code-block + + + + +Use the `restart` subcommand. + +``` + +$ pulsar-admin sinks restart options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +* Restart **all** sink connectors. 
+ + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/restart|operation/restartSink?version=@pulsar:version_number@} + +* Restart a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/restart|operation/restartSink?version=@pulsar:version_number@} + + + + +* Restart all Pulsar sink connectors. + + ```java + + void restartSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** sink connector. + + ```java + + void restartSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `instanceId` | Sink instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Delete a connector + +### `delete` + +You can delete a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Delete a source connector. + +````mdx-code-block + + + + +Use the `delete` subcommand. + +``` + +$ pulsar-admin sources delete options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/).
+ + + + +Delete a Pulsar source connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/deregisterSource?version=@pulsar:version_number@} + + + + +Delete a source connector. + +```java + +void deleteSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`source` | Source name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSource`](/api/admin/org/apache/pulsar/client/admin/Source.html#deleteSource-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +#### Sink + +Delete a sink connector. + +````mdx-code-block + + + + +Use the `delete` subcommand. + +``` + +$ pulsar-admin sinks delete options + +``` + +For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). + + + + +Delete a sink connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/deregisterSink?version=@pulsar:version_number@} + + + + +Delete a Pulsar sink connector.
+ +```java + +void deleteSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSink`](/api/admin/org/apache/pulsar/client/admin/Sink.html#deleteSink-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` diff --git a/site2/website/versioned_docs/version-2.10.x/performance-pulsar-perf.md b/site2/website/versioned_docs/version-2.10.x/performance-pulsar-perf.md new file mode 100644 index 0000000000000..4441d1470819f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/performance-pulsar-perf.md @@ -0,0 +1,283 @@ +--- +id: performance-pulsar-perf +title: Pulsar Perf +sidebar_label: "Pulsar Perf" +original_id: performance-pulsar-perf +--- + +The Pulsar Perf is a built-in performance test tool for Apache Pulsar. You can use the Pulsar Perf to test message writing or reading performance. For detailed information about performance tuning, see [here](https://streamnative.io/en/blog/tech/2021-01-14-pulsar-architecture-performance-tuning). + +## Produce messages + +:::tip + +For the latest and complete information about `pulsar-perf`, including commands, flags, descriptions, and more, see [`pulsar-perf`](/tools/pulsar-perf/) or [here](reference-cli-tools.md#pulsar-perf). + +::: + +- This example shows how the Pulsar Perf produces messages with **default** options. + + **Input** + + ``` + + bin/pulsar-perf produce my-topic + + ``` + + After the command is executed, the test data is continuously output on the Console.
+ + **Output** + + ``` + + 19:53:31.459 [pulsar-perf-producer-exec-1-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Created 1 producers + 19:53:31.482 [pulsar-timer-5-1] WARN com.scurrilous.circe.checksum.Crc32cIntChecksum - Failed to load Circe JNI library. Falling back to Java based CRC32c provider + 19:53:40.861 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 93.7 msg/s --- 0.7 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.575 ms - med: 3.460 - 95pct: 4.790 - 99pct: 5.308 - 99.9pct: 5.834 - 99.99pct: 6.609 - Max: 6.609 + 19:53:50.909 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.437 ms - med: 3.328 - 95pct: 4.656 - 99pct: 5.071 - 99.9pct: 5.519 - 99.99pct: 5.588 - Max: 5.588 + 19:54:00.926 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.376 ms - med: 3.276 - 95pct: 4.520 - 99pct: 4.939 - 99.9pct: 5.440 - 99.99pct: 5.490 - Max: 5.490 + 19:54:10.940 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.298 ms - med: 3.220 - 95pct: 4.474 - 99pct: 4.926 - 99.9pct: 5.645 - 99.99pct: 5.654 - Max: 5.654 + 19:54:20.956 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.1 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.308 ms - med: 3.199 - 95pct: 4.532 - 99pct: 4.871 - 99.9pct: 5.291 - 99.99pct: 5.323 - Max: 5.323 + 19:54:30.972 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.249 ms - med: 3.144 - 95pct: 4.437 - 99pct: 4.970 - 99.9pct: 5.329 - 99.99pct: 5.414 - Max: 5.414 + 19:54:40.987 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - 
Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.435 ms - med: 3.361 - 95pct: 4.772 - 99pct: 5.150 - 99.9pct: 5.373 - 99.99pct: 5.837 - Max: 5.837 + ^C19:54:44.325 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Aggregated throughput stats --- 7286 records sent --- 99.140 msg/s --- 0.775 Mbit/s + 19:54:44.336 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Aggregated latency stats --- Latency: mean: 3.383 ms - med: 3.293 - 95pct: 4.610 - 99pct: 5.059 - 99.9pct: 5.588 - 99.99pct: 5.837 - 99.999pct: 6.609 - Max: 6.609 + + ``` + + From the above test data, you can get the throughput statistics and the write latency statistics. The aggregated statistics are printed when the Pulsar Perf is stopped. You can press **Ctrl**+**C** to stop the Pulsar Perf. If you specify a filename with the `--histogram-file` parameter, a file with the [HdrHistogram](http://hdrhistogram.github.io/HdrHistogram/) formatted test result appears under your directory after Pulsar Perf is stopped. You can also check the test result through [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html). For details about how to check the test result through [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html), see [HdrHistogram Plotter](#hdrhistogram-plotter). + +- This example shows how the Pulsar Perf produces messages with `transaction` option. 
+ + **Input** + + ```shell + + bin/pulsar-perf produce my-topic -r 10 -m 100 -txn + + ``` + + **Output** + + ```shell + + 2021-10-11T13:36:15,595+0800 INFO [Thread-3] o.a.p.t.PerformanceProducer@499 - --- Transaction : 2 transaction end successfully ---0 transaction end failed --- 0.200 Txn/s + + 2021-10-11T13:36:15,614+0800 INFO [Thread-3] o.a.p.t.PerformanceProducer@503 - Throughput produced: 100 msg --- 0.0 msg/s --- 0.1 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.067 ms - med: 3.104 - 95pct: 3.747 - 99pct: 4.619 - 99.9pct: 6.760 - 99.99pct: 6.760 - Max: 6.760 + + 2021-10-11T13:36:15,710+0800 INFO [pulsar-perf-producer-exec-46-1] o.a.p.t.PerformanceProducer@834 - Aggregated latency stats --- Latency: mean: 3.067 ms - med: 3.104 - 95pct: 3.747 - 99pct: 4.619 - 99.9pct: 6.760 - 99.99pct: 6.760 - 99.999pct: 6.760 - Max: 6.760 + + 2021-10-11T13:36:29,976+0800 INFO [Thread-4] o.a.p.t.PerformanceProducer@815 - --- Transaction : 2 transaction end successfully --- 0 transaction end failed --- 2 transaction open successfully --- 0 transaction open failed --- 12.237 Txn/s + + 2021-10-11T13:36:29,976+0800 INFO [Thread-4] o.a.p.t.PerformanceProducer@824 - Aggregated throughput stats --- 102 records sent --- 4.168 msg/s --- 0.033 Mbit/s + + ``` + +## Consume messages + +:::tip + +For the latest and complete information about `pulsar-perf`, including commands, flags, descriptions, and more, see [`pulsar-perf`](/tools/pulsar-perf/) or [here](reference-cli-tools.md#pulsar-perf). + +::: + +- This example shows how the Pulsar Perf consumes messages with **default** options. + + **Input** + + :::note + + If you have not created a topic (in this example, it is _my-topic_) before, the broker creates a new topic without partitions and messages, then the consumer can not receive any messages. Consequently, before using `pulsar-perf consume`, make sure your topic has enough messages to consume. 
+ + ::: + + ``` + + bin/pulsar-perf consume my-topic + + ``` + + After the command is executed, the test data is continuously output on the Console. + + **Output** + + ``` + + 20:35:37.071 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Start receiving from 1 consumers on 1 topics + 20:35:41.150 [pulsar-client-io-1-9] WARN com.scurrilous.circe.checksum.Crc32cIntChecksum - Failed to load Circe JNI library. Falling back to Java based CRC32c provider + 20:35:47.092 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 59.572 msg/s -- 0.465 Mbit/s --- Latency: mean: 11.298 ms - med: 10 - 95pct: 15 - 99pct: 98 - 99.9pct: 137 - 99.99pct: 152 - Max: 152 + 20:35:57.104 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.958 msg/s -- 0.781 Mbit/s --- Latency: mean: 9.176 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 18 - Max: 18 + 20:36:07.115 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 100.006 msg/s -- 0.781 Mbit/s --- Latency: mean: 9.316 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 + 20:36:17.125 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 100.085 msg/s -- 0.782 Mbit/s --- Latency: mean: 9.327 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 + 20:36:27.136 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.900 msg/s -- 0.780 Mbit/s --- Latency: mean: 9.404 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 + 20:36:37.147 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.985 msg/s -- 0.781 Mbit/s --- Latency: mean: 8.998 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 + ^C20:36:42.755 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceConsumer - Aggregated throughput stats --- 6051 records received --- 92.125 msg/s 
--- 0.720 Mbit/s + 20:36:42.759 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceConsumer - Aggregated latency stats --- Latency: mean: 9.422 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 98 - 99.99pct: 137 - 99.999pct: 152 - Max: 152 + + ``` + + From the output test data, you can get the throughput statistics and the end-to-end latency statistics. The aggregated statistics are printed after the Pulsar Perf is stopped. You can press **Ctrl**+**C** to stop the Pulsar Perf. + +- This example shows how the Pulsar Perf consumes messages with the `transaction` option. + + **Input** + + ```shell + + bin/pulsar-perf consume my-topic -r 10 -txn -ss mysubName -st Exclusive -sp Earliest -ntxn 10 + + ``` + + :::note + + If you have not created a topic (in this example, it is _my-topic_) before, the broker creates a new topic without partitions and messages, then the consumer cannot receive any messages. Consequently, before using `pulsar-perf consume`, make sure your topic has enough messages to consume.
+ + ::: + + + **Output** + + ```shell + + 2021-10-11T13:43:36,052+0800 INFO [Thread-3] o.a.p.t.PerformanceConsumer@538 - --- Transaction: 6 transaction end successfully --- 0 transaction end failed --- 0.199 Txn/s --- AckRate: 9.952 msg/s + + 2021-10-11T13:43:36,065+0800 INFO [Thread-3] o.a.p.t.PerformanceConsumer@545 - Throughput received: 306 msg --- 9.952 msg/s -- 0.000 Mbit/s --- Latency: mean: 26177.380 ms - med: 26128 - 95pct: 30531 - 99pct: 30923 - 99.9pct: 31021 - 99.99pct: 31021 - Max: 31021 + + 2021-10-11T13:43:59,854+0800 INFO [Thread-5] o.a.p.t.PerformanceConsumer@579 - -- Transaction: 10 transaction end successfully --- 0 transaction end failed --- 10 transaction open successfully --- 0 transaction open failed --- 0.185 Txn/s + + 2021-10-11T13:43:59,854+0800 INFO [Thread-5] o.a.p.t.PerformanceConsumer@588 - Aggregated throughput stats --- 505 records received --- 9.345 msg/s --- 0.000 Mbit/s--- AckRate: 9.27065308842743 msg/s --- ack failed 4 msg + + 2021-10-11T13:43:59,882+0800 INFO [Thread-5] o.a.p.t.PerformanceConsumer@601 - Aggregated latency stats --- Latency: mean: 50593.000 ms - med: 50593 - 95pct: 50593 - 99pct: 50593 - 99.9pct: 50593 - 99.99pct: 50593 - 99.999pct: 50593 - Max: 50593 + + ``` + +## Transactions + +This section shows how Pulsar Perf runs transactions. For more information, see [Pulsar transactions](txn-why.md). + +### Use transaction + +This example executes 50 transactions. Each transaction sends and receives 1 message (default). + +**Input** + +```shell + +bin/pulsar-perf transaction --topics-c myConsumerTopic --topics-p MyproduceTopic -threads 1 -ntxn 50 -ss testSub -nmp 1 -nmc 1 + +``` + +:::note + +If you have not created a topic (in this example, it is _myConsumerTopic_) before, the broker creates a new topic without partitions and messages, then the consumer can not receive any messages. Consequently, before using `pulsar-perf transaction`, make sure your topic has enough messages to consume. 
+ +::: + +**Output** + +```shell + +2021-10-11T14:37:27,863+0800 INFO [Thread-5] o.a.p.t.PerformanceProducer@613 - Messages ack aggregated latency stats --- Latency: mean: 29.239 ms - med: 26.799 - 95pct: 46.696 - 99pct: 55.660 - 99.9pct: 55.660 - 99.99pct: 55.660 - 99.999pct: 55.660 - Max: 55.660 {} + +2021-10-11T14:37:19,391+0800 INFO [Thread-4] o.a.p.t.PerformanceProducer@525 - Throughput transaction: 50 transaction executes --- 4.999 transaction/s ---send Latency: mean: 31.368 ms - med: 28.369 - 95pct: 55.631 - 99pct: 57.764 - 99.9pct: 57.764 - 99.99pct: 57.764 - Max: 57.764---ack Latency: mean: 29.239 ms - med: 26.799 - 95pct: 46.696 - 99pct: 55.660 - 99.9pct: 55.660 - 99.99pct: 55.660 - Max: 55.660 {} + +2021-10-11T14:37:26,625+0800 INFO [Thread-5] o.a.p.t.PerformanceProducer@571 - Aggregated throughput stats --- 50 transaction executed --- 2.718 transaction/s --- 50 transaction open successfully --- 0 transaction open failed --- 50 transaction end successfully --- 0 transaction end failed--- 0 message ack failed --- 0 message send failed--- 50 message ack success --- 50 message send success {} + +``` + +### Disable Transaction + +This example disables transactions. + +**Input** + +```shell + +bin/pulsar-perf transaction --topics-c myConsumerTopic --topics-p myproduceTopic -threads 1 -ntxn 50 -ss testSub --txn-disEnable + +``` + +:::note + +If you have not created a topic (in this example, it is _myConsumerTopic_) before, the broker creates a new topic without partitions and messages, then the consumer can not receive any messages. Consequently, before using `pulsar-perf transaction --txn-disEnable`, make sure your topic has enough messages to consume. 
+ +::: + +**Output** + +```shell + +2021-10-11T16:48:26,876+0800 INFO [Thread-4] o.a.p.t.PerformanceProducer@529 - Throughput task: 50 task executes --- 4.999 task/s ---send Latency: mean: 10.002 ms - med: 9.875 - 95pct: 11.733 - 99pct: 15.995 - 99.9pct: 15.995 - 99.99pct: 15.995 - Max: 15.995---ack Latency: mean: 0.051 ms - med: 0.020 - 95pct: 0.059 - 99pct: 1.377 - 99.9pct: 1.377 - 99.99pct: 1.377 - Max: 1.377 + +2021-10-11T16:48:29,222+0800 INFO [Thread-5] o.a.p.t.PerformanceProducer@617 - Messages ack aggregated latency stats --- Latency: mean: 0.051 ms - med: 0.020 - 95pct: 0.059 - 99pct: 1.377 - 99.9pct: 1.377 - 99.99pct: 1.377 - 99.999pct: 1.377 - Max: 1.377 + +2021-10-11T16:48:29,246+0800 INFO [Thread-5] o.a.p.t.PerformanceProducer@629 - Messages send aggregated latency stats --- Latency: mean: 10.002 ms - med: 9.875 - 95pct: 11.733 - 99pct: 15.995 - 99.9pct: 15.995 - 99.99pct: 15.995 - 99.999pct: 15.995 - Max: 15.995 + +2021-10-11T16:48:29,117+0800 INFO [Thread-5] o.a.p.t.PerformanceProducer@602 - Aggregated throughput stats --- 50 task executed --- 4.025 task/s --- 0 message ack failed --- 0 message send failed--- 50 message ack success --- 50 message send success + +``` + +## Configurations + +By default, the Pulsar Perf uses `conf/client.conf` as the default configuration and uses `conf/log4j2.yaml` as the default Log4j configuration. If you want to connect to other Pulsar clusters, you can update the `brokerServiceUrl` in the client configuration. + +You can use the following commands to change the configuration file and the Log4j configuration file. 
+ +``` + +export PULSAR_CLIENT_CONF= +export PULSAR_LOG_CONF= + +``` + +In addition, you can use the following command to configure the JVM configuration through environment variables: + +``` + +export PULSAR_EXTRA_OPTS='-Xms4g -Xmx4g -XX:MaxDirectMemorySize=4g' + +``` + +## HdrHistogram Plotter + +The [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html) is a visualization tool for checking Pulsar Perf test results, which makes it easier to observe the test results. + +To check test results through the HdrHistogram Plotter, follow these steps: + +1. Clone the HdrHistogram repository from GitHub to your local machine. + + ``` + + git clone https://github.com/HdrHistogram/HdrHistogram.git + + ``` + +2. Switch to the HdrHistogram folder. + + ``` + + cd HdrHistogram + + ``` + +3. Install the HdrHistogram Plotter. + + ``` + + mvn clean install -DskipTests + + ``` + +4. Transform the file generated by the Pulsar Perf. + + ``` + + ./HistogramLogProcessor -i -o + + ``` + +5. You will get two output files. Upload the output file with the filename extension of .hgrm to the [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html). + +6. Check the test result through the Graphical User Interface of the HdrHistogram Plotter, as shown below. + + ![](/assets/perf-produce.png) diff --git a/site2/website/versioned_docs/version-2.10.x/reference-cli-tools.md b/site2/website/versioned_docs/version-2.10.x/reference-cli-tools.md new file mode 100644 index 0000000000000..1e426501e23a3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/reference-cli-tools.md @@ -0,0 +1,1039 @@ +--- +id: reference-cli-tools +title: Pulsar command-line tools +sidebar_label: "Pulsar CLI tools" +original_id: reference-cli-tools +--- + +Pulsar offers several command-line tools that you can use for managing Pulsar installations, performance testing, using command-line producers and consumers, and more.
+ +All Pulsar command-line tools can be run from the `bin` directory of your [installed Pulsar package](getting-started-standalone.md). The following tools are currently documented: + +* [`pulsar`](#pulsar) +* [`pulsar-client`](#pulsar-client) +* [`pulsar-daemon`](#pulsar-daemon) +* [`pulsar-perf`](#pulsar-perf) +* [`bookkeeper`](#bookkeeper) +* [`broker-tool`](#broker-tool) + +> **Important** +> +> - This page only shows **some frequently used commands**. For the latest information about `pulsar`, `pulsar-client`, and `pulsar-perf`, including commands, flags, descriptions, and more information, see [Pulsar tools](/tools/). +> +> - You can get help for any CLI tool, command, or subcommand using the `--help` flag, or `-h` for short. Here's an example: +> + +> ```shell +> +> $ bin/pulsar broker --help +> +> +> ``` + + +## `pulsar` + +The `pulsar` tool is used to start Pulsar components, such as bookies and ZooKeeper, in the foreground. + +These processes can also be started in the background with nohup by using the `pulsar-daemon` tool, which has the same command interface as `pulsar`. + +Usage: + +```bash + +$ pulsar command + +``` + +Commands: +* `bookie` +* `broker` +* `compact-topic` +* `configuration-store` +* `initialize-cluster-metadata` +* `proxy` +* `standalone` +* `websocket` +* `zookeeper` +* `zookeeper-shell` +* `autorecovery` + +Example: + +```bash + +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar broker + +``` + +The table below lists the environment variables that you can use to configure the `pulsar` tool.
+ +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|`conf/log4j2.yaml`| +|`PULSAR_BROKER_CONF`|Configuration file for broker|`conf/broker.conf`| +|`PULSAR_BOOKKEEPER_CONF`|Configuration file for bookie|`conf/bookkeeper.conf`| +|`PULSAR_ZK_CONF`|Configuration file for zookeeper|`conf/zookeeper.conf`| +|`PULSAR_CONFIGURATION_STORE_CONF`|Configuration file for the configuration store|`conf/global_zookeeper.conf`| +|`PULSAR_WEBSOCKET_CONF`|Configuration file for websocket proxy|`conf/websocket.conf`| +|`PULSAR_STANDALONE_CONF`|Configuration file for standalone|`conf/standalone.conf`| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the JVM|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| +|`PULSAR_PID_DIR`|Folder where the pulsar server PID file should be stored|| +|`PULSAR_STOP_TIMEOUT`|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| +|`PULSAR_GC_LOG`|GC options to be passed to the JVM|| + + +### `bookie` + +Starts up a bookie server + +Usage: + +```bash + +$ pulsar bookie options + +``` + +Options + +|Option|Description|Default| +|---|---|---| +|`-readOnly`|Force start a read-only bookie server|false| +|`-withAutoRecovery`|Start auto-recover service bookie server|false| + + +Example + +```bash + +$ PULSAR_BOOKKEEPER_CONF=/path/to/bookkeeper.conf pulsar bookie \ + -readOnly \ + -withAutoRecovery + +``` + +### `broker` + +Starts up a Pulsar broker + +Usage + +```bash + +$ pulsar broker options + +``` + +Options + +|Option|Description|Default| +|---|---|---| +|`-bc` , `--bookie-conf`|Configuration file for BookKeeper|| +|`-rb` , `--run-bookie`|Run a BookKeeper bookie on the same host as the Pulsar broker|false| +|`-ra` , `--run-bookie-autorecovery`|Run a BookKeeper autorecovery daemon on the same host as the Pulsar broker|false| + +Example + +```bash + +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar broker + +``` + +### `compact-topic`
+ +Run compaction against a Pulsar topic (in a new process) + +Usage + +```bash + +$ pulsar compact-topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-t` , `--topic`|The Pulsar topic that you would like to compact|| + +Example + +```bash + +$ pulsar compact-topic --topic topic-to-compact + +``` + +### `configuration-store` + +Starts up the Pulsar configuration store + +Usage + +```bash + +$ pulsar configuration-store + +``` + +Example + +```bash + +$ PULSAR_CONFIGURATION_STORE_CONF=/path/to/configuration_store.conf pulsar configuration-store + +``` + +### `initialize-cluster-metadata` + +One-time cluster metadata initialization + +Usage + +```bash + +$ pulsar initialize-cluster-metadata options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-ub` , `--broker-service-url`|The broker service URL for the new cluster|| +|`-tb` , `--broker-service-url-tls`|The broker service URL for the new cluster with TLS encryption|| +|`-c` , `--cluster`|Cluster name|| +|`-cms` , `--configuration-metadata-store`|The configuration metadata store quorum connection string|| +|`--existing-bk-metadata-service-uri`|The metadata service URI of the existing BookKeeper cluster that you want to use|| +|`-h` , `--help`|Help message|false| +|`--initial-num-stream-storage-containers`|The number of storage containers of BookKeeper stream storage|16| +|`--initial-num-transaction-coordinators`|The number of transaction coordinators assigned in a cluster|16| +|`-uw` , `--web-service-url`|The web service URL for the new cluster|| +|`-tw` , `--web-service-url-tls`|The web service URL for the new cluster with TLS encryption|| +|`-md` , `--metadata-store`|The metadata store service url|| +|`--zookeeper-session-timeout-ms`|The local ZooKeeper session timeout. 
The time unit is in millisecond(ms)|30000| + + +### `proxy` + +Manages the Pulsar proxy + +Usage + +```bash + +$ pulsar proxy options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-cms`, `--configuration-metadata-store`|Configuration metadata store connection string|| +|`-md` , `--metadata-store`|Metadata Store service url|| + +Example + +```bash + +$ PULSAR_PROXY_CONF=/path/to/proxy.conf pulsar proxy \ + --metadata-store zk:my-zk-1:2181,my-zk-2:2181,my-zk-3:2181 \ + --configuration-metadata-store zk:my-zk-1:2181,my-zk-2:2181,my-zk-3:2181 + +``` + +### `standalone` + +Run a broker service with local bookies and local ZooKeeper + +Usage + +```bash + +$ pulsar standalone options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-a` , `--advertised-address`|The standalone broker advertised address|| +|`--bookkeeper-dir`|Local bookies’ base data directory|data/standalone/bookkeeper| +|`--bookkeeper-port`|Local bookies’ base port|3181| +|`--no-broker`|Only start ZooKeeper and BookKeeper services, not the broker|false| +|`--num-bookies`|The number of local bookies|1| +|`--only-broker`|Only start the Pulsar broker service (not ZooKeeper or BookKeeper)|| +|`--wipe-data`|Clean up previous ZooKeeper/BookKeeper data|| +|`--zookeeper-dir`|Local ZooKeeper’s data directory|data/standalone/zookeeper| +|`--zookeeper-port` |Local ZooKeeper’s port|2181| + +Example + +```bash + +$ PULSAR_STANDALONE_CONF=/path/to/standalone.conf pulsar standalone + +``` + +### `websocket` + +Usage + +```bash + +$ pulsar websocket + +``` + +Example + +```bash + +$ PULSAR_WEBSOCKET_CONF=/path/to/websocket.conf pulsar websocket + +``` + +### `zookeeper` + +Starts up a ZooKeeper cluster + +Usage + +```bash + +$ pulsar zookeeper + +``` + +Example + +```bash + +$ PULSAR_ZK_CONF=/path/to/zookeeper.conf pulsar zookeeper + +``` + +### `zookeeper-shell` + +Connects to a running ZooKeeper cluster using the ZooKeeper shell + +Usage + +```bash + +$ pulsar zookeeper-shell options 
+ +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration file for ZooKeeper|| +|`-server`|Configuration zk address, eg: `127.0.0.1:2181`|| + +### `autorecovery` + +Runs an auto-recovery service. + +Usage + +```bash + +$ pulsar autorecovery options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the autorecovery|N/A| + + +## `pulsar-client` + +The pulsar-client tool + +Usage + +```bash + +$ pulsar-client command + +``` + +Commands +* `produce` +* `consume` + + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{\"key1\":\"val1\",\"key2\":\"val2\"}"|{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}| +|`--auth-plugin`|Authentication plugin class name|org.apache.pulsar.client.impl.auth.AuthenticationSasl| +|`--listener-name`|Listener name for the broker|| +|`--proxy-protocol`|Proxy protocol to select type of routing at proxy|| +|`--proxy-url`|Proxy-server URL to which to connect|| +|`--url`|Broker URL to which to connect|pulsar://localhost:6650/
    ws://localhost:8080 | +| `-v`, `--version` | Get the version of the Pulsar client +|`-h`, `--help`|Show this help + + +### `produce` +Send a message or messages to a specific broker and topic + +Usage + +```bash + +$ pulsar-client produce topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-f`, `--files`|Comma-separated file paths to send; either -m or -f must be specified|[]| +|`-m`, `--messages`|Comma-separated string of messages to send; either -m or -f must be specified|[]| +|`-n`, `--num-produce`|The number of times to send the message(s); the count of messages/files * num-produce should be below 1000|1| +|`-r`, `--rate`|Rate (in messages per second) at which to produce; a value 0 means to produce messages as fast as possible|0.0| +|`-db`, `--disable-batching`|Disable batch sending of messages|false| +|`-c`, `--chunking`|Split the message and publish in chunks if the message size is larger than the allowed max size|false| +|`-s`, `--separator`|Character to split messages string with.|","| +|`-k`, `--key`|Message key to add|key=value string, like k1=v1,k2=v2.| +|`-p`, `--properties`|Properties to add. If you want to add multiple properties, use the comma as the separator, e.g. `k1=v1,k2=v2`.| | +|`-ekn`, `--encryption-key-name`|The public key name to encrypt payload.| | +|`-ekv`, `--encryption-key-value`|The URI of public key to encrypt payload. 
For example, `file:///path/to/public.key` or `data:application/x-pem-file;base64,*****`.| | + + +### `consume` +Consume messages from a specific broker and topic + +Usage + +```bash + +$ pulsar-client consume topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--hex`|Display binary messages in hexadecimal format.|false| +|`-n`, `--num-messages`|Number of messages to consume, 0 means to consume forever.|1| +|`-r`, `--rate`|Rate (in messages per second) at which to consume; a value 0 means to consume messages as fast as possible|0.0| +|`--regex`|Indicate the topic name is a regex pattern|false| +|`-s`, `--subscription-name`|Subscription name|| +|`-t`, `--subscription-type`|The type of the subscription. Possible values: Exclusive, Shared, Failover, Key_Shared.|Exclusive| +|`-p`, `--subscription-position`|The position of the subscription. Possible values: Latest, Earliest.|Latest| +|`-m`, `--subscription-mode`|Subscription mode. Possible values: Durable, NonDurable.|Durable| +|`-q`, `--queue-size`|The size of consumer's receiver queue.|0| +|`-mc`, `--max_chunked_msg`|Max pending chunk messages.|0| +|`-ac`, `--auto_ack_chunk_q_full`|Auto ack for the oldest message in consumer's receiver queue if the queue full.|false| +|`--hide-content`|Do not print the message to the console.|false| +|`-st`, `--schema-type`|Set the schema type. Use `auto_consume` to dump AVRO and other structured data types. Possible values: bytes, auto_consume.|bytes| +|`-ekv`, `--encryption-key-value`|The URI of public key to encrypt payload. For example, `file:///path/to/public.key` or `data:application/x-pem-file;base64,*****`.| | +|`-pm`, `--pool-messages`|Use the pooled message.|true| + +## `pulsar-daemon` +A wrapper around the pulsar tool that’s used to start and stop processes, such as ZooKeeper, bookies, and Pulsar brokers, in the background using nohup. 
+ +pulsar-daemon has a similar interface to the pulsar command but adds start and stop commands for various services. For a listing of those services, run pulsar-daemon to see the help output or see the documentation for the pulsar command. + +Usage + +```bash + +$ pulsar-daemon command + +``` + +Commands +* `start` +* `stop` +* `restart` + + +### `start` +Start a service in the background using nohup. + +Usage + +```bash + +$ pulsar-daemon start service + +``` + +### `stop` +Stop a service that’s already been started using start. + +Usage + +```bash + +$ pulsar-daemon stop service options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|-force|Stop the service forcefully if not stopped by normal shutdown.|false| + +### `restart` +Restart a service that has already been started. + +```bash + +$ pulsar-daemon restart service + +``` + +## `pulsar-perf` +A tool for performance testing a Pulsar broker. + +Usage + +```bash + +$ pulsar-perf command + +``` + +Commands +* `consume` +* `produce` +* `read` +* `websocket-producer` +* `managed-ledger` +* `monitor-brokers` +* `simulation-client` +* `simulation-controller` +* `transaction` +* `help` + +Environment variables + +The table below lists the environment variables that you can use to configure the pulsar-perf tool. + +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|conf/log4j2.yaml| +|`PULSAR_CLIENT_CONF`|Configuration file for the client|conf/client.conf| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the JVM|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| +|`PULSAR_GC_LOG`|Gc options to be passed to the jvm|| + + +### `consume` +Run a consumer + +Usage + +``` + +$ pulsar-perf consume options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. 
For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`-ac`, `--auto_ack_chunk_q_full`|Auto ack for the oldest message in consumer's receiver queue if the queue full|false| +|`--listener-name`|Listener name for the broker|| +|`--acks-delay-millis`|Acknowledgements grouping delay in millis|100| +|`--batch-index-ack`|Enable or disable the batch index acknowledgment|false| +|`-bw`, `--busy-wait`|Enable or disable Busy-Wait on the Pulsar client|false| +|`-v`, `--encryption-key-value-file`|The file which contains the private key to decrypt payload|| +|`-h`, `--help`|Help message|false| +|`-cf`, `--conf-file`|Configuration file|| +|`-m`, `--num-messages`|Number of messages to consume in total. If the value is equal to or smaller than 0, it keeps consuming messages.|0| +|`-e`, `--expire_time_incomplete_chunked_messages`|The expiration time for incomplete chunk messages (in milliseconds)|0| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-mc`, `--max_chunked_msg`|Max pending chunk messages|0| +|`-n`, `--num-consumers`|Number of consumers (per topic)|1| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-lt`, `--num-listener-threads`|Set the number of threads to be used for message listeners|1| +|`-ns`, `--num-subscriptions`|Number of subscriptions (per topic)|1| +|`-t`, `--num-topics`|The number of topics|1| +|`-pm`, `--pool-messages`|Use the pooled message|true| +|`-r`, `--rate`|Simulate a slow message consumer (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-p`, `--receiver-queue-size-across-partitions`|Max total size of the receiver queue across partitions|50000| +|`--replicated`|Whether the subscription status should be replicated|false| +|`-u`, `--service-url`|Pulsar service URL|| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. 
If 0, statistics will be disabled|0| +|`-s`, `--subscriber-name`|Subscriber name prefix|| +|`-ss`, `--subscriptions`|A list of subscriptions to consume on (e.g. sub1,sub2)|sub| +|`-st`, `--subscription-type`|Subscriber type. Possible values are Exclusive, Shared, Failover, Key_Shared.|Exclusive| +|`-sp`, `--subscription-position`|Subscriber position. Possible values are Latest, Earliest.|Latest| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps consuming messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + +Below are **transaction** related options. + +If you want `--txn-timeout`, `--numMessage-perTransaction`, `-nmt`, `-ntxn`, or `-abort` take effect, set `--txn-enable` to true. + +|Flag|Description|Default| +|---|---|---| +`-tto`, `--txn-timeout`|Set the time of transaction timeout (in second). |10 +`-nmt`, `--numMessage-perTransaction`|The number of messages acknowledged by a transaction. |50 +`-txn`, `--txn-enable`|Enable or disable a transaction.|false +`-ntxn`|The number of opened transactions. 0 means the number of transactions is unlimited. |0 +`-abort`|Abort a transaction. |true + +### `produce` +Run a producer + +Usage + +```bash + +$ pulsar-perf produce options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-am`, `--access-mode`|Producer access mode. Valid values are `Shared`, `Exclusive` and `WaitForExclusive`|Shared| +|`-au`, `--admin-url`|Pulsar admin URL|| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. 
For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`--listener-name`|Listener name for the broker|| +|`-b`, `--batch-time-window`|Batch messages in a window of the specified number of milliseconds|1| +|`-bb`, `--batch-max-bytes`|Maximum number of bytes per batch|4194304| +|`-bm`, `--batch-max-messages`|Maximum number of messages per batch|1000| +|`-bw`, `--busy-wait`|Enable or disable Busy-Wait on the Pulsar client|false| +|`-ch`, `--chunking`|Split the message and publish in chunks if the message size is larger than allowed max size|false| +|`-d`, `--delay`|Mark messages with a given delay in seconds|0s| +|`-z`, `--compression`|Compress messages’ payload. Possible values are NONE, LZ4, ZLIB, ZSTD or SNAPPY.|| +|`-cf`, `--conf-file`|Configuration file|| +|`-k`, `--encryption-key-name`|The public key name to encrypt payload|| +|`-v`, `--encryption-key-value-file`|The file which contains the public key to encrypt payload|| +|`-ef`, `--exit-on-failure`|Exit from the process on publish failure|false| +|`-fc`, `--format-class`|Custom Formatter class name|org.apache.pulsar.testclient.DefaultMessageFormatter| +|`-fp`, `--format-payload`|Format %i as a message index in the stream from producer and/or %t as the timestamp nanoseconds|false| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-o`, `--max-outstanding`|Max number of outstanding messages|1000| +|`-p`, `--max-outstanding-across-partitions`|Max number of outstanding messages across partitions|50000| +|`-m`, `--num-messages`|Number of messages to publish in total. If this value is less than or equal to 0, it keeps publishing messages.|0| +|`-mk`, `--message-key-generation-mode`|The generation mode of message key. 
Valid options are `autoIncrement`, `random`|| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-n`, `--num-producers`|The number of producers (per topic)|1| +|`-threads`, `--num-test-threads`|Number of test threads|1| +|`-t`, `--num-topic`|The number of topics|1| +|`-np`, `--partitions`|Create partitioned topics with the given number of partitions. Setting this value to 0 means not trying to create a topic|| +|`-f`, `--payload-file`|Use payload from an UTF-8 encoded text file and a payload will be randomly selected when publishing messages|| +|`-e`, `--payload-delimiter`|The delimiter used to split lines when using payload from a file|\n| +|`-pn`, `--producer-name`|Producer Name|| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`--send-timeout`|Set the sendTimeout|0| +|`--separator`|Separator between the topic and topic number|-| +|`-u`, `--service-url`|Pulsar service URL|| +|`-s`, `--size`|Message size (in bytes)|1024| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps publishing messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--warmup-time`|Warm-up time in seconds|1| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + +Below are **transaction** related options. + +If you want `--txn-timeout`, `--numMessage-perTransaction`, or `-abort` take effect, set `--txn-enable` to true. + +|Flag|Description|Default| +|---|---|---| +`-tto`, `--txn-timeout`|Set the time of transaction timeout (in second). |5 +`-nmt`, `--numMessage-perTransaction`|The number of messages acknowledged by a transaction. |50 +`-txn`, `--txn-enable`|Enable or disable a transaction.|true +`-abort`|Abort a transaction. 
|true + +### `read` +Run a topic reader + +Usage + +```bash + +$ pulsar-perf read options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`--listener-name`|Listener name for the broker|| +|`-cf`, `--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-n`, `--num-messages`|Number of messages to consume in total. If the value is equal to or smaller than 0, it keeps consuming messages.|0| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-lt`, `--num-listener-threads`|Set the number of threads to be used for message listeners|1| +|`-t`, `--num-topics`|The number of topics|1| +|`-r`, `--rate`|Simulate a slow message reader (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-u`, `--service-url`|Pulsar service URL|| +|`-m`, `--start-message-id`|Start message id. This can be either 'earliest', 'latest' or a specific message id by using 'lid:eid'|earliest| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0| +|`-time`, `--test-duration`|Test duration (in seconds). 
If this value is less than or equal to 0, it keeps consuming messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--use-tls`|Use TLS encryption on the connection|false| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + +### `websocket-producer` +Run a websocket producer + +Usage + +```bash + +$ pulsar-perf websocket-producer options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`-cf`, `--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-m`, `--num-messages`|Number of messages to publish in total. If this value is less than or equal to 0, it keeps publishing messages.|0| +|`-t`, `--num-topic`|The number of topics|1| +|`-f`, `--payload-file`|Use payload from a file instead of empty buffer|| +|`-e`, `--payload-delimiter`|The delimiter used to split lines when using payload from a file|\n| +|`-fp`, `--format-payload`|Format %i as a message index in the stream from producer and/or %t as the timestamp nanoseconds|false| +|`-fc`, `--format-class`|Custom formatter class name|`org.apache.pulsar.testclient.DefaultMessageFormatter`| +|`-u`, `--proxy-url`|Pulsar Proxy URL, e.g., "ws://localhost:8080/"|| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps publishing messages.|0| + + +### `managed-ledger` +Write directly on managed-ledgers + +Usage + +```bash + +$ pulsar-perf managed-ledger options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-a`, `--ack-quorum`|Ledger ack quorum|1| +|`-dt`, `--digest-type`|BookKeeper digest type. 
Possible Values: [CRC32, MAC, CRC32C, DUMMY]|CRC32C| +|`-e`, `--ensemble-size`|Ledger ensemble size|1| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single bookie|1| +|`-o`, `--max-outstanding`|Max number of outstanding requests|1000| +|`-m`, `--num-messages`|Number of messages to publish in total. If this value is less than or equal to 0, it keeps publishing messages.|0| +|`-t`, `--num-topic`|Number of managed ledgers|1| +|`-r`, `--rate`|Write rate msg/s across managed ledgers|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps publishing messages.|0| +|`--threads`|Number of threads writing|1| +|`-w`, `--write-quorum`|Ledger write quorum|1| +|`-md`, `--metadata-store`|Metadata store service URL. For example: zk:my-zk:2181|| + + +### `monitor-brokers` +Continuously receive broker data and/or load reports + +Usage + +```bash + +$ pulsar-perf monitor-brokers options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--connect-string`|A connection string for one or more ZooKeeper servers|| +|`-h`, `--help`|Help message|false| + + +### `simulation-client` +Run a simulation server acting as a Pulsar client. Uses the client configuration specified in `conf/client.conf`. 
+ +Usage + +```bash + +$ pulsar-perf simulation-client options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--port`|Port to listen on for controller|0| +|`--service-url`|Pulsar Service URL|| +|`-h`, `--help`|Help message|false| + +### `simulation-controller` +Run a simulation controller to give commands to servers + +Usage + +```bash + +$ pulsar-perf simulation-controller options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--client-port`|The port that the clients are listening on|0| +|`--clients`|Comma-separated list of client hostnames|| +|`--cluster`|The cluster to test on|| +|`-h`, `--help`|Help message|false| + +### `transaction` + +Run a transaction. For more information, see [Pulsar transactions](txn-why.md). + +**Usage** + +```bash + +$ pulsar-perf transaction options + +``` + +**Options** + +|Flag|Description|Default| +|---|---|---| +`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|N/A +`--auth-plugin`|Authentication plugin class name.|N/A +`-au`, `--admin-url`|Pulsar admin URL.|N/A +`-cf`, `--conf-file`|Configuration file.|N/A +`-h`, `--help`|Help messages.|N/A +`-c`, `--max-connections`|Maximum number of TCP connections to a single broker.|100 +`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers. |1 +`-ns`, `--num-subscriptions`|Number of subscriptions per topic.|1 +`-threads`, `--num-test-threads`|Number of test threads.

    This thread is for a new transaction to ack messages from consumer topics, produce messages to producer topics, and commit or abort this transaction.

    Increasing the number of threads increases the parallelism of the performance test and, consequently, the intensity of the stress test.|1 +`-nmc`, `--numMessage-perTransaction-consume`|Set the number of messages consumed in a transaction.

    If transactions are disabled, this is the number of messages consumed in a task instead of in a transaction.|1 +`-nmp`, `--numMessage-perTransaction-produce`|Set the number of messages produced in a transaction.

    If transactions are disabled, this is the number of messages produced in a task instead of in a transaction.|1 +`-ntxn`, `--number-txn`|Set the number of transactions.

    0 means the number of transactions is unlimited.

    If transactions are disabled, this is the number of tasks instead of transactions. |0 +`-np`, `--partitions`|Create partitioned topics with a given number of partitions.

    0 means not trying to create a topic. +`-q`, `--receiver-queue-size`|Size of the receiver queue.|1000 +`-u`, `--service-url`|Pulsar service URL.|N/A +`-sp`, `--subscription-position`|Subscription position.|Earliest +`-st`, `--subscription-type`|Subscription type.|Shared +`-ss`, `--subscriptions`|A list of subscriptions to consume.

    For example, sub1,sub2.|[sub] +`-time`, `--test-duration`|Test duration (in seconds).

    0 means that messages keep being published.|0 +`--topics-c`|All topics assigned to consumers.|[test-consume] +`--topics-p`|All topics assigned to producers.|[test-produce] +`--txn-disEnable`|Disable transaction.|true +`-tto`, `--txn-timeout`|Set the time of transaction timeout (in seconds).

    If you want `--txn-timeout` to take effect, set `--txn-enable` to true.|5 +`-abort`|Abort the transaction.

    If you want `-abort` to take effect, set `--txn-disEnable` to false.|true +`-txnRate`|Set the rate of opened transactions or tasks.

    0 means no limit.|0 + +### `help` +This help message + +Usage + +```bash + +$ pulsar-perf help + +``` + +## `bookkeeper` +A tool for managing BookKeeper. + +Usage + +```bash + +$ bookkeeper command + +``` + +Commands +* `autorecovery` +* `bookie` +* `localbookie` +* `upgrade` +* `shell` + + +Environment variables + +The table below lists the environment variables that you can use to configure the bookkeeper tool. + +|Variable|Description|Default| +|---|---|---| +|BOOKIE_LOG_CONF|Log4j configuration file|conf/log4j2.yaml| +|BOOKIE_CONF|BookKeeper configuration file|conf/bk_server.conf| +|BOOKIE_EXTRA_OPTS|Extra options to be passed to the JVM|| +|BOOKIE_EXTRA_CLASSPATH|Extra paths for BookKeeper's classpath|| +|ENTRY_FORMATTER_CLASS|The Java class used to format entries|| +|BOOKIE_PID_DIR|Folder where the BookKeeper server PID file should be stored|| +|BOOKIE_STOP_TIMEOUT|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| +|BOOKIE_GC_LOG|Gc options to be passed to the jvm|| + + +### `autorecovery` +Runs an auto-recovery service + +Usage + +```bash + +$ bookkeeper autorecovery options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery|| + + +### `bookie` +Starts up a BookKeeper server (aka bookie) + +Usage + +```bash + +$ bookkeeper bookie options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery|| +|-readOnly|Force start a read-only bookie server|false| +|-withAutoRecovery|Start auto-recovery service bookie server|false| + + +### `localbookie` +Runs a test ensemble of N bookies locally + +Usage + +```bash + +$ bookkeeper localbookie N + +``` + +### `upgrade` +Upgrade the bookie’s filesystem + +Usage + +```bash + +$ bookkeeper upgrade options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery|| +|`-u`, 
`--upgrade`|Upgrade the bookie’s directories|| + + +### `shell` +Run shell for admin commands. To see a full listing of those commands, run bookkeeper shell without an argument. + +Usage + +```bash + +$ bookkeeper shell + +``` + +Example + +```bash + +$ bookkeeper shell bookiesanity + +``` + +## `broker-tool` + +The `broker-tool` is used for operations on a specific broker. + +Usage + +```bash + +$ broker-tool command + +``` + +Commands +* `load-report` +* `help` + +Example +There are two ways to get more information about a command: + +```bash + +$ broker-tool help command +$ broker-tool command --help + +``` + +### `load-report` + +Collect the load report of a specific broker. +The command is run on a broker, and used for troubleshooting why the broker can’t collect the correct load report. + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--interval`| Interval to collect load report, in milliseconds || +|`-h`, `--help`| Display help information || + diff --git a/site2/website/versioned_docs/version-2.10.x/reference-configuration.md b/site2/website/versioned_docs/version-2.10.x/reference-configuration.md new file mode 100644 index 0000000000000..142d08c73dcf3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/reference-configuration.md @@ -0,0 +1,893 @@ +--- +id: reference-configuration +title: Pulsar configuration +sidebar_label: "Pulsar configuration" +original_id: reference-configuration +--- + + + + +You can manage Pulsar configuration by configuration files in the [`conf`](https://github.com/apache/pulsar/tree/master/conf) directory of a Pulsar [installation](getting-started-standalone.md). 
+ +- [BookKeeper](#bookkeeper) +- [Broker](#broker) +- [Client](#client) +- [Log4j](#log4j) +- [Log4j shell](#log4j-shell) +- [Standalone](#standalone) +- [WebSocket](#websocket) +- [Pulsar proxy](#pulsar-proxy) +- [ZooKeeper](#zookeeper) + +## BookKeeper + +BookKeeper is a replicated log storage system that Pulsar uses for durable storage of all messages. + + +|Name|Description|Default| +|---|---|---| +|bookiePort|The port on which the bookie server listens.|3181| +|allowLoopback|Whether the bookie is allowed to use a loopback interface as its primary interface (that is the interface used to establish its identity). By default, loopback interfaces are not allowed to work as the primary interface. Using a loopback interface as the primary interface usually indicates a configuration error. For example, it’s fairly common in some VPS setups to not configure a hostname or to have the hostname resolve to `127.0.0.1`. If this is the case, then all bookies in the cluster will establish their identities as `127.0.0.1:3181` and only one will be able to join the cluster. For VPSs configured like this, you should explicitly set the listening interface.|false| +|listeningInterface|The network interface on which the bookie listens. By default, the bookie listens on all interfaces.|eth0| +|advertisedAddress|Configure a specific hostname or IP address that the bookie should use to advertise itself to clients. 
By default, the bookie advertises either its own IP address or hostname according to the `listeningInterface` and `useHostNameAsBookieID` settings.|N/A| +|allowMultipleDirsUnderSameDiskPartition|Configure the bookie to enable/disable multiple ledger/index/journal directories in the same filesystem disk partition.|false| +|minUsableSizeForIndexFileCreation|The minimum safe usable size available in index directory for bookie to create index files while replaying journal at the time of bookie starts in Readonly Mode (in bytes).|1073741824| +|journalDirectory|The directory where BookKeeper outputs its write-ahead log (WAL).|data/bookkeeper/journal| +|journalDirectories|Directories that BookKeeper outputs its write ahead log. Multiple directories are available, being separated by `,`. For example: `journalDirectories=/tmp/bk-journal1,/tmp/bk-journal2`. If `journalDirectories` is set, the bookies skip `journalDirectory` and use this setting directory.|/tmp/bk-journal| +|ledgerDirectories|The directory where BookKeeper outputs ledger snapshots. This could define multiple directories to store snapshots separated by `,`, for example `ledgerDirectories=/tmp/bk1-data,/tmp/bk2-data`. Ideally, ledger dirs and the journal dir are each in a different device, which reduces the contention between random I/O and sequential write. It is possible to run with a single disk, but performance will be significantly lower.|data/bookkeeper/ledgers| +|ledgerManagerType|The type of ledger manager used to manage how ledgers are stored, managed, and garbage collected. See [BookKeeper Internals](http://bookkeeper.apache.org/docs/latest/getting-started/concepts) for more info.|hierarchical| +|zkLedgersRootPath|The root ZooKeeper path used to store ledger metadata. 
This parameter is used by the ZooKeeper-based ledger manager as a root znode to store all ledgers.|/ledgers| +|ledgerStorageClass|Ledger storage implementation class|org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage| +|entryLogFilePreallocationEnabled|Enable or disable entry logger preallocation|true| +|logSizeLimit|Max file size of the entry logger, in bytes. A new entry log file will be created when the old one reaches the file size limitation.|2147483648| +|minorCompactionThreshold|Threshold of minor compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a minor compaction. If set to less than zero, the minor compaction is disabled.|0.2| +|minorCompactionInterval|Time interval to run minor compaction, in seconds. If set to less than zero, the minor compaction is disabled. Note: should be greater than gcWaitTime. |3600| +|majorCompactionThreshold|The threshold of major compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a major compaction. Those entry log files whose remaining size percentage is still higher than the threshold will never be compacted. If set to less than zero, the major compaction is disabled.|0.5| +|majorCompactionInterval|The time interval to run major compaction, in seconds. If set to less than zero, the major compaction is disabled. Note: should be greater than gcWaitTime. |86400| +|readOnlyModeEnabled|If `readOnlyModeEnabled=true`, then on all full ledger disks, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shut down.|true| +|forceReadOnlyBookie|Whether the bookie is force started in read only mode.|false| +|persistBookieStatusEnabled|Persist the bookie status locally on the disks. So the bookies can keep their status upon restarts.|false| +|compactionMaxOutstandingRequests|Sets the maximum number of entries that can be compacted without flushing. 
When compacting, the entries are written to the entrylog and the new offsets are cached in memory. Once the entrylog is flushed the index is updated with the new offsets. This parameter controls the number of entries added to the entrylog before a flush is forced. A higher value for this parameter means more memory will be used for offsets. Each offset consists of 3 longs. This parameter should not be modified unless you’re fully aware of the consequences.|100000| +|compactionRate|The rate at which compaction will read entries, in adds per second.|1000| +|isThrottleByBytes|Throttle compaction by bytes or by entries.|false| +|compactionRateByEntries|The rate at which compaction will read entries, in adds per second.|1000| +|compactionRateByBytes|Set the rate at which compaction reads entries. The unit is bytes added per second.|1000000| +|journalMaxSizeMB|Max file size of journal file, in megabytes. A new journal file will be created when the old one reaches the file size limitation.|2048| +|journalMaxBackups|The max number of old journal files to keep. 
Keeping a number of old journal files would help data recovery in special cases.|5| +|journalPreAllocSizeMB|How much space to pre-allocate at a time in the journal.|16| +|journalWriteBufferSizeKB|The size of the write buffers used for the journal.|64| +|journalRemoveFromPageCache|Whether pages should be removed from the page cache after force write.|true| +|journalAdaptiveGroupWrites|Whether to group journal force writes, which optimizes group commit for higher throughput.|true| +|journalMaxGroupWaitMSec|The maximum latency to impose on a journal write to achieve grouping.|1| +|journalAlignmentSize|All the journal writes and commits should be aligned to given size|4096| +|journalBufferedWritesThreshold|Maximum writes to buffer to achieve grouping|524288| +|journalFlushWhenQueueEmpty|If we should flush the journal when journal queue is empty|false| +|numJournalCallbackThreads|The number of threads that should handle journal callbacks|8| +|openLedgerRereplicationGracePeriod | The grace period, in milliseconds, that the replication worker waits before fencing and replicating a ledger fragment that's still being written to upon bookie failure. | 30000 | +|rereplicationEntryBatchSize|The number of max entries to keep in fragment for re-replication|100| +|autoRecoveryDaemonEnabled|Whether the bookie itself can start auto-recovery service.|true| +|lostBookieRecoveryDelay|How long to wait, in seconds, before starting auto recovery of a lost bookie.|0| +|gcWaitTime|The interval at which the next garbage collection is triggered, in milliseconds. Since garbage collection runs in the background, too frequent gc will hurt performance. It is better to use a longer gc interval if there is enough disk capacity.|900000| +|gcOverreplicatedLedgerWaitTime|How long the interval to trigger next garbage collection of overreplicated ledgers, in milliseconds. 
This should not be run very frequently since we read the metadata for all the ledgers on the bookie from zk.|86400000| +|flushInterval|How long the interval to flush ledger index pages to disk, in milliseconds. Flushing index files will introduce much random disk I/O. If the journal dir and ledger dirs are each on different devices, flushing would not affect performance. But if the journal dir and ledger dirs are on the same device, performance degrades significantly with too frequent flushing. You can consider increasing the flush interval to get better performance, but the bookie server then takes longer to restart after a failure.|60000| +|bookieDeathWatchInterval|Interval to watch whether bookie is dead or not, in milliseconds|1000| +|allowStorageExpansion|Allow the bookie storage to expand. Newly added ledger and index dirs must be empty.|false| +|zkServers|A list of one or more servers on which ZooKeeper is running. The server list can be comma separated values, for example: zkServers=zk1:2181,zk2:2181,zk3:2181.|localhost:2181| +|zkTimeout|ZooKeeper client session timeout in milliseconds. The bookie server exits if it receives SESSION_EXPIRED because it was partitioned off from ZooKeeper for longer than the session timeout. JVM garbage collection and disk I/O can cause SESSION_EXPIRED. Increasing this value could help avoid this issue.|30000| +|zkRetryBackoffStartMs|The start time that the Zookeeper client backoff retries in milliseconds.|1000| +|zkRetryBackoffMaxMs|The maximum time that the Zookeeper client backoff retries in milliseconds.|10000| +|zkEnableSecurity|Set ACLs on every node written on ZooKeeper, allowing users to read and write BookKeeper metadata stored on ZooKeeper. In order to make ACLs work you need to set up ZooKeeper JAAS authentication. All the bookies and Client need to share the same user, and this is usually done using Kerberos authentication. 
See ZooKeeper documentation.|false| +|httpServerEnabled|The flag enables/disables starting the admin http server.|false| +|httpServerPort|The HTTP server port to listen on. By default, the value is `8080`. If you want to keep it consistent with the Prometheus stats provider, you can set it to `8000`.|8080| +|httpServerClass|The http server class.|org.apache.bookkeeper.http.vertx.VertxHttpServer| +|serverTcpNoDelay|This setting is used to enable/disable Nagle’s algorithm, which is a means of improving the efficiency of TCP/IP networks by reducing the number of packets that need to be sent over the network. If you are sending many small messages, such that more than one can fit in a single IP packet, setting server.tcpnodelay to false to enable Nagle algorithm can provide better performance.|true| +|serverSockKeepalive|This setting is used to send keep-alive messages on connection-oriented sockets.|true| +|serverTcpLinger|The socket linger timeout on close. When enabled, a close or shutdown will not return until all queued messages for the socket have been successfully sent or the linger timeout has been reached. Otherwise, the call returns immediately and the closing is done in the background.|0| +|byteBufAllocatorSizeMax|The maximum buf size of the received ByteBuf allocator.|1048576| +|nettyMaxFrameSizeBytes|The maximum netty frame size in bytes. Any message received larger than this will be rejected.|5253120| +|openFileLimit|Max number of ledger index files that can be opened in the bookie server. If the number of ledger index files reaches this limit, the bookie server starts to swap some ledgers from memory to disk. Too frequent swap will affect performance. You can tune this number to gain performance according to your requirements.|0| +|pageSize|Size of an index page in ledger cache, in bytes. A larger index page can improve performance writing page to disk, which is efficient when you have small number of ledgers and these ledgers have similar number of entries. 
If you have a large number of ledgers and each ledger has fewer entries, a smaller index page would improve memory usage.|8192| +|pageLimit|How many index pages are provided in the ledger cache. If the number of index pages reaches this limit, the bookie server starts to swap some ledgers from memory to disk. You can increase this value when you find that swapping becomes more frequent. But make sure that pageLimit*pageSize is not more than the JVM max memory limit; otherwise you would get an OutOfMemoryException. In general, increasing pageLimit and using a smaller index page gives better performance when there is a larger number of ledgers with fewer entries. If pageLimit is -1, the bookie server uses 1/3 of JVM memory to compute the limit on the number of index pages.|0| +|readOnlyModeEnabled|If all ledger directories configured are full, then support only read requests for clients. If “readOnlyModeEnabled=true” then on all ledger disks full, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shut down. By default this will be disabled.|true| +|diskUsageThreshold|For each ledger dir, maximum disk space which can be used. Default is 0.95f. i.e. 95% of disk can be used at most after which nothing will be written to that partition. If all ledger dir partitions are full, then bookie will turn to readonly mode if ‘readOnlyModeEnabled=true’ is set, else it will shut down. Valid values should be in between 0 and 1 (exclusive).|0.95| +|diskCheckInterval|Disk check interval in milliseconds, interval to check the ledger dirs usage.|10000| +|auditorPeriodicCheckInterval|Interval at which the auditor will do a check of all ledgers in the cluster. By default this runs once a week. The interval is set in seconds. To disable the periodic check completely, set this to 0. 
Note that periodic checking will put extra load on the cluster, so it should not be run more frequently than once a day.|604800| +|sortedLedgerStorageEnabled|Whether sorted-ledger storage is enabled.|true| +|auditorPeriodicBookieCheckInterval|The interval between auditor bookie checks. The auditor bookie check checks ledger metadata to see which bookies should contain entries for each ledger. If a bookie which should contain entries is unavailable, then the ledger containing that entry is marked for recovery. Setting this to 0 disables the periodic check. Bookie checks will still run when a bookie fails. The interval is specified in seconds.|86400| +|numAddWorkerThreads|The number of threads that should handle write requests. If zero, the writes would be handled by netty threads directly.|0| +|numReadWorkerThreads|The number of threads that should handle read requests. If zero, the reads would be handled by netty threads directly.|8| +|numHighPriorityWorkerThreads|The number of threads that should be used for high priority requests (i.e. recovery reads and adds, and fencing).|8| +|maxPendingReadRequestsPerThread|If read workers threads are enabled, limit the number of pending requests, to avoid the executor queue to grow indefinitely.|2500| +|maxPendingAddRequestsPerThread|The limited number of pending requests, which is used to avoid the executor queue to grow indefinitely when add workers threads are enabled.|10000| +|isForceGCAllowWhenNoSpace|Whether force compaction is allowed when the disk is full or almost full. Forcing GC could get some space back, but could also fill up the disk space more quickly. This is because new log files are created before GC, while old garbage log files are deleted after GC.|false| +|verifyMetadataOnGC|True if the bookie should double check `readMetadata` prior to GC.|false| +|flushEntrylogBytes|Entry log flush interval in bytes. Flushing in smaller chunks but more frequently reduces spikes in disk I/O. 
Flushing too frequently may also affect performance negatively.|268435456| +|readBufferSizeBytes|The number of bytes we should use as capacity for BufferedReadChannel.|4096| +|writeBufferSizeBytes|The number of bytes used as capacity for the write buffer|65536| +|useHostNameAsBookieID|Whether the bookie should use its hostname to register with the coordination service (e.g.: zookeeper service). When false, bookie will use its ip address for the registration.|false| +|bookieId | If you want to custom a bookie ID or use a dynamic network address for the bookie, you can set the `bookieId`.

    Bookie advertises itself using the `bookieId` rather than the `BookieSocketAddress` (`hostname:port` or `IP:port`). If you set the `bookieId`, then the `useHostNameAsBookieID` does not take effect.

    The `bookieId` is a non-empty string that can contain ASCII digits and letters ([a-zA-Z0-9]), colons, dashes, and dots.

    For more information about `bookieId`, see [here](http://bookkeeper.apache.org/bps/BP-41-bookieid/).|N/A| +|allowEphemeralPorts|Whether the bookie is allowed to use an ephemeral port (port 0) as its server port. By default, an ephemeral port is not allowed. Using an ephemeral port as the service port usually indicates a configuration error. However, in unit tests, using an ephemeral port will address port conflict problems and allow running tests in parallel.|false| +|enableLocalTransport|Whether the bookie is allowed to listen for the BookKeeper clients executed on the local JVM.|false| +|disableServerSocketBind|Whether the bookie is allowed to disable bind on network interfaces. This bookie will be available only to BookKeeper clients executed on the local JVM.|false| +|skipListArenaChunkSize|The number of bytes that we should use as chunk allocation for `org.apache.bookkeeper.bookie.SkipListArena`.|4194304| +|skipListArenaMaxAllocSize|The maximum size that we should allocate from the skiplist arena. Allocations larger than this should be allocated directly by the VM to avoid fragmentation.|131072| +|bookieAuthProviderFactoryClass|The factory class name of the bookie authentication provider. If this is null, then there is no authentication.|null| +|statsProviderClass||org.apache.bookkeeper.stats.prometheus.PrometheusMetricsProvider| +|prometheusStatsHttpPort||8000| +|dbStorage_writeCacheMaxSizeMb|Size of Write Cache. Memory is allocated from JVM direct memory. Write cache is used to buffer entries before flushing into the entry log. For good performance, it should be big enough to hold a substantial amount of entries in the flush interval.|25% of direct memory| +|dbStorage_readAheadCacheMaxSizeMb|Size of Read cache. Memory is allocated from JVM direct memory. This read cache is pre-filled doing read-ahead whenever a cache miss happens. 
By default, it is allocated to 25% of the available direct memory.|N/A| +|dbStorage_readAheadCacheBatchSize|How many entries to pre-fill in cache after a read cache miss|1000| +|dbStorage_rocksDB_blockCacheSize|Size of RocksDB block-cache. For best performance, this cache should be big enough to hold a significant portion of the index database which can reach ~2GB in some cases. By default, it uses 10% of direct memory.|N/A| +|dbStorage_rocksDB_writeBufferSizeMB||64| +|dbStorage_rocksDB_sstSizeInMB||64| +|dbStorage_rocksDB_blockSize||65536| +|dbStorage_rocksDB_bloomFilterBitsPerKey||10| +|dbStorage_rocksDB_numLevels||-1| +|dbStorage_rocksDB_numFilesInLevel0||4| +|dbStorage_rocksDB_maxSizeInLevel1MB||256| + +## Broker + +Pulsar brokers are responsible for handling incoming messages from producers, dispatching messages to consumers, replicating data between clusters, and more. + +|Name|Description|Default| +|---|---|---| +|advertisedListeners|Specify multiple advertised listeners for the broker.

    The format is `<listener_name>:pulsar://<host>:<port>`.

    If there are multiple listeners, separate them with commas.

    **Note**: do not use this configuration with `advertisedAddress` and `brokerServicePort`. If the value of this configuration is empty, the broker uses `advertisedAddress` and `brokerServicePort`|/| +|internalListenerName|Specify the internal listener name for the broker.

    **Note**: the listener name must be contained in `advertisedListeners`.

    If the value of this configuration is empty, the broker uses the first listener as the internal listener.|/| +|authenticateOriginalAuthData| If this flag is set to `true`, the broker authenticates the original Auth data; else it just accepts the originalPrincipal and authorizes it (if required). |false| +|enablePersistentTopics| Whether persistent topics are enabled on the broker |true| +|enableNonPersistentTopics| Whether non-persistent topics are enabled on the broker |true| +|functionsWorkerEnabled| Whether the Pulsar Functions worker service is enabled in the broker |false| +|exposePublisherStats|Whether to enable topic level metrics.|true| +|statsUpdateFrequencyInSecs||60| +|statsUpdateInitialDelayInSecs||60| +|metadataStoreUrl| Metadata store quorum connection string || +| metadataStoreConfigPath | The configuration file path of the local metadata store. See [Configure metadata store](administration-metadata-store.md) for details. |N/A| +|metadataStoreCacheExpirySeconds|Metadata store cache expiry time in seconds|300| +|configurationMetadataStoreUrl| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| Broker data port |6650| +|brokerServicePortTls| Broker data port for TLS |6651| +|webServicePort| Port to use to server HTTP request |8080| +|webServicePortTls| Port to use to server HTTPS request |8443| +|webSocketServiceEnabled| Enable the WebSocket API service in broker |false| +|webSocketNumIoThreads|The number of IO threads in Pulsar Client used in WebSocket proxy.|Runtime.getRuntime().availableProcessors()| +|webSocketConnectionsPerBroker|The number of connections per Broker in Pulsar Client used in WebSocket proxy.|Runtime.getRuntime().availableProcessors()| +|webSocketSessionIdleTimeoutMillis|Time in milliseconds that idle WebSocket session times out.|300000| +|webSocketMaxTextFrameSize|The maximum size of a text message during parsing in WebSocket proxy.|1048576| +|exposeTopicLevelMetricsInPrometheus|Whether to 
enable topic level metrics.|true| +|exposeConsumerLevelMetricsInPrometheus|Whether to enable consumer level metrics.|false| +|jvmGCMetricsLoggerClassName|Classname of Pluggable JVM GC metrics logger that can log GC specific metrics.|N/A| +|bindAddress| Hostname or IP address the service binds on, default is 0.0.0.0. |0.0.0.0| +|bindAddresses| Additional Hostname or IP addresses the service binds on: `listener_name:scheme://host:port,...`. || +|advertisedAddress| Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. || +|clusterName| Name of the cluster to which this broker belongs to || +|maxTenants|The maximum number of tenants that can be created in each Pulsar cluster. When the number of tenants reaches the threshold, the broker rejects the request of creating a new tenant. The default value 0 disables the check. |0| +|brokerDeduplicationEnabled| Sets the default behavior for message deduplication in the broker. If enabled, the broker will reject messages that were already stored in the topic. This setting can be overridden on a per-namespace basis. |false| +|brokerDeduplicationMaxNumberOfProducers| The maximum number of producers for which information will be stored for deduplication purposes. |10000| +|brokerDeduplicationEntriesInterval| The number of entries after which a deduplication informational snapshot is taken. A larger interval will lead to fewer snapshots being taken, though this would also lengthen the topic recovery time (the time required for entries published after the snapshot to be replayed). |1000| +|brokerDeduplicationSnapshotIntervalSeconds| The time period after which a deduplication informational snapshot is taken. It runs simultaneously with `brokerDeduplicationEntriesInterval`. 
|120| +|brokerDeduplicationProducerInactivityTimeoutMinutes| The time of inactivity (in minutes) after which the broker will discard deduplication information related to a disconnected producer. |360| +|brokerDeduplicationSnapshotFrequencyInSeconds| How often is the thread pool scheduled to check whether a snapshot needs to be taken. The value of `0` means it is disabled. |120| +|dispatchThrottlingRateInMsg| Dispatch throttling-limit of messages for a broker (per second). 0 means the dispatch throttling-limit is disabled. |0| +|dispatchThrottlingRateInByte| Dispatch throttling-limit of bytes for a broker (per second). 0 means the dispatch throttling-limit is disabled. |0| +|dispatchThrottlingRatePerTopicInMsg| Dispatch throttling-limit of messages for every topic (per second). 0 means the dispatch throttling-limit is disabled. |0| +|dispatchThrottlingRatePerTopicInByte| Dispatch throttling-limit of bytes for every topic (per second). 0 means the dispatch throttling-limit is disabled. |0| +|dispatchThrottlingOnBatchMessageEnabled|Apply dispatch rate limiting on batch message instead individual messages with in batch message. (Default is disabled). | false| +|dispatchThrottlingRateRelativeToPublishRate| Enable dispatch rate-limiting relative to publish rate. | false | +|dispatchThrottlingRatePerSubscriptionInMsg| Dispatch throttling-limit of messages for a subscription. 0 means the dispatch throttling-limit is disabled. |0| +|dispatchThrottlingRatePerSubscriptionInByte|Dispatch throttling-limit of bytes for a subscription. 0 means the dispatch throttling-limit is disabled.|0| +|dispatchThrottlingRatePerReplicatorInMsg| The default messages per second dispatch throttling-limit for every replicator in replication. The value of `0` means disabling replication message dispatch-throttling| 0 | +|dispatchThrottlingRatePerReplicatorInByte| The default bytes per second dispatch throttling-limit for every replicator in replication. 
The value of `0` means disabling replication message-byte dispatch-throttling| 0 | +|metadataStoreSessionTimeoutMillis| Metadata store session timeout in milliseconds |30000| +|brokerShutdownTimeoutMs| Time to wait for broker graceful shutdown. After this time elapses, the process will be killed |60000| +|skipBrokerShutdownOnOOM| Flag to skip broker shutdown when broker handles Out of memory error. |false| +|backlogQuotaCheckEnabled| Enable backlog quota check. Enforces action on topic when the quota is reached |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the quota |60| +|backlogQuotaDefaultLimitBytes| The default per-topic backlog quota limit. Being less than 0 means no limitation. By default, it is -1. | -1 | +|backlogQuotaDefaultRetentionPolicy|The defaulted backlog quota retention policy. By Default, it is `producer_request_hold`.
  • 'producer_request_hold' Policy which holds producer's send request until the resource becomes available (or holding times out)
  • 'producer_exception' Policy which throws `javax.jms.ResourceAllocationException` to the producer
  • 'consumer_backlog_eviction' Policy which evicts the oldest message from the slowest consumer's backlog
  • |producer_request_hold| +|allowAutoTopicCreation| Enable topic auto creation if a new producer or consumer connected |true| +|allowAutoTopicCreationType| The type of topic that is allowed to be automatically created.(partitioned/non-partitioned) |non-partitioned| +|allowAutoSubscriptionCreation| Enable subscription auto creation if a new consumer connected |true| +|defaultNumPartitions| The number of partitioned topics that is allowed to be automatically created if `allowAutoTopicCreationType` is partitioned |1| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. If topics are not consumed for some while, these inactive topics might be cleaned up. Deleting inactive topics is enabled by default. The default period is 1 minute.
    **Note:** When `brokerDeleteInactiveTopicsEnabled` is set to `true`, you need to ensure that `allowAutoTopicCreation` is also set to `true`. |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics |60| +| brokerDeleteInactiveTopicsMode | Set the mode to delete inactive topics.
  • `delete_when_no_subscriptions`: delete the topic which has no subscriptions or active producers.
  • `delete_when_subscriptions_caught_up`: delete the topic whose subscriptions have no backlogs and which has no active producers or consumers.
  • | `delete_when_no_subscriptions` | +| brokerDeleteInactiveTopicsMaxInactiveDurationSeconds | Set the maximum duration for inactive topics. If it is not specified, the `brokerDeleteInactiveTopicsFrequencySeconds` parameter is adopted. | N/A | +|forceDeleteTenantAllowed| Enable you to delete a tenant forcefully. |false| +|forceDeleteNamespaceAllowed| Enable you to delete a namespace forcefully. |false| +|messageExpiryCheckIntervalInMinutes| The frequency of proactively checking and purging expired messages. |5| +|brokerServiceCompactionMonitorIntervalInSeconds| Interval between checks to determine whether topics with compaction policies need compaction. |60| +|brokerServiceCompactionThresholdInBytes|If the estimated backlog size is greater than this threshold, compaction is triggered.

    Setting this threshold to 0 disables the compaction check.|N/A| +|delayedDeliveryEnabled| Whether to enable the delayed delivery for messages. If disabled, messages will be immediately delivered and there will be no tracking overhead.|true| +|delayedDeliveryTickTimeMillis|Control the tick time for retrying on delayed delivery, which affects the accuracy of the delivery time compared to the scheduled time. By default, it is 1 second.|1000| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +|clientLibraryVersionCheckEnabled| Enable check for minimum allowed client library version |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| Path for the file used to determine the rotation status for the broker when responding to service discovery health checks || +|preferLaterVersions| If true, (and ModularLoadManagerImpl is being used), the load manager will attempt to use only brokers running the latest software version (to minimize impact to bundles) |false| +|maxNumPartitionsPerPartitionedTopic|Max number of partitions per partitioned topic. Use 0 or negative number to disable the check|0| +| maxSubscriptionsPerTopic | Maximum number of subscriptions allowed to subscribe to a topic. Once this limit is reached, the broker rejects new subscriptions until the number of subscriptions decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxProducersPerTopic | Maximum number of producers allowed to connect to a topic. Once this limit is reached, the broker rejects new producers until the number of connected producers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerTopic | Maximum number of consumers allowed to connect to a topic. Once this limit is reached, the broker rejects new consumers until the number of connected consumers decreases. 
When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerSubscription | Maximum number of consumers allowed to connect to a subscription. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate file. This cert is used to verify that any certs presented by connecting clients are signed by a certificate authority. If this verification fails, then the certs are untrusted and the connections are dropped. || +|tlsAllowInsecureConnection| Accept untrusted TLS certificate from client. If it is set to `true`, a client with a cert which cannot be verified with the 'tlsTrustCertsFilePath' cert will be allowed to connect to the server, though the cert will not be used for client authentication. |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.3```, ```TLSv1.2``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. 
Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +|tlsEnabledWithKeyStore| Enable TLS with KeyStore type configuration in broker |false| +|tlsProvider| TLS Provider for KeyStore type || +|tlsKeyStoreType| TLS KeyStore type configuration in broker: JKS, PKCS12 |JKS| +|tlsKeyStore| TLS KeyStore path in broker || +|tlsKeyStorePassword| TLS KeyStore password for broker || +|brokerClientTlsEnabledWithKeyStore| Whether internal client uses KeyStore type to authenticate with Pulsar brokers |false| +|brokerClientSslProvider| The TLS Provider used by internal client to authenticate with other Pulsar brokers || +|brokerClientTlsTrustStoreType| TLS TrustStore type configuration for internal client: JKS, PKCS12, used by the internal client to authenticate with Pulsar brokers |JKS| +|brokerClientTlsTrustStore| TLS TrustStore path for internal client, used by the internal client to authenticate with Pulsar brokers || +|brokerClientTlsTrustStorePassword| TLS TrustStore password for internal client, used by the internal client to authenticate with Pulsar brokers || +|brokerClientTlsCiphers| Specify the tls cipher the internal client will use to negotiate during TLS Handshake. (a comma-separated list of ciphers) e.g. [TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256]|| +|brokerClientTlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS handshake. (a comma-separated list of protocol names). e.g. `TLSv1.3`, `TLSv1.2` || +| metadataStoreBatchingEnabled | Enable metadata operations batching. | true | +| metadataStoreBatchingMaxDelayMillis | Maximum delay to impose on batching grouping. | 5 | +| metadataStoreBatchingMaxOperations | Maximum number of operations to include in a single batch. | 1000 | +| metadataStoreBatchingMaxSizeKb | Maximum size of a batch. | 128 | +|ttlDurationDefaultInSeconds|The default Time to Live (TTL) for namespaces if the TTL is not configured at namespace policies. When the value is set to `0`, TTL is disabled. 
By default, TTL is disabled. |0| +|tokenSettingPrefix| Configure the prefix of the token-related settings, such as `tokenSecretKey`, `tokenPublicKey`, `tokenAuthClaim`, `tokenPublicAlg`, `tokenAudienceClaim`, and `tokenAudience`. || +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicAlg| Configure the algorithm to be used to validate auth tokens. This can be any of the asymmetric algorithms supported by Java JWT (https://github.com/jwtk/jjwt#signature-algorithms-keys) |RS256| +|tokenAuthClaim| Specify which of the token's claims will be used as the authentication "principal" or "role". The default "sub" claim will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud", that will be used to get the audience from token. If not set, audience will not be verified. || +|tokenAudience| The token audience that stands for this broker. The field `tokenAudienceClaim` of a valid token must contain this value. || +|maxUnackedMessagesPerConsumer| Max number of unacknowledged messages that a consumer on a shared subscription is allowed to receive. The broker stops sending messages to the consumer once this limit is reached, until the consumer starts acknowledging messages back. A value of 0 disables the unacked-message limit check, and the consumer can receive messages without any restriction |50000| +|maxUnackedMessagesPerSubscription| Max number of unacknowledged messages allowed per shared subscription. 
The broker stops dispatching messages to all consumers of the subscription once this limit is reached, until consumers start acknowledging messages and the unacked count drops to limit/2. Setting the value to 0 disables the unacked-message limit check, so the dispatcher can dispatch messages without any restriction |200000| +|subscriptionRedeliveryTrackerEnabled| Enable subscription message redelivery tracker |true| +|subscriptionExpirationTimeMinutes | How long to wait after the last consumption before deleting inactive subscriptions.

    Setting this configuration to a value **greater than 0** deletes inactive subscriptions automatically.
    Setting this configuration to **0** does not delete inactive subscriptions automatically.

    Since this configuration takes effect on all topics, if there is even one topic whose subscriptions should not be deleted automatically, you need to set it to 0.
    Instead, you can set a subscription expiration time for each **namespace** using the [`pulsar-admin namespaces set-subscription-expiration-time options` command](/tools/pulsar-admin/). | 0 | +|maxConcurrentLookupRequest| Max number of concurrent lookup request broker allows to throttle heavy incoming lookup traffic |50000| +|maxConcurrentTopicLoadRequest| Max number of concurrent topic loading request broker allows to control number of zk-operations |5000| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is comma separated list of class names || +| authenticationRefreshCheckSeconds | Interval of time for checking for expired authentication credentials | 60 | +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics || +|brokerClientAuthenticationPlugin| Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters || +|brokerClientAuthenticationParameters||| +|athenzDomainNames| Supported Athenz provider domain names(comma separated) for authentication || +|exposePreciseBacklogInPrometheus| Enable expose the precise backlog stats, set false to use published counter and consumed counter to calculate, this would be more efficient but may be inaccurate. 
|false| +|schemaRegistryStorageClassName|The schema storage implementation used by this broker.|org.apache.pulsar.broker.service.schema.BookkeeperSchemaStorageFactory| +|isSchemaValidationEnforced| Whether to enable schema validation, when schema validation is enabled, if a producer without a schema attempts to produce the message to a topic with schema, the producer is rejected and disconnected.|false| +|isAllowAutoUpdateSchemaEnabled|Allow schema to be auto updated at broker level.|true| +|schemaCompatibilityStrategy| The schema compatibility strategy at broker level, see [here](schema-evolution-compatibility.md#schema-compatibility-check-strategy) for available values.|FULL| +|systemTopicSchemaCompatibilityStrategy| The schema compatibility strategy is used for system topics, see [here](schema-evolution-compatibility.md#schema-compatibility-check-strategy) for available values.|ALWAYS_COMPATIBLE| +| topicFencingTimeoutSeconds | If a topic remains fenced for a certain time period (in seconds), it is closed forcefully. If set to 0 or a negative number, the fenced topic is not closed. | 0 | +|offloadersDirectory|The directory for all the offloader implementations.|./offloaders| +|bookkeeperMetadataServiceUri| Metadata service uri that bookkeeper is used for loading corresponding metadata driver and resolving its metadata service location. This value can be fetched using `bookkeeper shell whatisinstanceid` command in BookKeeper cluster. For example: zk+hierarchical://localhost:2181/ledgers. 
The metadata service uri list can also be semicolon separated values like below: zk+hierarchical://zk1:2181;zk2:2181;zk3:2181/ledgers || +|bookkeeperClientAuthenticationPlugin| Authentication plugin to use when connecting to bookies || +|bookkeeperClientAuthenticationParametersName| BookKeeper auth plugin implementation specifics parameters name and values || +|bookkeeperClientAuthenticationParameters||| +|bookkeeperClientNumWorkerThreads| Number of BookKeeper client worker threads. Default is Runtime.getRuntime().availableProcessors() || +|bookkeeperClientTimeoutInSeconds| Timeout for BK add / read operations |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time Using a value of 0, is disabling the speculative reads |0| +|bookkeeperNumberOfChannelsPerBookie| Number of channels per bookie |16| +|bookkeeperClientHealthCheckEnabled| Enable bookies health check. Bookies that have more than the configured number of failure within the interval will be quarantined for some time. During this period, new ledgers won’t be created on these bookies |true| +|bookkeeperClientHealthCheckIntervalSeconds||60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval||5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds ||1800| +|bookkeeperClientRackawarePolicyEnabled| Enable rack-aware bookie selection policy. BK will chose bookies from different racks when forming a new bookie ensemble |true| +|bookkeeperClientRegionawarePolicyEnabled| Enable region-aware bookie selection policy. BK will chose bookies from different regions and racks when forming a new bookie ensemble. If enabled, the value of bookkeeperClientRackawarePolicyEnabled is ignored |false| +|bookkeeperClientMinNumRacksPerWriteQuorum| Minimum number of racks per write quorum. BK rack-aware bookie selection policy will try to get bookies from at least 'bookkeeperClientMinNumRacksPerWriteQuorum' racks for a write quorum. 
|2| +|bookkeeperClientEnforceMinNumRacksPerWriteQuorum| Enforces rack-aware bookie selection policy to pick bookies from 'bookkeeperClientMinNumRacksPerWriteQuorum' racks for a writeQuorum. If BK can't find bookie then it would throw BKNotEnoughBookiesException instead of picking random one. |false| +|bookkeeperClientReorderReadSequenceEnabled| Enable/disable reordering read sequence on reading entries. |false| +|bookkeeperClientIsolationGroups| Enable bookie isolation by specifying a list of bookie groups to choose from. Any bookie outside the specified groups will not be used by the broker || +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. || +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. || +|bookkeeperClientGetBookieInfoIntervalSeconds| Set the interval to periodically check bookie info |86400| +|bookkeeperClientGetBookieInfoRetryIntervalSeconds| Set the interval to retry a failed bookie info lookup |60| +|bookkeeperEnableStickyReads | Enable/disable having read operations for a ledger to be sticky to a single bookie. If this flag is enabled, the client will use one single bookie (by preference) to read all entries for a ledger. | true | +|managedLedgerDefaultEnsembleSize| Number of bookies to use when creating a ledger |2| +|managedLedgerDefaultWriteQuorum| Number of copies to store for each message |2| +|managedLedgerDefaultAckQuorum| Number of guaranteed copies (acks to wait before write is complete) |2| +|managedLedgerCacheSizeMB| Amount of memory to use for caching data payload in managed ledger. This memory is allocated from JVM direct memory and it’s shared across all the topics running in the same broker. 
By default, uses 1/5th of available direct memory || +|managedLedgerCacheCopyEntries| Whether we should make a copy of the entry payloads when inserting in cache| false| +|managedLedgerCacheEvictionWatermark| Threshold to which bring down the cache level when eviction is triggered |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. | 1000| +|managedLedgerDefaultMarkDeleteRateLimit| Rate limit the amount of writes per second generated by consumer acking the messages |1.0| +|managedLedgerMaxEntriesPerLedger| The max number of entries to append to a ledger before triggering a rollover. A ledger rollover is triggered after the min rollover time has passed and one of the following conditions is true:
    • The max rollover time has been reached
    • The max entries have been written to the ledger
    • The max ledger size has been written to the ledger
    |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| Minimum time between ledger rollover for a topic |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| Maximum time before forcing a ledger rollover for a topic |240| +|managedLedgerInactiveLedgerRolloverTimeSeconds| Time to rollover ledger for inactive topic |0| +|managedLedgerCursorMaxEntriesPerLedger| Max number of entries to append to a cursor ledger |50000| +|managedLedgerCursorRolloverTimeInSeconds| Max time before triggering a rollover on a cursor ledger |14400| +|managedLedgerMaxUnackedRangesToPersist| Max number of “acknowledgment holes” that are going to be persistently stored. When acknowledging out of order, a consumer will leave holes that are supposed to be quickly filled by acking all the messages. The information of which messages are acknowledged is persisted by compressing in “ranges” of messages that were acknowledged. After the max number of ranges is reached, the information will only be tracked in memory and messages will be redelivered in case of crashes. |1000| +|autoSkipNonRecoverableData| Skip reading non-recoverable/unreadable data-ledger under managed-ledger’s list.It helps when data-ledgers gets corrupted at bookkeeper and managed-cursor is stuck at that ledger. |false| +|loadBalancerEnabled| Enable load balancer |true| +|loadBalancerPlacementStrategy| Strategy to assign a new bundle weightedRandomSelection || +|loadBalancerReportUpdateThresholdPercentage| Percentage of change to trigger load report update |10| +|loadBalancerReportUpdateMaxIntervalMinutes| Maximum interval to update load report |15| +|loadBalancerHostUsageCheckIntervalMinutes| Frequency of report to collect |1| +|loadBalancerSheddingIntervalMinutes| Load shedding interval. 
Broker periodically checks whether some traffic should be offload from some over-loaded broker to other under-loaded brokers |30| +|loadBalancerSheddingGracePeriodMinutes| Prevent the same topics to be shed and moved to other broker more than once within this timeframe |30| +|loadBalancerBrokerMaxTopics| Usage threshold to allocate max number of topics to broker |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| Usage threshold to determine a broker as under-loaded |1| +|loadBalancerBrokerOverloadedThresholdPercentage| Usage threshold to determine a broker as over-loaded |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| Interval to update namespace bundle resource quota |15| +|loadBalancerBrokerComfortLoadLevelPercentage| Usage threshold to determine a broker is having just right level of load |65| +|loadBalancerAutoBundleSplitEnabled| enable/disable namespace bundle auto split |false| +|loadBalancerNamespaceBundleMaxTopics| maximum topics in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxSessions| maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxMsgRate| maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered |100| +|loadBalancerNamespaceMaximumBundles| maximum number of bundles in a namespace |128| +|loadBalancerLoadSheddingStrategy | The shedding strategy of load balance.

    Available values:
  • `org.apache.pulsar.broker.loadbalance.impl.ThresholdShedder`
  • `org.apache.pulsar.broker.loadbalance.impl.OverloadShedder`
  • `org.apache.pulsar.broker.loadbalance.impl.UniformLoadShedder`

  • For the comparisons of the shedding strategies, see [here](administration-load-balance/#shed-load-automatically).|`org.apache.pulsar.broker.loadbalance.impl.ThresholdShedder` +|replicationMetricsEnabled| Enable replication metrics |true| +|replicationConnectionsPerBroker| Max number of connections to open for each broker in a remote cluster More connections host-to-host lead to better throughput over high-latency links. |16| +|replicationProducerQueueSize| Replicator producer queue size |1000| +|replicatorPrefix| Replicator prefix used for replicator producer name and cursor name pulsar.repl|| +|transactionCoordinatorEnabled|Whether to enable transaction coordinator in broker.|true| +|transactionMetadataStoreProviderClassName| |org.apache.pulsar.transaction.coordinator.impl.InMemTransactionMetadataStoreProvider| +|defaultRetentionTimeInMinutes| Default message retention time |0| +|defaultRetentionSizeInMB| Default retention size |0| +|keepAliveIntervalSeconds| How often to check whether the connections are still alive |30| +|bootstrapNamespaces| The bootstrap name. | N/A | +|loadManagerClassName| Name of load manager to use |org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl| +|supportedNamespaceBundleSplitAlgorithms| Supported algorithms name for namespace bundle split |[range_equally_divide,topic_count_equally_divide]| +|defaultNamespaceBundleSplitAlgorithm| Default algorithm name for namespace bundle split |range_equally_divide| +|managedLedgerOffloadDriver| The directory for all the offloader implementations `offloadersDirectory=./offloaders`. Driver to use to offload old data to long term storage (Possible values: S3, aws-s3, google-cloud-storage). When using google-cloud-storage, Make sure both Google Cloud Storage and Google Cloud Storage JSON API are enabled for the project (check from Developers Console -> Api&auth -> APIs). 
|| +|managedLedgerOffloadMaxThreads| Maximum number of thread pool threads for ledger offloading |2| +|managedLedgerOffloadPrefetchRounds|The maximum prefetch rounds for ledger reading for offloading.|1| +|managedLedgerUnackedRangesOpenCacheSetEnabled| Use Open Range-Set to cache unacknowledged messages |true| +|managedLedgerOffloadDeletionLagMs|Delay between a ledger being successfully offloaded to long term storage and the ledger being deleted from bookkeeper | 14400000| +|managedLedgerOffloadAutoTriggerSizeThresholdBytes|The number of bytes before triggering automatic offload to long term storage |-1 (disabled)| +|s3ManagedLedgerOffloadRegion| For Amazon S3 ledger offload, AWS region || +|s3ManagedLedgerOffloadBucket| For Amazon S3 ledger offload, Bucket to place offloaded ledger into || +|s3ManagedLedgerOffloadServiceEndpoint| For Amazon S3 ledger offload, Alternative endpoint to connect to (useful for testing) || +|s3ManagedLedgerOffloadMaxBlockSizeInBytes| For Amazon S3 ledger offload, Max block size in bytes. (64MB by default, 5MB minimum) |67108864| +|s3ManagedLedgerOffloadReadBufferSizeInBytes| For Amazon S3 ledger offload, Read buffer size in bytes (1MB by default) |1048576| +|gcsManagedLedgerOffloadRegion|For Google Cloud Storage ledger offload, region where offload bucket is located. Go to this page for more details: https://cloud.google.com/storage/docs/bucket-locations .|N/A| +|gcsManagedLedgerOffloadBucket|For Google Cloud Storage ledger offload, Bucket to place offloaded ledger into.|N/A| +|gcsManagedLedgerOffloadMaxBlockSizeInBytes|For Google Cloud Storage ledger offload, the maximum block size in bytes. (64MB by default, 5MB minimum)|67108864| +|gcsManagedLedgerOffloadReadBufferSizeInBytes|For Google Cloud Storage ledger offload, Read buffer size in bytes. (1MB by default)|1048576| +|gcsManagedLedgerOffloadServiceAccountKeyFile|For Google Cloud Storage, path to json file containing service account credentials. 
For more details, see the "Service Accounts" section of https://support.google.com/googleapi/answer/6158849 .|N/A| +|fileSystemProfilePath|For File System Storage, file system profile path.|../conf/filesystem_offload_core_site.xml| +|fileSystemURI|For File System Storage, file system uri.|N/A| +|s3ManagedLedgerOffloadRole| For Amazon S3 ledger offload, provide a role to assume before writing to s3 || +|s3ManagedLedgerOffloadRoleSessionName| For Amazon S3 ledger offload, provide a role session name when using a role |pulsar-s3-offload| +| acknowledgmentAtBatchIndexLevelEnabled | Enable or disable the batch index acknowledgement. | false | +|enableReplicatedSubscriptions|Whether to enable tracking of replicated subscriptions state across clusters.|true| +|replicatedSubscriptionsSnapshotFrequencyMillis|The frequency of snapshots for replicated subscriptions tracking.|1000| +|replicatedSubscriptionsSnapshotTimeoutSeconds|The timeout for building a consistent snapshot for tracking replicated subscriptions state.|30| +|replicatedSubscriptionsSnapshotMaxCachedPerSubscription|The maximum number of snapshot to be cached per subscription.|10| +|maxMessagePublishBufferSizeInMB|The maximum memory size for a broker to handle messages that are sent by producers. If the processing message size exceeds this value, the broker stops reading data from the connection. The processing messages refer to the messages that are sent to the broker but the broker has not sent response to the client. Usually the messages are waiting to be written to bookies. It is shared across all the topics running in the same broker. The value `-1` disables the memory limitation. By default, it is 50% of direct memory.|N/A| +|messagePublishBufferCheckIntervalInMillis|Interval between checks to see if message publish buffer size exceeds the maximum. 
Use `0` or negative number to disable the max publish buffer limiting.|100| +|retentionCheckIntervalInSeconds|Check between intervals to see if consumed ledgers need to be trimmed. Use 0 or negative number to disable the check.|120| +| maxMessageSize | Set the maximum size of a message. | 5242880 | +| preciseTopicPublishRateLimiterEnable | Enable precise topic publish rate limiting. | false | +| lazyCursorRecovery | Whether to recover cursors lazily when trying to recover a managed ledger backing a persistent topic. It can improve write availability of topics. The caveat is now when recovered ledger is ready to write we're not sure if all old consumers' last mark delete position(ack position) can be recovered or not. So user can make the trade off or have custom logic in application to checkpoint consumer state.| false | +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +| maxNamespacesPerTenant | The maximum number of namespaces that can be created in each tenant. When the number of namespaces reaches this threshold, the broker rejects the request of creating a new tenant. The default value 0 disables the check. |0| +| maxTopicsPerNamespace | The maximum number of persistent topics that can be created in the namespace. When the number of topics reaches this threshold, the broker rejects the request of creating a new topic, including the auto-created topics by the producer or consumer, until the number of connected consumers decreases. The default value 0 disables the check. | 0 | +|subscriptionTypesEnabled| Enable all subscription types, which are exclusive, shared, failover, and key_shared. | Exclusive, Shared, Failover, Key_Shared | +| managedLedgerInfoCompressionType | Compression type of managed ledger information.

    Available options are `NONE`, `LZ4`, `ZLIB`, `ZSTD`, and `SNAPPY`.

    If this value is `NONE` or invalid, the `managedLedgerInfo` is not compressed.

    **Note** that after enabling this configuration, if you want to downgrade a broker, you need to change the value to `NONE` and make sure all ledger metadata is saved without compression. | None | +| additionalServlets | Additional servlet name.

    If you have multiple additional servlets, separate them by commas.

    For example, additionalServlet_1, additionalServlet_2 | N/A | +| additionalServletDirectory | Location of broker additional servlet NAR directory | ./brokerAdditionalServlet | +| brokerEntryMetadataInterceptors | Set broker entry metadata interceptors.

    Multiple interceptors should be separated by commas.

    Available values:
  • org.apache.pulsar.common.intercept.AppendBrokerTimestampMetadataInterceptor
  • org.apache.pulsar.common.intercept.AppendIndexMetadataInterceptor


  • Example
    brokerEntryMetadataInterceptors=org.apache.pulsar.common.intercept.AppendBrokerTimestampMetadataInterceptor, org.apache.pulsar.common.intercept.AppendIndexMetadataInterceptor|N/A | +| enableExposingBrokerEntryMetadataToClient|Whether to expose broker entry metadata to client or not.

    Available values:
  • true
  • false

  • Example
    enableExposingBrokerEntryMetadataToClient=true | false | +| strictBookieAffinityEnabled | Enable or disable the strict bookie isolation strategy. If enabled,
    - `bookie-ensemble` first tries to choose bookies that belong to a namespace's affinity group. If that group does not have enough bookies, the remaining bookies are chosen.
    - If a namespace has no affinity group, `bookie-ensemble` only chooses bookies that belong to no region. If the number of bookies is not enough, `BKNotEnoughBookiesException` is thrown.| false | +|narExtractionDirectory | The extraction directory of the nar package.
    Available for Protocol Handler, Additional Servlets, Entry Filter, Offloaders, Broker Interceptor. | System.getProperty("java.io.tmpdir") | + +#### Configuration override for clients internal to broker + +In 2.10.1 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the broker's Pulsar Clients and Pulsar Admin Clients. These configurations are applied after hard coded configuration and before the above broker client configurations named above.| +|bookkeeper_| Configure the broker's BookKeeper clients used by managed ledgers and the BookkeeperPackagesStorage bookkeeper client. Takes precedence over most other configuration values.| + +:::note + +When running the function worker within the broker, these prefixed configurations do not apply to any of those clients. You must configure those clients using the `functions_worker.yml` file. + +::: + +#### Deprecated parameters of Broker +The following parameters have been deprecated in the `conf/broker.conf` file. + +|Name|Description|Default| +|---|---|---| +|backlogQuotaDefaultLimitGB| Use `backlogQuotaDefaultLimitBytes` instead. |-1| +|brokerServicePurgeInactiveFrequencyInSeconds| Use `brokerDeleteInactiveTopicsFrequencySeconds`.|60| +|tlsEnabled| Use `webServicePortTls` and `brokerServicePortTls` instead. |false| +|replicationTlsEnabled| Enable TLS when talking with other clusters to replicate messages. Use `brokerClientTlsEnabled` instead. |false| +|subscriptionKeySharedEnable| Whether to enable the Key_Shared subscription. Use `subscriptionTypesEnabled` instead. |true| +|zookeeperServers| Zookeeper quorum connection string. Use `metadataStoreUrl` instead. |N/A| +|configurationStoreServers| Configuration store connection string (as a comma-separated list). Use `configurationMetadataStoreUrl` instead. |N/A| +|zooKeeperSessionTimeoutMillis| Zookeeper session timeout in milliseconds. 
Use `metadataStoreSessionTimeoutMillis` instead. |30000| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds. Use `metadataStoreCacheExpirySeconds` instead.|300| + + +## Client + +You can use the [`pulsar-client`](reference-cli-tools.md#pulsar-client) CLI tool to publish messages to and consume messages from Pulsar topics. You can use this tool in place of a client library. + +|Name|Description|Default| +|---|---|---| +|webServiceUrl| The web URL for the cluster. |http://localhost:8080/| +|brokerServiceUrl| The Pulsar protocol URL for the cluster. |pulsar://localhost:6650/| +|authPlugin| The authentication plugin. || +|authParams| The authentication parameters for the cluster, as a comma-separated string. || +|useTls| Whether to enforce the TLS authentication in the cluster. |false| +| tlsAllowInsecureConnection | Allow TLS connections to servers whose certificate cannot be verified to have been signed by a trusted certificate authority. | false | +| tlsEnableHostnameVerification | Whether the server hostname must match the common name of the certificate that is used by the server. | false | +|tlsTrustCertsFilePath||| +| useKeyStoreTls | Enable TLS with KeyStore type configuration in the broker. | false | +| tlsTrustStoreType | TLS TrustStore type configuration.
  • JKS
  • PKCS12
  • |JKS| +| tlsTrustStore | TLS TrustStore path. | | +| tlsTrustStorePassword | TLS TrustStore password. | | + + + + + + +## Log4j + +You can set the log level and configuration in the [log4j2.yaml](https://github.com/apache/pulsar/blob/d557e0aa286866363bc6261dec87790c055db1b0/conf/log4j2.yaml#L155) file. The following logging configuration parameters are available. + +|Name|Default| +|---|---| +|pulsar.root.logger| WARN,CONSOLE| +|pulsar.log.dir| logs| +|pulsar.log.file| pulsar.log| +|log4j.rootLogger| ${pulsar.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ISO8601} - %-5p - [%t:%C{1}@%L] - %m%n| +|log4j.appender.ROLLINGFILE| org.apache.log4j.DailyRollingFileAppender| +|log4j.appender.ROLLINGFILE.Threshold| DEBUG| +|log4j.appender.ROLLINGFILE.File| ${pulsar.log.dir}/${pulsar.log.file}| +|log4j.appender.ROLLINGFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.ROLLINGFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n| +|log4j.appender.TRACEFILE| org.apache.log4j.FileAppender| +|log4j.appender.TRACEFILE.Threshold| TRACE| +|log4j.appender.TRACEFILE.File| pulsar-trace.log| +|log4j.appender.TRACEFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.TRACEFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L][%x] - %m%n| + +:::note + +'topic' in log4j2.appender is configurable. +- If you want to append all logs to a single topic, set the same topic name. +- If you want to append logs to different topics, you can set different topic names. 
+ +::: + +## Log4j shell + +|Name|Default| +|---|---| +|bookkeeper.root.logger| ERROR,CONSOLE| +|log4j.rootLogger| ${bookkeeper.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ABSOLUTE} %-5p %m%n| +|log4j.logger.org.apache.zookeeper| ERROR| +|log4j.logger.org.apache.bookkeeper| ERROR| +|log4j.logger.org.apache.bookkeeper.bookie.BookieShell| INFO| + + +## Standalone + +|Name|Description|Default| +|---|---|---| +|authenticateOriginalAuthData| If this flag is set to `true`, the broker authenticates the original Auth data; else it just accepts the originalPrincipal and authorizes it (if required). |false| +|metadataStoreUrl| The quorum connection string for local metadata store || +|metadataStoreCacheExpirySeconds| Metadata store cache expiry time in seconds|300| +|configurationMetadataStoreUrl| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| The port on which the standalone broker listens for connections |6650| +|webServicePort| The port used by the standalone broker for HTTP requests |8080| +|bindAddress| The hostname or IP address on which the standalone service binds |0.0.0.0| +|bindAddresses| Additional Hostname or IP addresses the service binds on: `listener_name:scheme://host:port,...`. || +|advertisedAddress| The hostname or IP address that the standalone service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. 
|| +| numAcceptorThreads | Number of threads to use for Netty Acceptor | 1 | +| numIOThreads | Number of threads to use for Netty IO | 2 * Runtime.getRuntime().availableProcessors() | +| numHttpServerThreads | Number of threads to use for HTTP requests processing | 2 * Runtime.getRuntime().availableProcessors()| +|isRunningStandalone|This flag controls features that are meant to be used when running in standalone mode.|N/A| +|clusterName| The name of the cluster that this broker belongs to. |standalone| +| failureDomainsEnabled | Enable cluster's failure-domain which can distribute brokers into logical region. | false | +|metadataStoreSessionTimeoutMillis| Metadata store session timeout, in milliseconds. |30000| +|metadataStoreOperationTimeoutSeconds|Metadata store operation timeout in seconds.|30| +|brokerShutdownTimeoutMs| The time to wait for graceful broker shutdown. After this time elapses, the process will be killed. |60000| +|skipBrokerShutdownOnOOM| Flag to skip broker shutdown when broker handles Out of memory error. |false| +|backlogQuotaCheckEnabled| Enable the backlog quota check, which enforces a specified action when the quota is reached. |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the backlog quota. |60| +|backlogQuotaDefaultLimitBytes| The default per-topic backlog quota limit. Being less than 0 means no limitation. By default, it is -1. |-1| +|ttlDurationDefaultInSeconds|The default Time to Live (TTL) for namespaces if the TTL is not configured at namespace policies. When the value is set to `0`, TTL is disabled. By default, TTL is disabled. |0| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. If topics are not consumed for some while, these inactive topics might be cleaned up. Deleting inactive topics is enabled by default. The default period is 1 minute.
    **Note:** When `brokerDeleteInactiveTopicsEnabled` is set to `true`, you need to ensure that `allowAutoTopicCreation` is also set to `true`. |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics, in seconds. |60| +| maxPendingPublishRequestsPerConnection | Maximum pending publish requests per connection to avoid keeping large number of pending requests in memory | 1000| +|messageExpiryCheckIntervalInMinutes| How often to proactively check and purge expired messages. |5| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +| subscriptionExpirationTimeMinutes | How long to delete inactive subscriptions from last consumption. When it is set to 0, inactive subscriptions are not deleted automatically | 0 | +| subscriptionRedeliveryTrackerEnabled | Enable subscription message redelivery tracker to send redelivery count to consumer. | true | +| subscriptionKeySharedUseConsistentHashing | In Key_Shared subscription type, with default AUTO_SPLIT mode, use splitting ranges or consistent hashing to reassign keys to new consumers. | false | +| subscriptionKeySharedConsistentHashingReplicaPoints | In Key_Shared subscription type, the number of points in the consistent-hashing ring. The greater the number, the more equal the assignment of keys to consumers. | 100 | +| subscriptionExpiryCheckIntervalInMinutes | How frequently to proactively check and purge expired subscription |5 | +| brokerDeduplicationEnabled | Set the default behavior for message deduplication in the broker. This can be overridden per-namespace. If it is enabled, the broker rejects messages that are already stored in the topic.
| false | +| brokerDeduplicationMaxNumberOfProducers | Maximum number of producer information that it's going to be persisted for deduplication purposes | 10000 | +| brokerDeduplicationEntriesInterval | Number of entries after which a deduplication information snapshot is taken. A greater interval leads to less snapshots being taken though it would increase the topic recovery time, when the entries published after the snapshot need to be replayed. | 1000 | +| brokerDeduplicationProducerInactivityTimeoutMinutes | The time of inactivity (in minutes) after which the broker discards deduplication information related to a disconnected producer. | 360 | +| defaultNumberOfNamespaceBundles | When a namespace is created without specifying the number of bundles, this value is used as the default setting.| 4 | +|clientLibraryVersionCheckEnabled| Enable checks for minimum allowed client library version. |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| The path for the file used to determine the rotation status for the broker when responding to service discovery health checks |/usr/local/apache/htdocs| +|maxUnackedMessagesPerConsumer| The maximum number of unacknowledged messages allowed to be received by consumers on a shared subscription. The broker will stop sending messages to a consumer once this limit is reached or until the consumer begins acknowledging messages. A value of 0 disables the unacked message limit check and thus allows consumers to receive messages without any restrictions. |50000| +|maxUnackedMessagesPerSubscription| The same as above, except per subscription rather than per consumer. |200000| +| maxUnackedMessagesPerBroker | Maximum number of unacknowledged messages allowed per broker. 
Once this limit reaches, the broker stops dispatching messages to all shared subscriptions which has a higher number of unacknowledged messages until subscriptions start acknowledging messages back and unacknowledged messages count reaches to limit/2. When the value is set to 0, unacknowledged message limit check is disabled and broker does not block dispatchers. | 0 | +| maxUnackedMessagesPerSubscriptionOnBrokerBlocked | Once the broker reaches maxUnackedMessagesPerBroker limit, it blocks subscriptions which have higher unacknowledged messages than this percentage limit and subscription does not receive any new messages until that subscription acknowledges messages back. | 0.16 | +| unblockStuckSubscriptionEnabled|Broker periodically checks if subscription is stuck and unblock if flag is enabled.|false| +| topicPublisherThrottlingTickTimeMillis | Tick time to schedule task that checks topic publish rate limiting across all topics. A lower value can improve accuracy while throttling publish but it uses more CPU to perform frequent check. (Disable publish throttling with value 0) | 10| +| brokerPublisherThrottlingTickTimeMillis | Tick time to schedule task that checks broker publish rate limiting across all topics. A lower value can improve accuracy while throttling publish but it uses more CPU to perform frequent check. When the value is set to 0, publish throttling is disabled. |50 | +| brokerPublisherThrottlingMaxMessageRate | Maximum rate (in 1 second) of messages allowed to publish for a broker if the message rate limiting is enabled. When the value is set to 0, message rate limiting is disabled. | 0| +| brokerPublisherThrottlingMaxByteRate | Maximum rate (in 1 second) of bytes allowed to publish for a broker if the byte rate limiting is enabled. When the value is set to 0, the byte rate limiting is disabled. 
| 0 | +|subscribeThrottlingRatePerConsumer|Too many subscribe requests from a consumer can cause broker rewinding consumer cursors and loading data from bookies, hence causing high network bandwidth usage. When the positive value is set, broker will throttle the subscribe requests for one consumer. Otherwise, the throttling will be disabled. By default, throttling is disabled.|0| +|subscribeRatePeriodPerConsumerInSecond|Rate period for {subscribeThrottlingRatePerConsumer}. By default, it is 30s.|30| +|dispatchThrottlingRateInMsg| Dispatch throttling-limit of messages for a broker (per second). 0 means the dispatch throttling-limit is disabled. |0| +|dispatchThrottlingRateInByte| Dispatch throttling-limit of bytes for a broker (per second). 0 means the dispatch throttling-limit is disabled. |0| +| dispatchThrottlingRatePerTopicInMsg | Default messages (per second) dispatch throttling-limit for every topic. When the value is set to 0, default message dispatch throttling-limit is disabled. |0 | +| dispatchThrottlingRatePerTopicInByte | Default byte (per second) dispatch throttling-limit for every topic. When the value is set to 0, default byte dispatch throttling-limit is disabled. | 0| +| dispatchThrottlingOnBatchMessageEnabled |Apply dispatch rate limiting on batch message instead individual messages with in batch message. (Default is disabled). | false| +| dispatchThrottlingRateRelativeToPublishRate | Enable dispatch rate-limiting relative to publish rate. | false | +|dispatchThrottlingRatePerSubscriptionInMsg|The defaulted number of message dispatching throttling-limit for a subscription. The value of 0 disables message dispatch-throttling.|0| +|dispatchThrottlingRatePerSubscriptionInByte|The default number of message-bytes dispatching throttling-limit for a subscription. The value of 0 disables message-byte dispatch-throttling.|0| +|dispatchThrottlingRatePerReplicatorInMsg| Dispatch throttling-limit of messages for every replicator in replication (per second). 
0 means the dispatch throttling-limit in replication is disabled. |0| +|dispatchThrottlingRatePerReplicatorInByte| Dispatch throttling-limit of bytes for every replicator in replication (per second). 0 means the dispatch throttling-limit is disabled. |0| +| dispatchThrottlingOnNonBacklogConsumerEnabled | Enable dispatch-throttling for both caught up consumers as well as consumers who have backlogs. | true | +|dispatcherMaxReadBatchSize|The maximum number of entries to read from BookKeeper. By default, it is 100 entries.|100| +|dispatcherMaxReadSizeBytes|The maximum size in bytes of entries to read from BookKeeper. By default, it is 5MB.|5242880| +|dispatcherMinReadBatchSize|The minimum number of entries to read from BookKeeper. By default, it is 1 entry. When an error occurs while reading entries from BookKeeper, the broker backs off the batch size to this minimum number.|1| +|dispatcherMaxRoundRobinBatchSize|The maximum number of entries to dispatch for a shared subscription. By default, it is 20 entries.|20| +| preciseDispatcherFlowControl | Precise dispatcher flow control according to history message number of each entry. | false | +| streamingDispatch | Whether to use streaming read dispatcher. It can be useful when there's a huge backlog to drain and instead of read with micro batch we can streamline the read from bookkeeper to make the most of consumer capacity till we hit bookkeeper read limit or consumer process limit, then we can use consumer flow control to tune the speed. This feature is currently in preview and can be changed in subsequent release. | false | +| maxConcurrentLookupRequest | Maximum number of concurrent lookup request that the broker allows to throttle heavy incoming lookup traffic. | 50000 | +| maxConcurrentTopicLoadRequest | Maximum number of concurrent topic loading request that the broker allows to control the number of zk-operations.
| 5000 | +| maxConcurrentNonPersistentMessagePerConnection | Maximum number of concurrent non-persistent message that can be processed per connection. | 1000 | +| numWorkerThreadsForNonPersistentTopic | Number of worker threads to serve non-persistent topic. | 8 | +| enablePersistentTopics | Enable broker to load persistent topics. | true | +| enableNonPersistentTopics | Enable broker to load non-persistent topics. | true | +| maxSubscriptionsPerTopic | Maximum number of subscriptions allowed to subscribe to a topic. Once this limit reaches, the broker rejects new subscriptions until the number of subscriptions decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxProducersPerTopic | Maximum number of producers allowed to connect to a topic. Once this limit reaches, the broker rejects new producers until the number of connected producers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerTopic | Maximum number of consumers allowed to connect to a topic. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerSubscription | Maximum number of consumers allowed to connect to a subscription. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxNumPartitionsPerPartitionedTopic | Maximum number of partitions per partitioned topic. When the value is set to a negative number or is set to 0, the check is disabled. | 0 | +| metadataStoreBatchingEnabled | Enable metadata operations batching. | true | +| metadataStoreBatchingMaxDelayMillis | Maximum delay to impose on batching grouping. | 5 | +| metadataStoreBatchingMaxOperations | Maximum number of operations to include in a singular batch. 
| 1000 | +| metadataStoreBatchingMaxSizeKb | Maximum size of a batch. | 128 | +| tlsCertRefreshCheckDurationSec | TLS certificate refresh duration in seconds. When the value is set to 0, check the TLS certificate on every new connection. | 300 | +| tlsCertificateFilePath | Path for the TLS certificate file. | | +| tlsKeyFilePath | Path for the TLS private key file. | | +| tlsTrustCertsFilePath | Path for the trusted TLS certificate file.| | +| tlsAllowInsecureConnection | Accept untrusted TLS certificate from the client. If it is set to true, a client with a certificate which cannot be verified with the 'tlsTrustCertsFilePath' certificate is allowed to connect to the server, though the certificate is not used for client authentication. | false | +| tlsProtocols | Specify the TLS protocols the broker uses to negotiate during TLS handshake. | | +| tlsCiphers | Specify the TLS cipher the broker uses to negotiate during TLS Handshake. | | +| tlsRequireTrustedClientCertOnConnect | Trusted client certificates are required to connect via TLS. Reject the Connection if the client certificate is not trusted. In effect, this requires that all connecting clients perform TLS client authentication. | false | +| tlsEnabledWithKeyStore | Enable TLS with KeyStore type configuration in broker. | false | +| tlsProvider | TLS Provider for KeyStore type. | | +| tlsKeyStoreType | TLS KeyStore type configuration in the broker.
  • JKS
  • PKCS12
  • |JKS| +| tlsKeyStore | TLS KeyStore path in the broker. | | +| tlsKeyStorePassword | TLS KeyStore password for the broker. | | +| tlsTrustStoreType | TLS TrustStore type configuration in the broker
  • JKS
  • PKCS12
  • |JKS| +| tlsTrustStore | TLS TrustStore path in the broker. | | +| tlsTrustStorePassword | TLS TrustStore password for the broker. | | +| brokerClientTlsEnabledWithKeyStore | Configure whether the internal client uses the KeyStore type to authenticate with Pulsar brokers. | false | +| brokerClientSslProvider | The TLS Provider used by the internal client to authenticate with other Pulsar brokers. | | +| brokerClientTlsTrustStoreType | TLS TrustStore type configuration for the internal client to authenticate with Pulsar brokers.
  • JKS
  • PKCS12
  • | JKS | +| brokerClientTlsTrustStore | TLS TrustStore path for the internal client to authenticate with Pulsar brokers. | | +| brokerClientTlsTrustStorePassword | TLS TrustStore password for the internal client to authenticate with Pulsar brokers. | | +| brokerClientTlsCiphers | Specify the TLS cipher that the internal client uses to negotiate during TLS Handshake. | | +| brokerClientTlsProtocols | Specify the TLS protocols that the broker uses to negotiate during TLS handshake. | | +| systemTopicEnabled | Enable/Disable system topics. | false | +| topicLevelPoliciesEnabled | Enable or disable topic level policies. Topic level policies depends on the system topic. Please enable the system topic first. | false | +| topicFencingTimeoutSeconds | If a topic remains fenced for a certain time period (in seconds), it is closed forcefully. If set to 0 or a negative number, the fenced topic is not closed. | 0 | +| proxyRoles | Role names that are treated as "proxy roles". If the broker sees a request with role as proxyRoles, it demands to see a valid original principal. | | +|authenticationEnabled| Enable authentication for the broker. |false| +|authenticationProviders| A comma-separated list of class names for authentication providers. |false| +|authorizationEnabled| Enforce authorization in brokers. |false| +| authorizationProvider | Authorization provider fully qualified class-name. | org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider | +| authorizationAllowWildcardsMatching | Allow wildcard matching in authorization. Wildcard matching is applicable only when the wildcard-character (*) presents at the **first** or **last** position. | false | +|superUserRoles| Role names that are treated as “superusers.” Superusers are authorized to perform all admin tasks. | | +|brokerClientAuthenticationPlugin| The authentication settings of the broker itself. Used when the broker connects to other brokers either in the same cluster or from other clusters. 
| | +|brokerClientAuthenticationParameters| The parameters that go along with the plugin specified using brokerClientAuthenticationPlugin. | | +|athenzDomainNames| Supported Athenz authentication provider domain names as a comma-separated list. | | +| anonymousUserRole | When this parameter is not empty, unauthenticated users perform as anonymousUserRole. | | +|tokenSettingPrefix| Configure the prefix of the token related setting like `tokenSecretKey`, `tokenPublicKey`, `tokenAuthClaim`, `tokenPublicAlg`, `tokenAudienceClaim`, and `tokenAudience`. || +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud". It is used to get the audience from token. If it is not set, the audience is not verified. || +| tokenAudience | The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token need contains this parameter.| | +|saslJaasClientAllowedIds|This is a regexp, which limits the range of possible ids which can connect to the Broker using SASL. By default, it is set to `SaslConstants.JAAS_CLIENT_ALLOWED_IDS_DEFAULT`, which is ".*pulsar.*", so only clients whose id contains 'pulsar' are allowed to connect.|N/A| +|saslJaasBrokerSectionName|Service Principal, for login context name. 
By default, it is set to `SaslConstants.JAAS_DEFAULT_BROKER_SECTION_NAME`, which is "Broker".|N/A| +|httpMaxRequestSize|If the value is larger than 0, it rejects all HTTP requests with bodies larger than the configured limit.|-1| +|exposePreciseBacklogInPrometheus| Enable expose the precise backlog stats, set false to use published counter and consumed counter to calculate, this would be more efficient but may be inaccurate. |false| +|bookkeeperMetadataServiceUri|Metadata service uri is what BookKeeper used for loading corresponding metadata driver and resolving its metadata service location. This value can be fetched using `bookkeeper shell whatisinstanceid` command in BookKeeper cluster. For example: `zk+hierarchical://localhost:2181/ledgers`. The metadata service uri list can also be semicolon separated values like: `zk+hierarchical://zk1:2181;zk2:2181;zk3:2181/ledgers`.|N/A| +|bookkeeperClientAuthenticationPlugin| Authentication plugin to be used when connecting to bookies (BookKeeper servers). || +|bookkeeperClientAuthenticationParametersName| BookKeeper authentication plugin implementation parameters and values. || +|bookkeeperClientAuthenticationParameters| Parameters associated with the bookkeeperClientAuthenticationParametersName || +|bookkeeperClientNumWorkerThreads| Number of BookKeeper client worker threads. Default is Runtime.getRuntime().availableProcessors() || +|bookkeeperClientTimeoutInSeconds| Timeout for BookKeeper add and read operations. |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. A value of 0 disables speculative reads. |0| +|bookkeeperUseV2WireProtocol|Use older Bookkeeper wire protocol with bookie.|true| +|bookkeeperClientHealthCheckEnabled| Enable bookie health checks. |true| +|bookkeeperClientHealthCheckIntervalSeconds| The time interval, in seconds, at which health checks are performed. New ledgers are not created during health checks.
|60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval| Error threshold for health checks. |5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds| If bookies have more than the allowed number of failures within the time interval specified by bookkeeperClientHealthCheckIntervalSeconds, they are quarantined for this amount of time (in seconds). |1800| +|bookkeeperClientGetBookieInfoIntervalSeconds|Specify options for the GetBookieInfo check. This setting helps ensure that the list of bookies is up to date on the brokers.|86400| +|bookkeeperClientGetBookieInfoRetryIntervalSeconds|Specify options for the GetBookieInfo check. This setting helps ensure that the list of bookies is up to date on the brokers.|60| +|bookkeeperClientRackawarePolicyEnabled| |true| +|bookkeeperClientRegionawarePolicyEnabled| |false| +|bookkeeperClientMinNumRacksPerWriteQuorum| |2| +|bookkeeperClientEnforceMinNumRacksPerWriteQuorum| |false| +|bookkeeperClientReorderReadSequenceEnabled| |false| +|bookkeeperClientIsolationGroups||| +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. || +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. || +| bookkeeperTLSProviderFactoryClass | Set the client security provider factory class name. | org.apache.bookkeeper.tls.TLSContextFactory | +| bookkeeperTLSClientAuthentication | Enable TLS authentication with bookie. | false | +| bookkeeperTLSKeyFileType | Supported type: PEM, JKS, PKCS12. | PEM | +| bookkeeperTLSTrustCertTypes | Supported type: PEM, JKS, PKCS12. | PEM | +| bookkeeperTLSKeyStorePasswordPath | Path to file containing keystore password, if the client keystore is password protected. | | +| bookkeeperTLSTrustStorePasswordPath | Path to file containing truststore password, if the client truststore is password protected.
| | +| bookkeeperTLSKeyFilePath | Path for the TLS private key file. | | +| bookkeeperTLSCertificateFilePath | Path for the TLS certificate file. | | +| bookkeeperTLSTrustCertsFilePath | Path for the trusted TLS certificate file. | | +| bookkeeperTlsCertFilesRefreshDurationSeconds | Tls cert refresh duration at bookKeeper-client in seconds (0 to disable check). | | +| bookkeeperDiskWeightBasedPlacementEnabled | Enable/Disable disk weight based placement. | false | +| bookkeeperExplicitLacIntervalInMills | Set the interval to check the need for sending an explicit LAC. When the value is set to 0, no explicit LAC is sent. | 0 | +| bookkeeperClientExposeStatsToPrometheus | Expose BookKeeper client managed ledger stats to Prometheus. | false | +|managedLedgerDefaultEnsembleSize| |1| +|managedLedgerDefaultWriteQuorum| |1| +|managedLedgerDefaultAckQuorum| |1| +| managedLedgerDigestType | Default type of checksum to use when writing to BookKeeper. | CRC32C | +| managedLedgerNumSchedulerThreads | Number of threads to be used for managed ledger scheduled tasks. | Runtime.getRuntime().availableProcessors() | +|managedLedgerCacheSizeMB| |N/A| +|managedLedgerCacheCopyEntries| Whether to copy the entry payloads when inserting in cache.| false| +|managedLedgerCacheEvictionWatermark| |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. 
| 1000| +|managedLedgerUnackedRangesOpenCacheSetEnabled| Use Open Range-Set to cache unacknowledged messages |true| +|managedLedgerDefaultMarkDeleteRateLimit| |0.1| +|managedLedgerMaxEntriesPerLedger| |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| |240| +|managedLedgerCursorMaxEntriesPerLedger| |50000| +|managedLedgerCursorRolloverTimeInSeconds| |14400| +| managedLedgerMaxSizePerLedgerMbytes | Maximum ledger size before triggering a rollover for a topic. | 2048 | +| managedLedgerMaxUnackedRangesToPersist | Maximum number of "acknowledgment holes" that are going to be persistently stored. When acknowledging out of order, a consumer leaves holes that are supposed to be quickly filled by acknowledging all the messages. The information of which messages are acknowledged is persisted by compressing in "ranges" of messages that were acknowledged. After the max number of ranges is reached, the information is only tracked in memory and messages are redelivered in case of crashes. | 10000 | +| managedLedgerMaxUnackedRangesToPersistInZooKeeper | Maximum number of "acknowledgment holes" that can be stored in Zookeeper. If the number of unacknowledged message range is higher than this limit, the broker persists unacknowledged ranges into bookkeeper to avoid additional data overhead into Zookeeper. | 1000 | +|autoSkipNonRecoverableData| |false| +| managedLedgerMetadataOperationsTimeoutSeconds | Operation timeout while updating managed-ledger metadata. | 60 | +| managedLedgerReadEntryTimeoutSeconds | Read entries timeout when the broker tries to read messages from BookKeeper. | 0 | +| managedLedgerAddEntryTimeoutSeconds | Add entry timeout when the broker tries to publish message to BookKeeper. | 0 | +| managedLedgerNewEntriesCheckDelayInMillis | New entries check delay for the cursor under the managed ledger. If no new messages in the topic, the cursor tries to check again after the delay time. 
For consumption latency sensitive scenario, you can set the value to a smaller value or 0. Of course, a smaller value may degrade consumption throughput.|10 ms| +| managedLedgerPrometheusStatsLatencyRolloverSeconds | Managed ledger prometheus stats latency rollover seconds. | 60 | +| managedLedgerTraceTaskExecution | Whether to trace managed ledger task execution time. | true | +|managedLedgerNewEntriesCheckDelayInMillis|New entries check delay for the cursor under the managed ledger. If no new messages in the topic, the cursor will try to check again after the delay time. For consumption latency sensitive scenario, it can be set to a smaller value or 0. A smaller value degrades consumption throughput. By default, it is 10ms.|10| +|loadBalancerEnabled| |false| +|loadBalancerPlacementStrategy| |weightedRandomSelection| +|loadBalancerReportUpdateThresholdPercentage| |10| +|loadBalancerReportUpdateMaxIntervalMinutes| |15| +|loadBalancerHostUsageCheckIntervalMinutes| |1| +|loadBalancerSheddingIntervalMinutes| |30| +|loadBalancerSheddingGracePeriodMinutes| |30| +|loadBalancerBrokerMaxTopics| |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| |1| +|loadBalancerBrokerOverloadedThresholdPercentage| |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| |15| +|loadBalancerBrokerComfortLoadLevelPercentage| |65| +|loadBalancerAutoBundleSplitEnabled| |false| +| loadBalancerAutoUnloadSplitBundlesEnabled | Enable/Disable automatic unloading of split bundles. | true | +|loadBalancerNamespaceBundleMaxTopics| |1000| +|loadBalancerNamespaceBundleMaxSessions| Maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered.
    To disable the threshold check, set the value to -1. |1000| +|loadBalancerNamespaceBundleMaxMsgRate| |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| |100| +|loadBalancerNamespaceMaximumBundles| |128| +| loadBalancerBrokerThresholdShedderPercentage | The broker resource usage threshold. When the broker resource usage is greater than the pulsar cluster average resource usage, the threshold shedder is triggered to offload bundles from the broker. It only takes effect in the ThresholdShedder strategy. | 10 | +| loadBalancerMsgRateDifferenceShedderThreshold | Message-rate percentage threshold between highest and least loaded brokers for uniform load shedding. | 50 | +| loadBalancerMsgThroughputMultiplierDifferenceShedderThreshold | Message-throughput threshold between highest and least loaded brokers for uniform load shedding. | 4 | +| loadBalancerHistoryResourcePercentage | The history usage when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 0.9 | +| loadBalancerBandwithInResourceWeight | The BandWithIn usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerBandwithOutResourceWeight | The BandWithOut usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerCPUResourceWeight | The CPU usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerMemoryResourceWeight | The heap memory usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerDirectMemoryResourceWeight | The direct memory usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerBundleUnloadMinThroughputThreshold | Bundle unload minimum throughput threshold. Avoid bundle unload frequently. 
It only takes effect in the ThresholdShedder strategy. | 10 | +| namespaceBundleUnloadingTimeoutMs | Time to wait for the unloading of a namespace bundle in milliseconds. | 60000 | +|replicationMetricsEnabled| |true| +|replicationConnectionsPerBroker| |16| +|replicationProducerQueueSize| |1000| +| replicationPolicyCheckDurationSeconds | Duration to check replication policy to avoid replicator inconsistency due to missing ZooKeeper watch. When the value is set to 0, disable checking replication policy. | 600 | +|defaultRetentionTimeInMinutes| |0| +|defaultRetentionSizeInMB| |0| +|keepAliveIntervalSeconds| |30| +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +|bookieId | If you want to customize a bookie ID or use a dynamic network address for a bookie, you can set the `bookieId`.

    Bookie advertises itself using the `bookieId` rather than the `BookieSocketAddress` (`hostname:port` or `IP:port`).

    The `bookieId` is a non-empty string that can contain ASCII digits and letters ([a-zA-Z0-9]), colons, dashes, and dots.

    For more information about `bookieId`, see [here](http://bookkeeper.apache.org/bps/BP-41-bookieid/).|/| +| maxTopicsPerNamespace | The maximum number of persistent topics that can be created in the namespace. When the number of topics reaches this threshold, the broker rejects the request of creating a new topic, including the auto-created topics by the producer or consumer, until the number of connected consumers decreases. The default value 0 disables the check. | 0 | +| metadataStoreConfigPath | The configuration file path of the local metadata store. See [Configure metadata store](administration-metadata-store.md) for details. |N/A| +|schemaRegistryStorageClassName|The schema storage implementation used by this broker.|org.apache.pulsar.broker.service.schema.BookkeeperSchemaStorageFactory| +|isSchemaValidationEnforced| Whether to enable schema validation, when schema validation is enabled, if a producer without a schema attempts to produce the message to a topic with schema, the producer is rejected and disconnected.|false| +|isAllowAutoUpdateSchemaEnabled|Allow schema to be auto updated at broker level.|true| +|schemaCompatibilityStrategy| The schema compatibility strategy at broker level, see [here](schema-evolution-compatibility.md#schema-compatibility-check-strategy) for available values.|FULL| +|systemTopicSchemaCompatibilityStrategy| The schema compatibility strategy is used for system topics, see [here](schema-evolution-compatibility.md#schema-compatibility-check-strategy) for available values.|ALWAYS_COMPATIBLE| + +#### Deprecated parameters of standalone Pulsar +The following parameters have been deprecated in the `conf/standalone.conf` file. + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| The quorum connection string for local metadata store. Use `metadataStoreUrl` instead. |N/A| +|configurationStoreServers| Configuration store connection string (as a comma-separated list). Use `configurationMetadataStoreUrl` instead. 
|N/A| +|zooKeeperOperationTimeoutSeconds|ZooKeeper operation timeout in seconds. Use `metadataStoreOperationTimeoutSeconds` instead. |30| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds. Use `metadataStoreCacheExpirySeconds` instead. |300| +|zooKeeperSessionTimeoutMillis| The ZooKeeper session timeout, in milliseconds. Use `metadataStoreSessionTimeoutMillis` instead. |30000| + +## WebSocket + +|Name|Description|Default| +|---|---|---| +|configurationMetadataStoreUrl |Configuration store connection string. |N/A| +|metadataStoreSessionTimeoutMillis|Metadata store session timeout in milliseconds. |30000| +|metadataStoreCacheExpirySeconds|Metadata store cache expiry time in seconds|300| +|serviceUrl||| +|serviceUrlTls||| +|brokerServiceUrl||| +|brokerServiceUrlTls||| +|webServicePort||8080| +|webServicePortTls||8443| +|bindAddress||0.0.0.0| +|clusterName ||| +|authenticationEnabled||false| +|authenticationProviders||| +|authorizationEnabled||false| +|superUserRoles ||| +|brokerClientAuthenticationPlugin||| +|brokerClientAuthenticationParameters||| +|tlsEnabled||false| +|tlsAllowInsecureConnection||false| +|tlsCertificateFilePath||| +|tlsKeyFilePath ||| +|tlsTrustCertsFilePath||| + +#### Configuration Override For Clients Internal to WebSocket + +In 2.10.1 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the broker's Pulsar Clients. These configurations are applied after the hard-coded configuration and before the `brokerClient` configurations named above.| + +#### Deprecated parameters of WebSocket +The following parameters have been deprecated in the `conf/websocket.conf` file. + +|Name|Description|Default| +|---|---|---| +|zooKeeperSessionTimeoutMillis|The ZooKeeper session timeout in milliseconds. Use `metadataStoreSessionTimeoutMillis` instead. |30000| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds.
Use `metadataStoreCacheExpirySeconds` instead.|300| +|configurationStoreServers| Configuration Store connection string. Use `configurationMetadataStoreUrl` instead.|N/A| + +## Pulsar proxy + +The [Pulsar proxy](concepts-architecture-overview.md#pulsar-proxy) can be configured in the `conf/proxy.conf` file. + + +|Name|Description|Default| +|---|---|---| +|forwardAuthorizationCredentials| Forward client authorization credentials to Broker for re-authorization, and make sure authentication is enabled for this to take effect. |false| +|metadataStoreUrl| Metadata store quorum connection string (as a comma-separated list) || +|configurationMetadataStoreUrl| Configuration store connection string (as a comma-separated list) || +| brokerServiceURL | The service URL pointing to the broker cluster. Must begin with `pulsar://`. | | +| brokerServiceURLTLS | The TLS service URL pointing to the broker cluster. Must begin with `pulsar+ssl://`. | | +| brokerWebServiceURL | The Web service URL pointing to the broker cluster | | +| brokerWebServiceURLTLS | The TLS Web service URL pointing to the broker cluster | | +| functionWorkerWebServiceURL | The Web service URL pointing to the function worker cluster. It is only configured when you setup function workers in a separate cluster. | | +| functionWorkerWebServiceURLTLS | The TLS Web service URL pointing to the function worker cluster. It is only configured when you setup function workers in a separate cluster. | | +|metadataStoreSessionTimeoutMillis| Metadata store session timeout (in milliseconds) |30000| +|metadataStoreCacheExpirySeconds|Metadata store cache expiry time in seconds|300| +|advertisedAddress|Hostname or IP address the service advertises to the outside world. 
If not set, the value of `InetAddress.getLocalHost().getHostname()` is used.|N/A| +|servicePort| The port to use for server binary Protobuf requests |6650| +|servicePortTls| The port to use to server binary Protobuf TLS requests |6651| +|statusFilePath| Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks || +| proxyLogLevel | Proxy log level
  • 0: Do not log any TCP channel information.
  • 1: Parse and log any TCP channel information and command information without message body.
  • 2: Parse and log channel information, command information and message body.
  • | 0 | +|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false| +|authenticateMetricsEndpoint| Whether the '/metrics' endpoint requires authentication. Defaults to true. 'authenticationEnabled' must also be set for this to take effect. |true| +|authenticationProviders| Authentication provider name list (a comma-separated list of class names) || +|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false| +|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider| +| anonymousUserRole | When this parameter is not empty, unauthenticated users perform as anonymousUserRole. | | +|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers || +|superUserRoles| Role names that are treated as “super-users,” meaning that they will be able to perform all admin || +|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy will reject requests beyond that. |10000| +|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy will error out requests beyond that. |50000| +|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers. |false| +| tlsCertRefreshCheckDurationSec | TLS certificate refresh duration in seconds. If the value is set 0, check TLS certificate every new connection. 
| 300 | +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file || +|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false| +|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate isn’t trusted. |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.3```, ```TLSv1.2``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +| httpReverseProxyConfigs | HTTP directs to redirect to non-pulsar services | | +| httpOutputBufferSize | HTTP output buffer size. The amount of data that will be buffered for HTTP requests before it is flushed to the channel. A larger buffer size may result in higher HTTP throughput though it may take longer for the client to see data. If using HTTP streaming via the reverse proxy, this should be set to the minimum value (1) so that clients see the data as soon as possible. | 32768 | +| httpNumThreads | Number of threads to use for HTTP requests processing| 2 * Runtime.getRuntime().availableProcessors() | +|tokenSettingPrefix| Configure the prefix of the token related setting like `tokenSecretKey`, `tokenPublicKey`, `tokenAuthClaim`, `tokenPublicAlg`, `tokenAudienceClaim`, and `tokenAudience`. || +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. 
Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud". It is used to get the audience from token. If it is not set, the audience is not verified. || +| tokenAudience | The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token need contains this parameter.| | +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +| numIOThreads | Number of threads used for Netty IO.
    **Note:** This configuration is only available for 2.10.1 and later versions. | 2 * Runtime.getRuntime().availableProcessors() | +| numAcceptorThreads | Number of threads used for Netty Acceptor.
    **Note:** This configuration is only available for 2.10.1 and later versions. | 1 | + +#### Configuration Override For Clients Internal to Proxy + +In 2.10.1 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the proxy's Pulsar Clients. These configurations are applied after the hard-coded configuration and before the brokerClient configurations named above.| + +#### Deprecated parameters of Pulsar proxy +The following parameters have been deprecated in the `conf/proxy.conf` file. + +|Name|Description|Default| +|---|---|---| +|tlsEnabledInProxy| Deprecated - use `servicePortTls` and `webServicePortTls` instead. |false| +|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds). Use `metadataStoreSessionTimeoutMillis` instead. |30000| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds. Use `metadataStoreCacheExpirySeconds` instead.|300| + +## ZooKeeper + +ZooKeeper handles a broad range of essential configuration- and coordination-related tasks for Pulsar. The default configuration file for ZooKeeper is in the `conf/zookeeper.conf` file in your Pulsar installation. The following parameters are available: + + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. 
|5| +|dataDir| The location where ZooKeeper will store in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server will listen for connections. |2181| +|admin.enableServer|Whether to enable the ZooKeeper admin server.|true| +|admin.serverPort|The port at which the admin listens.|9990| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3| +|autopurge.purgeInterval| The time interval, in hours, by which the ZooKeeper database purge task is triggered. Setting to a non-zero number will enable auto purge; setting to 0 will disable. Read this guide before enabling auto purge. |1| +|forceSync|Requires updates to be synced to media of the transaction log before finishing processing the update. If this option is set to 'no', ZooKeeper will not require updates to be synced to the media. WARNING: it's not recommended to run a production ZK cluster with `forceSync` disabled.|yes| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + + + +In addition to the parameters in the table above, configuring ZooKeeper for Pulsar involves adding +a `server.N` line to the `conf/zookeeper.conf` file for each node in the ZooKeeper cluster, where `N` is the number of the ZooKeeper node. 
Here's an example for a three-node ZooKeeper cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +> We strongly recommend consulting the [ZooKeeper Administrator's Guide](https://zookeeper.apache.org/doc/current/zookeeperAdmin.html) for a more thorough and comprehensive introduction to ZooKeeper configuration \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/reference-connector-admin.md b/site2/website/versioned_docs/version-2.10.x/reference-connector-admin.md new file mode 100644 index 0000000000000..2a7c1d82adba2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/reference-connector-admin.md @@ -0,0 +1,12 @@ +--- +id: reference-connector-admin +title: Connector Admin CLI +sidebar_label: "Connector Admin CLI" +original_id: reference-connector-admin +--- + +> **Important** +> +> For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more information, see [Pulsar admin doc](/tools/pulsar-admin/). +> + diff --git a/site2/website/versioned_docs/version-2.10.x/reference-metrics.md b/site2/website/versioned_docs/version-2.10.x/reference-metrics.md new file mode 100644 index 0000000000000..c0c67c3bfd2e0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/reference-metrics.md @@ -0,0 +1,617 @@ +--- +id: reference-metrics +title: Pulsar Metrics +sidebar_label: "Pulsar Metrics" +original_id: reference-metrics +--- + + + +Pulsar exposes the following metrics in Prometheus format. You can monitor your clusters with those metrics. 
+ +* [ZooKeeper](#zookeeper) +* [BookKeeper](#bookkeeper) +* [Broker](#broker) +* [Pulsar Functions](#pulsar-functions) +* [Proxy](#proxy) +* [Pulsar SQL Worker](#pulsar-sql-worker) +* [Pulsar transaction](#pulsar-transaction) + +The following types of metrics are available: + +- [Counter](https://prometheus.io/docs/concepts/metric_types/#counter): a cumulative metric that represents a single monotonically increasing counter. The value increases by default. You can reset the value to zero or restart your cluster. +- [Gauge](https://prometheus.io/docs/concepts/metric_types/#gauge): a metric that represents a single numerical value that can arbitrarily go up and down. +- [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram): a histogram samples observations (usually things like request durations or response sizes) and counts them in configurable buckets. The `_bucket` suffix is the number of observations within a histogram bucket, configured with parameter `{le=""}`. The `_count` suffix is the number of observations, shown as a time series and behaves like a counter. The `_sum` suffix is the sum of observed values, also shown as a time series and behaves like a counter. These suffixes are together denoted by `_*` in this doc. +- [Summary](https://prometheus.io/docs/concepts/metric_types/#summary): similar to a histogram, a summary samples observations (usually things like request durations and response sizes). While it also provides a total count of observations and a sum of all observed values, it calculates configurable quantiles over a sliding time window. + +## ZooKeeper + +The ZooKeeper metrics are exposed under "/metrics" at port `8000`. You can use a different port by configuring the `metricsProvider.httpPort` in conf/zookeeper.conf. + +ZooKeeper provides a New Metrics System since 3.6.0. For more detailed metrics, refer to the [ZooKeeper Monitor Guide](https://zookeeper.apache.org/doc/r3.7.0/zookeeperMonitor.html). 
+ +## BookKeeper + +The BookKeeper metrics are exposed under "/metrics" at port `8000`. You can change the port by updating `prometheusStatsHttpPort` +in the `bookkeeper.conf` configuration file. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| bookie_SERVER_STATUS | Gauge | The server status for bookie server.
    • 1: the bookie is running in writable mode.
    • 0: the bookie is running in readonly mode.
    | +| bookkeeper_server_ADD_ENTRY_count | Counter | The total number of ADD_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_count | Counter | The total number of READ_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_WRITE_BYTES | Counter | The total number of bytes written to the bookie. | +| bookie_READ_BYTES | Counter | The total number of bytes read from the bookie. | +| bookkeeper_server_ADD_ENTRY_REQUEST | Summary | The summary of request latency of ADD_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_REQUEST | Summary | The summary of request latency of READ_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_BookieReadThreadPool_queue_{thread_id}|Gauge|The number of requests to be processed in a read thread queue.| +| bookkeeper_server_BookieReadThreadPool_task_queued|Summary | The waiting time of a task to be processed in a read thread queue. | +| bookkeeper_server_BookieReadThreadPool_task_execution|Summary | The execution time of a task in a read thread queue.| + +### Journal metrics + +| Name | Type | Description | +|---|---|---| +| bookie_journal_JOURNAL_SYNC_count | Counter | The total number of journal fsync operations happening at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_journal_JOURNAL_QUEUE_SIZE | Gauge | The total number of requests pending in the journal queue. | +| bookie_journal_JOURNAL_FORCE_WRITE_QUEUE_SIZE | Gauge | The total number of force write (fsync) requests pending in the force-write queue. | +| bookie_journal_JOURNAL_CB_QUEUE_SIZE | Gauge | The total number of callbacks pending in the callback queue. 
| +| bookie_journal_JOURNAL_ADD_ENTRY | Summary | The summary of request latency of adding entries to the journal. | +| bookie_journal_JOURNAL_SYNC | Summary | The summary of fsync latency of syncing data to the journal disk. | +| bookie_journal_JOURNAL_CREATION_LATENCY| Summary | The latency created by a journal log file. | + +### Storage metrics + +| Name | Type | Description | +|---|---|---| +| bookie_ledgers_count | Gauge | The total number of ledgers stored in the bookie. | +| bookie_entries_count | Gauge | The total number of entries stored in the bookie. | +| bookie_write_cache_size | Gauge | The bookie write cache size (in bytes). | +| bookie_read_cache_size | Gauge | The bookie read cache size (in bytes). | +| bookie_DELETED_LEDGER_COUNT | Counter | The total number of ledgers deleted since the bookie has started. | +| bookie_ledger_writable_dirs | Gauge | The number of writable directories in the bookie. | +| bookie_flush | Gauge| The table flush latency of bookie memory. | +| bookie_throttled_write_requests | Counter | The number of write requests to be throttled. | + +## Broker + +The broker metrics are exposed under "/metrics" at port `8080`. You can change the port by updating `webServicePort` to a different port +in the `broker.conf` configuration file. + +All the metrics exposed by a broker are labelled with `cluster=${pulsar_cluster}`. The name of Pulsar cluster is the value of `${pulsar_cluster}`, which you have configured in the `broker.conf` file. 
+ +The following metrics are available for broker: + +- [ZooKeeper](#zookeeper) + - [Server metrics](#server-metrics) + - [Request metrics](#request-metrics) +- [BookKeeper](#bookkeeper) + - [Server metrics](#server-metrics-1) + - [Journal metrics](#journal-metrics) + - [Storage metrics](#storage-metrics) +- [Broker](#broker) + - [Namespace metrics](#namespace-metrics) + - [Replication metrics](#replication-metrics) + - [Topic metrics](#topic-metrics) + - [Replication metrics](#replication-metrics-1) + - [ManagedLedgerCache metrics](#managedledgercache-metrics) + - [ManagedLedger metrics](#managedledger-metrics) + - [LoadBalancing metrics](#loadbalancing-metrics) + - [BundleUnloading metrics](#bundleunloading-metrics) + - [BundleSplit metrics](#bundlesplit-metrics) + - [Subscription metrics](#subscription-metrics) + - [Consumer metrics](#consumer-metrics) + - [Managed ledger bookie client metrics](#managed-ledger-bookie-client-metrics) + - [Token metrics](#token-metrics) + - [Authentication metrics](#authentication-metrics) + - [Connection metrics](#connection-metrics) + - [Jetty metrics](#jetty-metrics) +- [Pulsar Functions](#pulsar-functions) +- [Proxy](#proxy) +- [Pulsar SQL Worker](#pulsar-sql-worker) +- [Pulsar transaction](#pulsar-transaction) + +### BookKeeper client metrics + +All the BookKeeper client metric are labelled with the following label: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. + +| Name | Type | Description | +|---|---|---| +| pulsar_managedLedger_client_bookkeeper_client_BOOKIE_QUARANTINE | Counter | The number of bookie clients to be quarantined.

    If you want to expose this metric, set `bookkeeperClientExposeStatsToPrometheus` to `true` in the `broker.conf` file.| + +### Namespace metrics + +> Namespace metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `false`. + +All the namespace metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_topics_count | Gauge | The number of Pulsar topics of the namespace owned by this broker. | +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the namespace served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the namespace connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the namespace connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the namespace coming into this broker (messages/second). | +| pulsar_rate_out | Gauge | The total message rate of the namespace going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the namespace coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the namespace going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this namespace owned by this broker (bytes). | +| pulsar_storage_logical_size | Gauge | The storage size of topics in the namespace owned by the broker without replicas (in bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this namespace owned by this broker (messages). 
| +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this namespace offloaded to the tiered storage (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this namespace (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this namespace (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total number of message batches (entries) that are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a namespace for which the storage write latency is smaller than a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a namespace for which the entry size is smaller than a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | + +#### Replication metrics + +If a namespace is configured to be replicated among multiple Pulsar clusters, the corresponding replication metrics is also exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics are also labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the namespace replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the namespace replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the namespace replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the namespace replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the namespace replicating to remote cluster (messages). | +| pulsar_replication_rate_expired | Gauge | Total rate of messages expired (messages/second). | +| pulsar_replication_connected_count | Gauge | The count of replication-subscriber up and running to replicate to remote cluster. | +| pulsar_replication_delay_in_seconds | Gauge | Time in seconds from the time a message was produced to the time when it is about to be replicated. | + + +### Topic metrics + +> Topic metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `true`. + +All the topic metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. 
+ +| Name | Type | Description | +|---|---|---| +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the topic served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the topic connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the topic connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the topic coming into this broker (messages/second). | +| pulsar_rate_out | Gauge | The total message rate of the topic going out from this broker (messages/second). | +| pulsar_publish_rate_limit_times | Gauge | The number of times the publish rate limit is triggered. | +| pulsar_throughput_in | Gauge | The total throughput of the topic coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the topic going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this topic owned by this broker (bytes). | +| pulsar_storage_logical_size | Gauge | The storage size of topics in the namespace owned by the broker without replicas (in bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this topic owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this topic offloaded to the tiered storage (bytes). | +| pulsar_storage_backlog_quota_limit | Gauge | The total amount of the data in this topic that limit the backlog quota (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this topic (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this topic (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) are delayed for dispatching. 
| +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a topic for which the storage write latency is smaller than a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a topic for which the entry size is smaller than a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | +| pulsar_in_bytes_total | Counter | The total size, in bytes, of the messages received for this topic. | +| pulsar_in_messages_total | Counter | The total number of messages received for this topic. | +| pulsar_out_bytes_total | Counter | The total size, in bytes, of the messages read from this topic. | +| pulsar_out_messages_total | Counter | The total number of messages read from this topic. | +| pulsar_compaction_removed_event_count | Gauge | The total number of removed events of the compaction. | +| pulsar_compaction_succeed_count | Gauge | The total number of successes of the compaction. | +| pulsar_compaction_failed_count | Gauge | The total number of failures of the compaction. | +| pulsar_compaction_duration_time_in_mills | Gauge | The duration time of the compaction. | +| pulsar_compaction_read_throughput | Gauge | The read throughput of the compaction. | +| pulsar_compaction_write_throughput | Gauge | The write throughput of the compaction. | +| pulsar_compaction_latency_le_* | Histogram | The compaction latency within a given threshold.
    Available thresholds:
    • pulsar_compaction_latency_le_0_5: <= 0.5ms
    • pulsar_compaction_latency_le_1: <= 1ms
    • pulsar_compaction_latency_le_5: <= 5ms
    • pulsar_compaction_latency_le_10: <= 10ms
    • pulsar_compaction_latency_le_20: <= 20ms
    • pulsar_compaction_latency_le_50: <= 50ms
    • pulsar_compaction_latency_le_100: <= 100ms
    • pulsar_compaction_latency_le_200: <= 200ms
    • pulsar_compaction_latency_le_1000: <= 1s
    • pulsar_compaction_latency_le_overflow: > 1s
    | +| pulsar_compaction_compacted_entries_count | Gauge | The total number of the compacted entries. | +| pulsar_compaction_compacted_entries_size |Gauge | The total size of the compacted entries. | + +#### Replication metrics + +If a namespace that a topic belongs to is configured to be replicated among multiple Pulsar clusters, the corresponding replication metrics is also exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics are labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the topic replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the topic replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the topic replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the topic replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the topic replicating to remote cluster (messages). | + +#### Topic lookup metrics + +| Name | Type | Description | +|---|---|---| +| pulsar_broker_load_manager_bundle_assignment | Gauge | The summary of latency of bundles ownership operations. | +| pulsar_broker_lookup | Gauge | The latency of all lookup operations. | +| pulsar_broker_lookup_redirects | Gauge | The number of lookup redirected requests. | +| pulsar_broker_lookup_answers | Gauge | The number of lookup responses (i.e. not redirected requests). | +| pulsar_broker_lookup_failures | Gauge | The number of lookup failures. | +| pulsar_broker_lookup_pending_requests | Gauge | The number of pending lookups in broker. When it is up to the threshold, new requests are rejected. | +| pulsar_broker_topic_load_pending_requests | Gauge | The load of pending topic operations. 
| + +### ManagedLedgerCache metrics +All the ManagedLedgerCache metrics are labelled with the following label: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_ml_cache_evictions | Gauge | The number of cache evictions during the last minute. | +| pulsar_ml_cache_hits_rate | Gauge | The number of cache hits per second on the broker side. | +| pulsar_ml_cache_hits_throughput | Gauge | The amount of data retrieved from the cache on the broker side (in bytes/s). | +| pulsar_ml_cache_misses_rate | Gauge | The number of cache misses per second on the broker side. | +| pulsar_ml_cache_misses_throughput | Gauge | The amount of data not retrieved from the cache on the broker side (in bytes/s). | +| pulsar_ml_cache_pool_active_allocations | Gauge | The number of currently active allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_huge | Gauge | The number of currently active huge allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_normal | Gauge | The number of currently active normal allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_small | Gauge | The number of currently active small allocations in direct arena | +| pulsar_ml_cache_pool_allocated | Gauge | The total allocated memory of chunk lists in direct arena | +| pulsar_ml_cache_pool_used | Gauge | The total used memory of chunk lists in direct arena | +| pulsar_ml_cache_used_size | Gauge | The size in bytes used to store the entries payloads | +| pulsar_ml_count | Gauge | The number of currently opened managed ledgers | + +### ManagedLedger metrics +All the managedLedger metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- namespace: namespace=${pulsar_namespace}. 
${pulsar_namespace} is the namespace name. +- quantile: quantile=${quantile}. Quantile is only for `Histogram` type metric, and represents the threshold for given Buckets. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_ml_AddEntryBytesRate | Gauge | The bytes/s rate of messages added | +| pulsar_ml_AddEntryWithReplicasBytesRate | Gauge | The bytes/s rate of messages added with replicas | +| pulsar_ml_AddEntryErrors | Gauge | The number of addEntry requests that failed | +| pulsar_ml_AddEntryLatencyBuckets | Histogram | The latency of adding a ledger entry with a given quantile (threshold), including time spent on waiting in queue on the broker side.
    Available quantile:
    • quantile="0.0_0.5" is AddEntryLatency between (0.0ms, 0.5ms]
    • quantile="0.5_1.0" is AddEntryLatency between (0.5ms, 1.0ms]
    • quantile="1.0_5.0" is AddEntryLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is AddEntryLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is AddEntryLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is AddEntryLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is AddEntryLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is AddEntryLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is AddEntryLatency between (200ms, 1s]
    | +| pulsar_ml_AddEntryLatencyBuckets_OVERFLOW | Gauge | The number of times the AddEntryLatency is longer than 1 second | +| pulsar_ml_AddEntryMessagesRate | Gauge | The msg/s rate of messages added | +| pulsar_ml_AddEntrySucceed | Gauge | The number of addEntry requests that succeeded | +| pulsar_ml_EntrySizeBuckets | Histogram | The added entry size of a ledger with a given quantile.
    Available quantile:
    • quantile="0.0_128.0" is EntrySize between (0byte, 128byte]
    • quantile="128.0_512.0" is EntrySize between (128byte, 512byte]
    • quantile="512.0_1024.0" is EntrySize between (512byte, 1KB]
    • quantile="1024.0_2048.0" is EntrySize between (1KB, 2KB]
    • quantile="2048.0_4096.0" is EntrySize between (2KB, 4KB]
    • quantile="4096.0_16384.0" is EntrySize between (4KB, 16KB]
    • quantile="16384.0_102400.0" is EntrySize between (16KB, 100KB]
    • quantile="102400.0_1232896.0" is EntrySize between (100KB, 1MB]
    | +| pulsar_ml_EntrySizeBuckets_OVERFLOW |Gauge | The number of times the EntrySize is larger than 1MB | +| pulsar_ml_LedgerSwitchLatencyBuckets | Histogram | The ledger switch latency with a given quantile.
    Available quantile:
    • quantile="0.0_0.5" is LedgerSwitchLatency between (0ms, 0.5ms]
    • quantile="0.5_1.0" is LedgerSwitchLatency between (0.5ms, 1ms]
    • quantile="1.0_5.0" is LedgerSwitchLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is LedgerSwitchLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is LedgerSwitchLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is LedgerSwitchLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is LedgerSwitchLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is LedgerSwitchLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is LedgerSwitchLatency between (200ms, 1000ms]
    | +| pulsar_ml_LedgerSwitchLatencyBuckets_OVERFLOW | Gauge | The number of times the ledger switch latency is longer than 1 second | +| pulsar_ml_LedgerAddEntryLatencyBuckets | Histogram | The latency for bookie client to persist a ledger entry from broker to BookKeeper service with a given quantile (threshold).
    Available quantile:
    • quantile="0.0_0.5" is LedgerAddEntryLatency between (0.0ms, 0.5ms]
    • quantile="0.5_1.0" is LedgerAddEntryLatency between (0.5ms, 1.0ms]
    • quantile="1.0_5.0" is LedgerAddEntryLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is LedgerAddEntryLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is LedgerAddEntryLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is LedgerAddEntryLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is LedgerAddEntryLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is LedgerAddEntryLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is LedgerAddEntryLatency between (200ms, 1s]
    | +| pulsar_ml_LedgerAddEntryLatencyBuckets_OVERFLOW | Gauge | The number of times the LedgerAddEntryLatency is longer than 1 second | +| pulsar_ml_MarkDeleteRate | Gauge | The rate of mark-delete ops/s | +| pulsar_ml_NumberOfMessagesInBacklog | Gauge | The number of backlog messages for all the consumers | +| pulsar_ml_ReadEntriesBytesRate | Gauge | The bytes/s rate of messages read | +| pulsar_ml_ReadEntriesErrors | Gauge | The number of readEntries requests that failed | +| pulsar_ml_ReadEntriesRate | Gauge | The msg/s rate of messages read | +| pulsar_ml_ReadEntriesSucceeded | Gauge | The number of readEntries requests that succeeded | +| pulsar_ml_StoredMessagesSize | Gauge | The total size of the messages in active ledgers (accounting for the multiple copies stored) | + +### Managed cursor acknowledgment state + +The acknowledgment state is persisted to the ledger first. When the acknowledgment state fails to be persisted to the ledger, it is persisted to ZooKeeper. To track the stats of acknowledgment, you can configure the metrics for the managed cursor. + +All the cursor acknowledgment state metrics are labelled with the following labels: + +- namespace: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +- ledger_name: `ledger_name=${pulsar_ledger_name}`. `${pulsar_ledger_name}` is the ledger name. + +- cursor_name: `cursor_name=${pulsar_cursor_name}`. `${pulsar_cursor_name}` is the cursor name. + +Name |Type |Description +|---|---|--- +brk_ml_cursor_persistLedgerSucceed|Gauge|The number of acknowledgment states that is persistent to a ledger.| +brk_ml_cursor_persistLedgerErrors|Gauge|The number of ledger errors occurred when acknowledgment states fail to be persistent to the ledger.| +brk_ml_cursor_persistZookeeperSucceed|Gauge|The number of acknowledgment states that is persistent to ZooKeeper.
+brk_ml_cursor_persistZookeeperErrors|Gauge|The number of ledger errors occurred when acknowledgment states fail to be persistent to ZooKeeper. +brk_ml_cursor_nonContiguousDeletedMessagesRange|Gauge|The number of non-contiguous deleted messages ranges. +brk_ml_cursor_writeLedgerSize|Gauge|The size of write to ledger. +brk_ml_cursor_writeLedgerLogicalSize|Gauge|The size of write to ledger (not accounting for replicas). +brk_ml_cursor_readLedgerSize|Gauge|The size of read from ledger. + +### LoadBalancing metrics +All the loadbalancing metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- broker: broker=${broker}. ${broker} is the IP address of the broker +- metric: metric="loadBalancing". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_bandwidth_in_usage | Gauge | The broker inbound bandwidth usage (in percent). | +| pulsar_lb_bandwidth_out_usage | Gauge | The broker outbound bandwidth usage (in percent). | +| pulsar_lb_cpu_usage | Gauge | The broker CPU usage (in percent). | +| pulsar_lb_directMemory_usage | Gauge | The broker process direct memory usage (in percent). | +| pulsar_lb_memory_usage | Gauge | The broker process memory usage (in percent). | + +#### BundleUnloading metrics +All the bundleUnloading metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- metric: metric="bundleUnloading". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_unload_broker_count | Counter | Unload broker count in this bundle unloading | +| pulsar_lb_unload_bundle_count | Counter | Bundle unload count in this bundle unloading | + +#### BundleSplit metrics +All the bundleSplit metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}.
${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- metric: metric="bundlesSplit". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_bundles_split_count | Counter | bundle split count in this bundle splitting check interval | + +#### Bundle metrics +All the bundle metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- broker: broker=${broker}. ${broker} is the IP address of the broker +- bundle: bundle=${bundle}. ${bundle} is the bundle range on this broker +- metric: metric="bundle". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_bundle_msg_rate_in | Gauge | The total message rate coming into the topics in this bundle (messages/second). | +| pulsar_bundle_msg_rate_out | Gauge | The total message rate going out from the topics in this bundle (messages/second). | +| pulsar_bundle_topics_count | Gauge | The topic count in this bundle. | +| pulsar_bundle_consumer_count | Gauge | The consumer count of the topics in this bundle. | +| pulsar_bundle_producer_count | Gauge | The producer count of the topics in this bundle. | +| pulsar_bundle_msg_throughput_in | Gauge | The total throughput coming into the topics in this bundle (bytes/second). | +| pulsar_bundle_msg_throughput_out | Gauge | The total throughput going out from the topics in this bundle (bytes/second). | + +### Subscription metrics + +> Subscription metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `true`. + +All the subscription metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. 
+- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. + +| Name | Type | Description | +|---|---|---| +| pulsar_subscription_back_log | Gauge | The total backlog of a subscription (entries). | +| pulsar_subscription_back_log_no_delayed | Gauge | The backlog of a subscription that do not contain the delay messages (entries). | +| pulsar_subscription_delayed | Gauge | The total number of messages are delayed to be dispatched for a subscription (messages). | +| pulsar_subscription_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_subscription_unacked_messages | Gauge | The total number of unacknowledged messages of a subscription (messages). | +| pulsar_subscription_blocked_on_unacked_messages | Gauge | Indicate whether a subscription is blocked on unacknowledged messages or not.
    • 1 means the subscription is blocked waiting for unacknowledged messages to be acked.
    • 0 means the subscription is not blocked waiting for unacknowledged messages to be acked.
    | +| pulsar_subscription_msg_rate_out | Gauge | The total message dispatch rate for a subscription (messages/second). | +| pulsar_subscription_msg_throughput_out | Gauge | The total message dispatch throughput for a subscription (bytes/second). | + +### Consumer metrics + +> Consumer metrics are only exposed when both `exposeTopicLevelMetricsInPrometheus` and `exposeConsumerLevelMetricsInPrometheus` are set to `true`. + +All the consumer metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. +- *consumer_name*: `consumer_name=${consumer_name}`. `${consumer_name}` is the topic consumer name. +- *consumer_id*: `consumer_id=${consumer_id}`. `${consumer_id}` is the topic consumer id. + +| Name | Type | Description | +|---|---|---| +| pulsar_consumer_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_consumer_unacked_messages | Gauge | The total number of unacknowledged messages of a consumer (messages). | +| pulsar_consumer_blocked_on_unacked_messages | Gauge | Indicate whether a consumer is blocked on unacknowledged messages or not.
    • 1 means the consumer is blocked waiting for unacknowledged messages to be acked.
    • 0 means the consumer is not blocked waiting for unacknowledged messages to be acked.
    | +| pulsar_consumer_msg_rate_out | Gauge | The total message dispatch rate for a consumer (messages/second). | +| pulsar_consumer_msg_throughput_out | Gauge | The total message dispatch throughput for a consumer (bytes/second). | +| pulsar_consumer_available_permits | Gauge | The available permits for a consumer. | + +### Managed ledger bookie client metrics + +All the managed ledger bookie client metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_completed_tasks_* | Gauge | The number of tasks the scheduler executor has completed.
    The number of metrics is determined by the scheduler executor's thread number configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_queue_* | Gauge | The number of tasks queued in the scheduler executor's queue.
    The number of metrics is determined by the scheduler executor's thread number configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_total_tasks_* | Gauge | The total number of tasks the scheduler executor received.
    The number of metrics is determined by the scheduler executor's thread number configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_task_execution | Summary | The scheduler task execution latency calculated in milliseconds. | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_task_queued | Summary | The scheduler task queued latency calculated in milliseconds. | + +### Token metrics + +All the token metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +|---|---|---| +| pulsar_expired_token_count | Counter | The number of expired tokens in Pulsar. | +| pulsar_expiring_token_minutes | Histogram | The remaining time of expiring tokens in minutes. | + +### Authentication metrics + +All the authentication metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *provider_name*: `provider_name=${provider_name}`. `${provider_name}` is the class name of the authentication provider. +- *auth_method*: `auth_method=${auth_method}`. `${auth_method}` is the authentication method of the authentication provider. +- *reason*: `reason=${reason}`. `${reason}` is the reason for failing authentication operation. (This label is only for `pulsar_authentication_failures_count`.) + +| Name | Type | Description | +|---|---|---| +| pulsar_authentication_success_count| Counter | The number of successful authentication operations. | +| pulsar_authentication_failures_count | Counter | The number of failing authentication operations. | + +### Connection metrics + +All the connection metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *broker*: `broker=${advertised_address}`. 
`${advertised_address}` is the advertised address of the broker. +- *metric*: `metric=${metric}`. `${metric}` is the connection metric collective name. + +| Name | Type | Description | +|---|---|---| +| pulsar_active_connections| Gauge | The number of active connections. | +| pulsar_connection_created_total_count | Gauge | The total number of connections. | +| pulsar_connection_create_success_count | Gauge | The number of successfully created connections. | +| pulsar_connection_create_fail_count | Gauge | The number of failed connections. | +| pulsar_connection_closed_total_count | Gauge | The total number of closed connections. | +| pulsar_broker_throttled_connections | Gauge | The number of throttled connections. | +| pulsar_broker_throttled_connections_global_limit | Gauge | The number of throttled connections because of per-connection limit. | + +### Jetty metrics + +> For a functions-worker running separately from brokers, its Jetty metrics are only exposed when `includeStandardPrometheusMetrics` is set to `true`. + +All the jetty metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +|---|---|---| +| jetty_requests_total | Counter | Number of requests. | +| jetty_requests_active | Gauge | Number of requests currently active. | +| jetty_requests_active_max | Gauge | Maximum number of requests that have been active at once. | +| jetty_request_time_max_seconds | Gauge | Maximum time spent handling requests. | +| jetty_request_time_seconds_total | Counter | Total time spent in all request handling. | +| jetty_dispatched_total | Counter | Number of dispatches. | +| jetty_dispatched_active | Gauge | Number of dispatches currently active. | +| jetty_dispatched_active_max | Gauge | Maximum number of active dispatches being handled. 
| +| jetty_dispatched_time_max | Gauge | Maximum time spent in dispatch handling. | +| jetty_dispatched_time_seconds_total | Counter | Total time spent in dispatch handling. | +| jetty_async_requests_total | Counter | Total number of async requests. | +| jetty_async_requests_waiting | Gauge | Currently waiting async requests. | +| jetty_async_requests_waiting_max | Gauge | Maximum number of waiting async requests. | +| jetty_async_dispatches_total | Counter | Number of requests that have been asynchronously dispatched. | +| jetty_expires_total | Counter | Number of async requests that have expired. | +| jetty_responses_total | Counter | Number of responses, labeled by status code. The `code` label can be "1xx", "2xx", "3xx", "4xx", or "5xx". | +| jetty_stats_seconds | Gauge | Time in seconds stats have been collected for. | +| jetty_responses_bytes_total | Counter | Total number of bytes across all responses. | + +## Pulsar Functions + +All the Pulsar Functions metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_function_processed_successfully_total | Counter | The total number of messages processed successfully. | +| pulsar_function_processed_successfully_total_1min | Counter | The total number of messages processed successfully in the last 1 minute. | +| pulsar_function_system_exceptions_total | Counter | The total number of system exceptions. | +| pulsar_function_system_exceptions_total_1min | Counter | The total number of system exceptions in the last 1 minute. | +| pulsar_function_user_exceptions_total | Counter | The total number of user exceptions. | +| pulsar_function_user_exceptions_total_1min | Counter | The total number of user exceptions in the last 1 minute.
| +| pulsar_function_process_latency_ms | Summary | The process latency in milliseconds. | +| pulsar_function_process_latency_ms_1min | Summary | The process latency in milliseconds in the last 1 minute. | +| pulsar_function_last_invocation | Gauge | The timestamp of the last invocation of the function. | +| pulsar_function_received_total | Counter | The total number of messages received from source. | +| pulsar_function_received_total_1min | Counter | The total number of messages received from source in the last 1 minute. | +pulsar_function_user_metric_ | Summary|The user-defined metrics. + +## Connectors + +All the Pulsar connector metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +Connector metrics contain **source** metrics and **sink** metrics. + +- **Source** metrics + + | Name | Type | Description | + |---|---|---| + pulsar_source_written_total|Counter|The total number of records written to a Pulsar topic. + pulsar_source_written_total_1min|Counter|The total number of records written to a Pulsar topic in the last 1 minute. + pulsar_source_received_total|Counter|The total number of records received from source. + pulsar_source_received_total_1min|Counter|The total number of records received from source in the last 1 minute. + pulsar_source_last_invocation|Gauge|The timestamp of the last invocation of the source. + pulsar_source_source_exception|Gauge|The exception from a source. + pulsar_source_source_exceptions_total|Counter|The total number of source exceptions. + pulsar_source_source_exceptions_total_1min |Counter|The total number of source exceptions in the last 1 minute. + pulsar_source_system_exception|Gauge|The exception from system code. + pulsar_source_system_exceptions_total|Counter|The total number of system exceptions. 
+ pulsar_source_system_exceptions_total_1min|Counter|The total number of system exceptions in the last 1 minute. + pulsar_source_user_metric_ | Summary|The user-defined metrics. + +- **Sink** metrics + + | Name | Type | Description | + |---|---|---| + pulsar_sink_written_total|Counter| The total number of records processed by a sink. + pulsar_sink_written_total_1min|Counter| The total number of records processed by a sink in the last 1 minute. + pulsar_sink_received_total_1min|Counter| The total number of messages that a sink has received from Pulsar topics in the last 1 minute. + pulsar_sink_received_total|Counter| The total number of records that a sink has received from Pulsar topics. + pulsar_sink_last_invocation|Gauge|The timestamp of the last invocation of the sink. + pulsar_sink_sink_exception|Gauge|The exception from a sink. + pulsar_sink_sink_exceptions_total|Counter|The total number of sink exceptions. + pulsar_sink_sink_exceptions_total_1min |Counter|The total number of sink exceptions in the last 1 minute. + pulsar_sink_system_exception|Gauge|The exception from system code. + pulsar_sink_system_exceptions_total|Counter|The total number of system exceptions. + pulsar_sink_system_exceptions_total_1min|Counter|The total number of system exceptions in the last 1 minute. + pulsar_sink_user_metric_ | Summary|The user-defined metrics. + +## Proxy + +All the proxy metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *kubernetes_pod_name*: `kubernetes_pod_name=${kubernetes_pod_name}`. `${kubernetes_pod_name}` is the Kubernetes pod name. + +| Name | Type | Description | +|---|---|---| +| pulsar_proxy_active_connections | Gauge | Number of connections currently active in the proxy. | +| pulsar_proxy_new_connections | Counter | Counter of connections being opened in the proxy. 
| +| pulsar_proxy_rejected_connections | Counter | Counter for connections rejected due to throttling. | +| pulsar_proxy_binary_ops | Counter | Counter of proxy operations. | +| pulsar_proxy_binary_bytes | Counter | Counter of proxy bytes. | + +## Pulsar SQL Worker + +| Name | Type | Description | +|---|---|---| +| split_bytes_read | Counter | Number of bytes read from BookKeeper. | +| split_num_messages_deserialized | Counter | Number of messages deserialized. | +| split_num_record_deserialized | Counter | Number of records deserialized. | +| split_bytes_read_per_query | Summary | Total number of bytes read per query. | +| split_entry_deserialize_time | Summary | Time spent on deserializing entries. | +| split_entry_deserialize_time_per_query | Summary | Time spent on deserializing entries per query. | +| split_entry_queue_dequeue_wait_time | Summary | Time spent on waiting to get entry from entry queue because it is empty. | +| split_entry_queue_dequeue_wait_time_per_query | Summary | Total time spent on waiting to get entry from entry queue per query. | +| split_message_queue_dequeue_wait_time_per_query | Summary | Time spent on waiting to dequeue from message queue because it is empty per query. | +| split_message_queue_enqueue_wait_time | Summary | Time spent on waiting for message queue enqueue because the message queue is full. | +| split_message_queue_enqueue_wait_time_per_query | Summary | Time spent on waiting for message queue enqueue because the message queue is full per query. | +| split_num_entries_per_batch | Summary | Number of entries per batch. | +| split_num_entries_per_query | Summary | Number of entries per query. | +| split_num_messages_deserialized_per_entry | Summary | Number of messages deserialized per entry. | +| split_num_messages_deserialized_per_query | Summary | Number of messages deserialized per query. | +| split_read_attempts | Summary | Number of read attempts (fail if queues are full).
| +| split_read_attempts_per_query | Summary | Number of read attempts per query. | +| split_read_latency_per_batch | Summary | Latency of reads per batch. | +| split_read_latency_per_query | Summary | Total read latency per query. | +| split_record_deserialize_time | Summary | Time spent on deserializing message to record. For example, Avro, JSON, and so on. | +| split_record_deserialize_time_per_query | Summary | Time spent on deserializing message to record per query. | +| split_total_execution_time | Summary | The total execution time. | + +## Pulsar transaction + +All the transaction metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *coordinator_id*: `coordinator_id=${coordinator_id}`. `${coordinator_id}` is the coordinator id. + +| Name | Type | Description | +|---|---|---| +| pulsar_txn_active_count | Gauge | Number of active transactions. | +| pulsar_txn_created_count | Counter | Number of created transactions. | +| pulsar_txn_committed_count | Counter | Number of committed transactions. | +| pulsar_txn_aborted_count | Counter | Number of aborted transactions of this coordinator. | +| pulsar_txn_timeout_count | Counter | Number of timeout transactions. | +| pulsar_txn_append_log_count | Counter | Number of append transaction logs. | +| pulsar_txn_execution_latency_le_* | Histogram | Transaction execution latency.
    Available latencies are as below:
    • latency="10" is TransactionExecutionLatency between (0ms, 10ms]
    • latency="20" is TransactionExecutionLatency between (10ms, 20ms]
    • latency="50" is TransactionExecutionLatency between (20ms, 50ms]
    • latency="100" is TransactionExecutionLatency between (50ms, 100ms]
    • latency="500" is TransactionExecutionLatency between (100ms, 500ms]
    • latency="1000" is TransactionExecutionLatency between (500ms, 1000ms]
    • latency="5000" is TransactionExecutionLatency between (1s, 5s]
    • latency="15000" is TransactionExecutionLatency between (5s, 15s]
    • latency="30000" is TransactionExecutionLatency between (15s, 30s]
    • latency="60000" is TransactionExecutionLatency between (30s, 60s]
    • latency="300000" is TransactionExecutionLatency between (1m, 5m]
    • latency="1500000" is TransactionExecutionLatency between (5m, 15m]
    • latency="3000000" is TransactionExecutionLatency between (15m, 30m]
    • latency="overflow" is TransactionExecutionLatency between (30m, ∞)
    | diff --git a/site2/website/versioned_docs/version-2.10.x/reference-pulsar-admin.md b/site2/website/versioned_docs/version-2.10.x/reference-pulsar-admin.md new file mode 100644 index 0000000000000..5ec74a86e432b --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/reference-pulsar-admin.md @@ -0,0 +1,3394 @@ +--- +id: pulsar-admin +title: Pulsar admin CLI +sidebar_label: "Pulsar Admin CLI" +original_id: pulsar-admin +--- + +> **Important** +> +> This page is deprecated and not updated anymore. For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](/tools/pulsar-admin/) + +The `pulsar-admin` tool enables you to manage Pulsar installations, including clusters, brokers, namespaces, tenants, and more. + +Usage + +```bash + +$ pulsar-admin command + +``` + +Commands +* `broker-stats` +* `brokers` +* `clusters` +* `functions` +* `functions-worker` +* `namespaces` +* `ns-isolation-policy` +* `sources` + + For more information, see [here](io-cli.md#sources) +* `sinks` + + For more information, see [here](io-cli.md#sinks) +* `topics` +* `tenants` +* `resource-quotas` +* `schemas` + +## `broker-stats` + +Operations to collect broker statistics + +```bash + +$ pulsar-admin broker-stats subcommand + +``` + +Subcommands +* `allocator-stats` +* `topics(destinations)` +* `mbeans` +* `monitoring-metrics` +* `load-report` + + +### `allocator-stats` + +Dump allocator stats + +Usage + +```bash + +$ pulsar-admin broker-stats allocator-stats allocator-name + +``` + +### `topics(destinations)` + +Dump topic stats + +Usage + +```bash + +$ pulsar-admin broker-stats topics options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + +### `mbeans` + +Dump Mbean stats + +Usage + +```bash + +$ pulsar-admin broker-stats mbeans options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + 
+ +### `monitoring-metrics` + +Dump metrics for monitoring + +Usage + +```bash + +$ pulsar-admin broker-stats monitoring-metrics options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + + +### `load-report` + +Dump broker load-report + +Usage + +```bash + +$ pulsar-admin broker-stats load-report + +``` + +## `brokers` + +Operations about brokers + +```bash + +$ pulsar-admin brokers subcommand + +``` + +Subcommands +* `list` +* `namespaces` +* `update-dynamic-config` +* `list-dynamic-config` +* `get-all-dynamic-config` +* `get-internal-config` +* `get-runtime-config` +* `healthcheck` + +### `list` +List active brokers of the cluster + +Usage + +```bash + +$ pulsar-admin brokers list cluster-name + +``` + +### `leader-broker` +Get the information of the leader broker + +Usage + +```bash + +$ pulsar-admin brokers leader-broker + +``` + +### `namespaces` +List namespaces owned by the broker + +Usage + +```bash + +$ pulsar-admin brokers namespaces cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--url`|The URL for the broker|| + + +### `update-dynamic-config` +Update a broker's dynamic service configuration + +Usage + +```bash + +$ pulsar-admin brokers update-dynamic-config options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| +|`--value`|Value for the configuration parameter value specified using the `--config` flag|| + + +### `list-dynamic-config` +Get list of updatable configuration name + +Usage + +```bash + +$ pulsar-admin brokers list-dynamic-config + +``` + +### `delete-dynamic-config` +Delete dynamic-serviceConfiguration of broker + +Usage + +```bash + +$ pulsar-admin brokers delete-dynamic-config options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| + + +### `get-all-dynamic-config` +Get all overridden dynamic-configuration 
values + +Usage + +```bash + +$ pulsar-admin brokers get-all-dynamic-config + +``` + +### `get-internal-config` +Get internal configuration information + +Usage + +```bash + +$ pulsar-admin brokers get-internal-config + +``` + +### `get-runtime-config` +Get runtime configuration values + +Usage + +```bash + +$ pulsar-admin brokers get-runtime-config + +``` + +### `healthcheck` +Run a health check against the broker + +Usage + +```bash + +$ pulsar-admin brokers healthcheck + +``` + +## `clusters` +Operations about clusters + +Usage + +```bash + +$ pulsar-admin clusters subcommand + +``` + +Subcommands +* `get` +* `create` +* `update` +* `delete` +* `list` +* `update-peer-clusters` +* `get-peer-clusters` +* `get-failure-domain` +* `create-failure-domain` +* `update-failure-domain` +* `delete-failure-domain` +* `list-failure-domains` + + +### `get` +Get the configuration data for the specified cluster + +Usage + +```bash + +$ pulsar-admin clusters get cluster-name + +``` + +### `create` +Provisions a new cluster. This operation requires Pulsar super-user privileges. 
+ +Usage + +```bash + +$ pulsar-admin clusters create cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `update` +Update the configuration for a cluster + +Usage + +```bash + +$ pulsar-admin clusters update cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `delete` +Deletes an existing cluster + +Usage + +```bash + +$ pulsar-admin clusters delete cluster-name + +``` + +### `list` +List the existing clusters + +Usage + +```bash + +$ pulsar-admin clusters list + +``` + +### `update-peer-clusters` +Update peer cluster names + +Usage + +```bash + +$ pulsar-admin clusters update-peer-clusters cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--peer-clusters`|Comma separated peer cluster names (Pass empty string "" to delete list)|| + +### `get-peer-clusters` +Get list of peer clusters + +Usage + +```bash + +$ pulsar-admin clusters get-peer-clusters + +``` + +### `get-failure-domain` +Get the configuration brokers of a failure domain + +Usage + +```bash + +$ pulsar-admin clusters get-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `create-failure-domain` +Create a new failure domain for a cluster (updates it if already created) + +Usage + +```bash + +$ pulsar-admin clusters create-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma 
separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `update-failure-domain` +Update failure domain for a cluster (creates a new one if not exist) + +Usage + +```bash + +$ pulsar-admin clusters update-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `delete-failure-domain` +Delete an existing failure domain + +Usage + +```bash + +$ pulsar-admin clusters delete-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `list-failure-domains` +List the existing failure domains for a cluster + +Usage + +```bash + +$ pulsar-admin clusters list-failure-domains cluster-name + +``` + +## `functions` + +A command-line interface for Pulsar Functions + +Usage + +```bash + +$ pulsar-admin functions subcommand + +``` + +Subcommands +* `localrun` +* `create` +* `delete` +* `update` +* `get` +* `restart` +* `stop` +* `start` +* `status` +* `stats` +* `list` +* `querystate` +* `putstate` +* `trigger` + + +### `localrun` +Run the Pulsar Function locally (rather than deploying it to the Pulsar cluster) + + +Usage + +```bash + +$ pulsar-admin functions localrun options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar 
source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--broker-service-url `|The URL of the Pulsar broker|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--client-auth-params`|Client authentication param|| +|`--client-auth-plugin`|Client authentication plugin using which function-process can connect to broker|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--hostname-verification-enabled`|Enable hostname verification|false| +|`--instance-id-offset`|Start the instanceIds from this offset|0| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python). 
It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--go`|Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--state-storage-service-url`|The URL for the state storage service. By default, it is set to the service URL of the Apache BookKeeper. This service URL must be added manually when the Pulsar Function runs locally. || +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. 
Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--tls-allow-insecure`|Allow insecure tls connection|false| +|`--tls-trust-cert-path`|The tls trust cert file path|| +|`--use-tls`|Use tls connection|false| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `create` +Create a Pulsar Function in cluster mode (i.e. 
deploy it on a Pulsar cluster) + +Usage + +``` + +$ pulsar-admin functions create options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--go`|Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. 
Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `delete` +Delete a Pulsar Function that's running on a Pulsar cluster + +Usage + +```bash + +$ pulsar-admin functions delete options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `update` +Update a Pulsar Function that's been deployed to a Pulsar cluster + +Usage + +```bash + +$ pulsar-admin functions update options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The 
function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--go`|Path to the main Go executable binary for the function (if the function is written in Go). 
It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `get` +Fetch information about a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions get options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `restart` +Restart function instance + +Usage + +```bash + +$ pulsar-admin functions restart 
options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (restart all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stop` +Stops function instance + +Usage + +```bash + +$ pulsar-admin functions stop options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (stop all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `start` +Starts a stopped function instance + +Usage + +```bash + +$ pulsar-admin functions start options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (start all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `status` +Check the current status of a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions status options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (Get-status of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stats` +Get the current stats of a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions stats options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| 
+|`--instance-id`|The function instanceId (Get-stats of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + +### `list` +List all of the Pulsar Functions running under a specific tenant and namespace + +Usage + +```bash + +$ pulsar-admin functions list options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `querystate` +Fetch the current state associated with a Pulsar Function running in cluster mode + +Usage + +```bash + +$ pulsar-admin functions querystate options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`-k`, `--key`|The key for the state you want to fetch|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`-w`, `--watch`|Watch for changes in the value associated with a key for a Pulsar Function|false| + +### `putstate` +Put a key/value pair to the state associated with a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions putstate options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the Pulsar Function|| +|`--name`|The name of a Pulsar Function|| +|`--namespace`|The namespace of a Pulsar Function|| +|`--tenant`|The tenant of a Pulsar Function|| +|`-s`, `--state`|The FunctionState that needs to be put|| + +### `trigger` +Triggers the specified Pulsar Function with a supplied value + +Usage + +```bash + +$ pulsar-admin functions trigger options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`--topic`|The 
specific topic name that the function consumes from that you want to inject the data to|| +|`--trigger-file`|The path to the file that contains the data with which you'd like to trigger the function|| +|`--trigger-value`|The value with which you want to trigger the function|| + + +## `functions-worker` +Operations to collect function-worker statistics + +```bash + +$ pulsar-admin functions-worker subcommand + +``` + +Subcommands + +* `function-stats` +* `get-cluster` +* `get-cluster-leader` +* `get-function-assignments` +* `monitoring-metrics` + +### `function-stats` + +Dump all functions stats running on this broker + +Usage + +```bash + +$ pulsar-admin functions-worker function-stats + +``` + +### `get-cluster` + +Get all workers belonging to this cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-cluster + +``` + +### `get-cluster-leader` + +Get the leader of the worker cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-cluster-leader + +``` + +### `get-function-assignments` + +Get the assignments of the functions across the worker cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-function-assignments + +``` + +### `monitoring-metrics` + +Dump metrics for Monitoring + +Usage + +```bash + +$ pulsar-admin functions-worker monitoring-metrics + +``` + +## `namespaces` + +Operations for managing namespaces + +```bash + +$ pulsar-admin namespaces subcommand + +``` + +Subcommands +* `list` +* `topics` +* `policies` +* `create` +* `delete` +* `set-deduplication` +* `set-auto-topic-creation` +* `remove-auto-topic-creation` +* `set-auto-subscription-creation` +* `remove-auto-subscription-creation` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `grant-subscription-permission` +* `revoke-subscription-permission` +* `set-clusters` +* `get-clusters` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` +* `set-persistence` +* `get-message-ttl` +* `set-message-ttl` +* 
`remove-message-ttl` +* `get-anti-affinity-group` +* `set-anti-affinity-group` +* `get-anti-affinity-namespaces` +* `delete-anti-affinity-group` +* `get-retention` +* `set-retention` +* `unload` +* `split-bundle` +* `set-dispatch-rate` +* `get-dispatch-rate` +* `set-replicator-dispatch-rate` +* `get-replicator-dispatch-rate` +* `set-subscribe-rate` +* `get-subscribe-rate` +* `set-subscription-dispatch-rate` +* `get-subscription-dispatch-rate` +* `clear-backlog` +* `unsubscribe` +* `set-encryption-required` +* `set-delayed-delivery` +* `get-delayed-delivery` +* `set-subscription-auth-mode` +* `get-max-producers-per-topic` +* `set-max-producers-per-topic` +* `get-max-consumers-per-topic` +* `set-max-consumers-per-topic` +* `get-max-consumers-per-subscription` +* `set-max-consumers-per-subscription` +* `get-max-unacked-messages-per-subscription` +* `set-max-unacked-messages-per-subscription` +* `get-max-unacked-messages-per-consumer` +* `set-max-unacked-messages-per-consumer` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `get-offload-threshold` +* `set-offload-threshold` +* `get-offload-deletion-lag` +* `set-offload-deletion-lag` +* `clear-offload-deletion-lag` +* `get-schema-autoupdate-strategy` +* `set-schema-autoupdate-strategy` +* `set-offload-policies` +* `get-offload-policies` +* `set-max-subscriptions-per-topic` +* `get-max-subscriptions-per-topic` +* `remove-max-subscriptions-per-topic` + + +### `list` +Get the namespaces for a tenant + +Usage + +```bash + +$ pulsar-admin namespaces list tenant-name + +``` + +### `topics` +Get the list of topics for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces topics tenant/namespace + +``` + +### `policies` +Get the configuration policies of a namespace + +Usage + +```bash + +$ pulsar-admin namespaces policies tenant/namespace + +``` + +### `create` +Create a new namespace + +Usage + +```bash + +$ pulsar-admin namespaces create tenant/namespace options + +``` + +Options + 
+|Flag|Description|Default| +|---|---|---| +|`-b`, `--bundles`|The number of bundles to activate|0| +|`-c`, `--clusters`|List of clusters this namespace will be assigned|| + + +### `delete` +Deletes a namespace. The namespace needs to be empty + +Usage + +```bash + +$ pulsar-admin namespaces delete tenant/namespace + +``` + +### `set-deduplication` +Enable or disable message deduplication on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-deduplication tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified namespace|false| +|`--disable`, `-d`|Disable message deduplication on the specified namespace|false| + +### `set-auto-topic-creation` +Enable or disable autoTopicCreation for a namespace, overriding broker settings + +Usage + +```bash + +$ pulsar-admin namespaces set-auto-topic-creation tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable allowAutoTopicCreation on namespace|false| +|`--disable`, `-d`|Disable allowAutoTopicCreation on namespace|false| +|`--type`, `-t`|Type of topic to be auto-created. 
Possible values: (partitioned, non-partitioned)|non-partitioned| +|`--num-partitions`, `-n`|Default number of partitions of topic to be auto-created, applicable to partitioned topics only|| + +### `remove-auto-topic-creation` +Remove override of autoTopicCreation for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces remove-auto-topic-creation tenant/namespace + +``` + +### `set-auto-subscription-creation` +Enable autoSubscriptionCreation for a namespace, overriding broker settings + +Usage + +```bash + +$ pulsar-admin namespaces set-auto-subscription-creation tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable allowAutoSubscriptionCreation on namespace|false| + +### `remove-auto-subscription-creation` +Remove override of autoSubscriptionCreation for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces remove-auto-subscription-creation tenant/namespace + +``` + +### `permissions` +Get the permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces permissions tenant/namespace + +``` + +### `grant-permission` +Grant permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces grant-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces revoke-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| + +### `grant-subscription-permission` +Grant permissions to access subscription admin-api + +Usage + +```bash + +$ pulsar-admin namespaces grant-subscription-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--roles`|The client 
roles to which to grant the permissions (comma separated roles)|| +|`--subscription`|The subscription name for which permission will be granted to roles|| + +### `revoke-subscription-permission` +Revoke permissions to access subscription admin-api + +Usage + +```bash + +$ pulsar-admin namespaces revoke-subscription-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| +|`--subscription`|The subscription name for which permission will be revoked to roles|| + +### `set-clusters` +Set replication clusters for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-clusters tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--clusters`|Replication clusters ID list (comma-separated values)|| + + +### `get-clusters` +Get replication clusters for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-clusters tenant/namespace + +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-backlog-quotas tenant/namespace + +``` + +### `set-backlog-quota` +Set a backlog quota policy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-backlog-quota tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)|| +|`-lt`, `--limitTime`|Time limit in seconds, non-positive number for disabling time limit. (for example 3600 for 1 hour)|| +|`-p`, `--policy`|The retention policy to enforce when the limit is reached. The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`|| +|`-t`, `--type`|Backlog quota type to set. 
The valid options are: `destination_storage`, `message_age` |destination_storage| + +Example + +```bash + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ +--limit 2G \ +--policy producer_request_hold + +``` + +```bash + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ +--limitTime 3600 \ +--policy producer_request_hold \ +--type message_age + +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a namespace + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--type`|Backlog quota type to remove. The valid options are: `destination_storage`, `message_age` |destination_storage| + +Usage + +```bash + +$ pulsar-admin namespaces remove-backlog-quota tenant/namespace + +``` + +### `get-persistence` +Get the persistence policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-persistence tenant/namespace + +``` + +### `set-persistence` +Set the persistence policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-persistence tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-a`, `--bookkeeper-ack-quorum`|The number of acks (guaranteed copies) to wait for each entry|0| +|`-e`, `--bookkeeper-ensemble`|The number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + + +### `get-message-ttl` +Get the message TTL for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-message-ttl tenant/namespace + +``` + +### `set-message-ttl` +Set the message TTL for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-message-ttl tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL in seconds. When the value is set to `0`, TTL is disabled. TTL is disabled by default. 
|0| + +### `remove-message-ttl` +Remove the message TTL for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces remove-message-ttl tenant/namespace + +``` + +### `get-anti-affinity-group` +Get Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-anti-affinity-group tenant/namespace + +``` + +### `set-anti-affinity-group` +Set Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-anti-affinity-group tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-g`, `--group`|Anti-affinity group name|| + +### `get-anti-affinity-namespaces` +Get Anti-affinity namespaces grouped with the given anti-affinity group name + +Usage + +```bash + +$ pulsar-admin namespaces get-anti-affinity-namespaces options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--cluster`|Cluster name|| +|`-g`, `--group`|Anti-affinity group name|| +|`-p`, `--tenant`|Tenant is only used for authorization. Client has to be admin of any of the tenant to access this api|| + +### `delete-anti-affinity-group` +Remove Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces delete-anti-affinity-group tenant/namespace + +``` + +### `get-retention` +Get the retention policy that is applied to each topic within the specified namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-retention tenant/namespace + +``` + +### `set-retention` +Set the retention policy for each topic within the specified namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-retention tenant/namespace + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|The retention size limits (for example 10M, 16G or 3T) for each topic in the namespace. 0 means no retention and -1 means infinite size retention|| +|`-t`, `--time`|The retention time in minutes, hours, days, or weeks. Examples: 100m, 13h, 2d, 5w. 
0 means no retention and -1 means infinite time retention|| + + +### `unload` +Unload a namespace or namespace bundle from the current serving broker. + +Usage + +```bash + +$ pulsar-admin namespaces unload tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| + +### `split-bundle` +Split a namespace-bundle from the current serving broker + +Usage + +```bash + +$ pulsar-admin namespaces split-bundle tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-u`, `--unload`|Unload newly split bundles after splitting old bundle|false| + +### `set-dispatch-rate` +Set message-dispatch-rate for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-dispatch-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in second type (default 1 second will be overwrite if not passed)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-dispatch-rate` +Get configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-dispatch-rate tenant/namespace + +``` + +### `set-replicator-dispatch-rate` +Set replicator message-dispatch-rate for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-replicator-dispatch-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate 
period in second type (default 1 second will be overwrite if not passed)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-replicator-dispatch-rate` +Get replicator configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-replicator-dispatch-rate tenant/namespace + +``` + +### `set-subscribe-rate` +Set subscribe-rate per consumer for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-subscribe-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-sr`, `--subscribe-rate`|The subscribe rate (default -1 will be overwrite if not passed)|-1| +|`-st`, `--subscribe-rate-period`|The subscribe rate period in second type (default 30 second will be overwrite if not passed)|30| + +### `get-subscribe-rate` +Get configured subscribe-rate per consumer for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-subscribe-rate tenant/namespace + +``` + +### `set-subscription-dispatch-rate` +Set subscription message-dispatch-rate for all subscription of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-subscription-dispatch-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in second type (default 1 second will be overwrite if not passed)|1| +|`-md`, `--sub-msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-subscription-dispatch-rate` +Get subscription configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-subscription-dispatch-rate tenant/namespace + +``` + +### 
`clear-backlog` +Clear the backlog for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces clear-backlog tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-force`, `--force`|Whether to force a clear backlog without prompt|false| +|`-s`, `--sub`|The subscription name|| + + +### `unsubscribe` +Unsubscribe the given subscription on all destinations on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces unsubscribe tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-s`, `--sub`|The subscription name|| + +### `set-encryption-required` +Enable or disable message encryption required for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-encryption-required tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable message encryption required|false| +|`-e`, `--enable`|Enable message encryption required|false| + +### `set-delayed-delivery` +Set the delayed delivery policy on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-delayed-delivery tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable delayed delivery messages|false| +|`-e`, `--enable`|Enable delayed delivery messages|false| +|`-t`, `--time`|The tick time for when retrying on delayed delivery messages|1s| + + +### `get-delayed-delivery` +Get the delayed delivery policy on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-delayed-delivery tenant/namespace + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-t`, `--time`|The tick time for when retrying on delayed delivery messages|1s| + + +### `set-subscription-auth-mode` +Set subscription auth mode on a namespace + +Usage + 
+```bash + +$ pulsar-admin namespaces set-subscription-auth-mode tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-m`, `--subscription-auth-mode`|Subscription authorization mode for Pulsar policies. Valid options are: [None, Prefix]|| + +### `get-max-producers-per-topic` +Get maxProducersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-producers-per-topic tenant/namespace + +``` + +### `set-max-producers-per-topic` +Set maxProducersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-producers-per-topic tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-p`, `--max-producers-per-topic`|maxProducersPerTopic for a namespace|0| + +### `get-max-consumers-per-topic` +Get maxConsumersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-consumers-per-topic tenant/namespace + +``` + +### `set-max-consumers-per-topic` +Set maxConsumersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-consumers-per-topic tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-topic`|maxConsumersPerTopic for a namespace|0| + +### `get-max-consumers-per-subscription` +Get maxConsumersPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-consumers-per-subscription tenant/namespace + +``` + +### `set-max-consumers-per-subscription` +Set maxConsumersPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-consumers-per-subscription tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-subscription`|maxConsumersPerSubscription for a namespace|0| + +### `get-max-unacked-messages-per-subscription` +Get maxUnackedMessagesPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces 
get-max-unacked-messages-per-subscription tenant/namespace + +``` + +### `set-max-unacked-messages-per-subscription` +Set maxUnackedMessagesPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-unacked-messages-per-subscription tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-unacked-messages-per-subscription`|maxUnackedMessagesPerSubscription for a namespace|-1| + +### `get-max-unacked-messages-per-consumer` +Get maxUnackedMessagesPerConsumer for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-unacked-messages-per-consumer tenant/namespace + +``` + +### `set-max-unacked-messages-per-consumer` +Set maxUnackedMessagesPerConsumer for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-unacked-messages-per-consumer tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-unacked-messages-per-consumer`|maxUnackedMessagesPerConsumer for a namespace|-1| + + +### `get-compaction-threshold` +Get compactionThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-compaction-threshold tenant/namespace + +``` + +### `set-compaction-threshold` +Set compactionThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-compaction-threshold tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-t`, `--threshold`|Maximum number of bytes in a topic backlog before compaction is triggered (eg: 10M, 16G, 3T). 
0 disables automatic compaction|0| + + +### `get-offload-threshold` +Get offloadThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-threshold tenant/namespace + +``` + +### `set-offload-threshold` +Set offloadThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-offload-threshold tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|Maximum number of bytes stored in the pulsar cluster for a topic before data will start being automatically offloaded to longterm storage (eg: 10M, 16G, 3T, 100). Negative values disable automatic offload. 0 triggers offloading as soon as possible.|-1| + +### `get-offload-deletion-lag` +Get offloadDeletionLag, in minutes, for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-deletion-lag tenant/namespace + +``` + +### `set-offload-deletion-lag` +Set offloadDeletionLag for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-offload-deletion-lag tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-l`, `--lag`|Duration to wait after offloading a ledger segment, before deleting the copy of that segment from cluster local storage. 
(eg: 10m, 5h, 3d, 2w).|-1| + +### `clear-offload-deletion-lag` +Clear offloadDeletionLag for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces clear-offload-deletion-lag tenant/namespace + +``` + +### `get-schema-autoupdate-strategy` +Get the schema auto-update strategy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-schema-autoupdate-strategy tenant/namespace + +``` + +### `set-schema-autoupdate-strategy` +Set the schema auto-update strategy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-schema-autoupdate-strategy tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--compatibility`|Compatibility level required for new schemas created via a Producer. Possible values (Full, Backward, Forward, None).|Full| +|`-d`, `--disabled`|Disable automatic schema updates.|false| + +### `get-publish-rate` +Get the message publish rate for each topic in a namespace, in bytes as well as messages per second + +Usage + +```bash + +$ pulsar-admin namespaces get-publish-rate tenant/namespace + +``` + +### `set-publish-rate` +Set the message publish rate for each topic in a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-publish-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-m`, `--msg-publish-rate`|Threshold for number of messages per second per topic in the namespace (-1 implies not set, 0 for no limit).|-1| +|`-b`, `--byte-publish-rate`|Threshold for number of bytes per second per topic in the namespace (-1 implies not set, 0 for no limit).|-1| + +### `set-offload-policies` +Set the offload policy for a namespace. 
+ +Usage + +```bash + +$ pulsar-admin namespaces set-offload-policies tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-d`, `--driver`|Driver to use to offload old data to long term storage,(Possible values: S3, aws-s3, google-cloud-storage)|| +|`-r`, `--region`|The long term storage region|| +|`-b`, `--bucket`|Bucket to place offloaded ledger into|| +|`-e`, `--endpoint`|Alternative endpoint to connect to|| +|`-i`, `--aws-id`|AWS Credential Id to use when using driver S3 or aws-s3|| +|`-s`, `--aws-secret`|AWS Credential Secret to use when using driver S3 or aws-s3|| +|`-ro`, `--s3-role`|S3 Role used for STSAssumeRoleSessionCredentialsProvider using driver S3 or aws-s3|| +|`-rsn`, `--s3-role-session-name`|S3 role session name used for STSAssumeRoleSessionCredentialsProvider using driver S3 or aws-s3|| +|`-mbs`, `--maxBlockSize`|Max block size|64MB| +|`-rbs`, `--readBufferSize`|Read buffer size|1MB| +|`-oat`, `--offloadAfterThreshold`|Offload after threshold size (eg: 1M, 5M)|| +|`-oae`, `--offloadAfterElapsed`|Offload after elapsed in millis (or minutes, hours,days,weeks eg: 100m, 3h, 2d, 5w).|| + +### `get-offload-policies` +Get the offload policy for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-policies tenant/namespace + +``` + +### `set-max-subscriptions-per-topic` +Set the maximum subscription per topic for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces set-max-subscriptions-per-topic tenant/namespace + +``` + +### `get-max-subscriptions-per-topic` +Get the maximum subscription per topic for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces get-max-subscriptions-per-topic tenant/namespace + +``` + +### `remove-max-subscriptions-per-topic` +Remove the maximum subscription per topic for a namespace. 
+ +Usage + +```bash + +$ pulsar-admin namespaces remove-max-subscriptions-per-topic tenant/namespace + +``` + +## `ns-isolation-policy` +Operations for managing namespace isolation policies. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy subcommand + +``` + +Subcommands +* `set` +* `get` +* `list` +* `delete` +* `brokers` +* `broker` + +### `set` +Create/update a namespace isolation policy for a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy set cluster-name policy-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--auto-failover-policy-params`|Comma-separated name=value auto failover policy parameters|[]| +|`--auto-failover-policy-type`|Auto failover policy type name. Currently available options: min_available.|[]| +|`--namespaces`|Comma-separated namespaces regex list|[]| +|`--primary`|Comma-separated primary broker regex list|[]| +|`--secondary`|Comma-separated secondary broker regex list|[]| + + +### `get` +Get the namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy get cluster-name policy-name + +``` + +### `list` +List all namespace isolation policies of a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy list cluster-name + +``` + +### `delete` +Delete namespace isolation policy of a cluster. This operation requires superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy delete + +``` + +### `brokers` +List all brokers with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy brokers cluster-name + +``` + +### `broker` +Get broker with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges. 
+ +Usage + +```bash + +$ pulsar-admin ns-isolation-policy broker cluster-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--broker`|Broker name to get namespace-isolation policies attached to it|| + +## `topics` +Operations for managing Pulsar topics (both persistent and non-persistent). + +Usage + +```bash + +$ pulsar-admin topics subcommand + +``` + +From Pulsar 2.7.0, some namespace-level policies are available on topic level. To enable topic-level policy in Pulsar, you need to configure the following parameters in the `broker.conf` file. + +```shell + +systemTopicEnabled=true +topicLevelPoliciesEnabled=true + +``` + +Subcommands +* `compact` +* `compaction-status` +* `offload` +* `offload-status` +* `create-partitioned-topic` +* `create-missed-partitions` +* `delete-partitioned-topic` +* `create` +* `get-partitioned-topic-metadata` +* `update-partitioned-topic` +* `list-partitioned-topics` +* `list` +* `terminate` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `lookup` +* `bundle-range` +* `delete` +* `unload` +* `create-subscription` +* `subscriptions` +* `unsubscribe` +* `stats` +* `stats-internal` +* `info-internal` +* `partitioned-stats` +* `partitioned-stats-internal` +* `skip` +* `clear-backlog` +* `expire-messages` +* `expire-messages-all-subscriptions` +* `peek-messages` +* `reset-cursor` +* `get-message-by-id` +* `last-message-id` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` +* `set-persistence` +* `remove-persistence` +* `get-message-ttl` +* `set-message-ttl` +* `remove-message-ttl` +* `get-deduplication` +* `set-deduplication` +* `remove-deduplication` +* `get-retention` +* `set-retention` +* `remove-retention` +* `get-dispatch-rate` +* `set-dispatch-rate` +* `remove-dispatch-rate` +* `get-max-unacked-messages-per-subscription` +* `set-max-unacked-messages-per-subscription` +* `remove-max-unacked-messages-per-subscription` +* 
`get-max-unacked-messages-per-consumer` +* `set-max-unacked-messages-per-consumer` +* `remove-max-unacked-messages-per-consumer` +* `get-delayed-delivery` +* `set-delayed-delivery` +* `remove-delayed-delivery` +* `get-max-producers` +* `set-max-producers` +* `remove-max-producers` +* `get-max-consumers` +* `set-max-consumers` +* `remove-max-consumers` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `remove-compaction-threshold` +* `get-offload-policies` +* `set-offload-policies` +* `remove-offload-policies` +* `get-inactive-topic-policies` +* `set-inactive-topic-policies` +* `remove-inactive-topic-policies` +* `set-max-subscriptions` +* `get-max-subscriptions` +* `remove-max-subscriptions` + +### `compact` +Run compaction on the specified topic (persistent topics only) + +Usage + +```bash + +$ pulsar-admin topics compact persistent://tenant/namespace/topic + +``` + +### `compaction-status` +Check the status of a topic compaction (persistent topics only) + +Usage + +```bash + +$ pulsar-admin topics compaction-status persistent://tenant/namespace/topic + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-w`, `--wait-complete`|Wait for compaction to complete|false| + + +### `offload` +Trigger offload of data from a topic to long-term storage (e.g. Amazon S3) + +Usage + +```bash + +$ pulsar-admin topics offload persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--size-threshold`|The maximum amount of data to keep in BookKeeper for the specific topic|| + + +### `offload-status` +Check the status of data offloading from a topic to long-term storage + +Usage + +```bash + +$ pulsar-admin topics offload-status persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-w`, `--wait-complete`|Wait for offloading to complete|false| + + +### `create-partitioned-topic` +Create a partitioned topic. 
A partitioned topic must be created before producers can publish to it. + +:::note + +By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to avoid accumulating trash data. +To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +For more information about these two parameters, see [here](reference-configuration.md#broker). + +::: + +Usage + +```bash + +$ pulsar-admin topics create-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `create-missed-partitions` +Try to create the missing partitions of a partitioned topic. The partitions of a partitioned topic have to be created; +this command can be used to repair partitions when topic auto creation is disabled + +Usage + +```bash + +$ pulsar-admin topics create-missed-partitions persistent://tenant/namespace/topic + +``` + +### `delete-partitioned-topic` +Delete a partitioned topic. This will also delete all the partitions of the topic if they exist. + +Usage + +```bash + +$ pulsar-admin topics delete-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `create` +Creates a non-partitioned topic. A non-partitioned topic must explicitly be created by the user if allowAutoTopicCreation or createIfMissing is disabled. + +:::note + +By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to avoid accumulating trash data. +To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +For more information about these two parameters, see [here](reference-configuration.md#broker). 
+ +::: + +Usage + +```bash + +$ pulsar-admin topics create {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `get-partitioned-topic-metadata` +Get the partitioned topic metadata. If the topic is not created or is a non-partitioned topic, this will return an empty topic with zero partitions. + +Usage + +```bash + +$ pulsar-admin topics get-partitioned-topic-metadata {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `update-partitioned-topic` +Update existing non-global partitioned topic. New updating number of partitions must be greater than existing number of partitions. + +Usage + +```bash + +$ pulsar-admin topics update-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `list-partitioned-topics` +Get the list of partitioned topics under a namespace. + +Usage + +```bash + +$ pulsar-admin topics list-partitioned-topics tenant/namespace + +``` + +### `list` +Get the list of topics under a namespace + +Usage + +``` + +$ pulsar-admin topics list tenant/cluster/namespace + +``` + +### `terminate` +Terminate a persistent topic (disallow further messages from being published on the topic) + +Usage + +```bash + +$ pulsar-admin topics terminate persistent://tenant/namespace/topic + +``` + +### `partitioned-terminate` +Terminate a persistent topic (disallow further messages from being published on the topic) + +Usage + +```bash + +$ pulsar-admin topics partitioned-terminate persistent://tenant/namespace/topic + +``` + +### `permissions` +Get the permissions on a topic. Retrieve the effective permissions for a destination. These permissions are defined by the permissions set at the namespace level combined (union) with any eventual specific permissions set on the topic. 
+ +Usage + +```bash + +$ pulsar-admin topics permissions topic + +``` + +### `grant-permission` +Grant a new permission to a client role on a single topic + +Usage + +```bash + +$ pulsar-admin topics grant-permission {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions to a client role on a single topic. If the permission was not set at the topic level, but rather at the namespace level, this operation will return an error (HTTP status code 412). + +Usage + +```bash + +$ pulsar-admin topics revoke-permission topic + +``` + +### `lookup` +Look up a topic from the current serving broker + +Usage + +```bash + +$ pulsar-admin topics lookup topic + +``` + +### `bundle-range` +Get the namespace bundle which contains the given topic + +Usage + +```bash + +$ pulsar-admin topics bundle-range topic + +``` + +### `delete` +Delete a topic. The topic cannot be deleted if there are any active subscriptions or producers connected to the topic. + +Usage + +```bash + +$ pulsar-admin topics delete topic + +``` + +### `unload` +Unload a topic + +Usage + +```bash + +$ pulsar-admin topics unload topic + +``` + +### `create-subscription` +Create a new subscription on a topic. + +Usage + +```bash + +$ pulsar-admin topics create-subscription [options] persistent://tenant/namespace/topic + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-m`, `--messageId`|messageId where to create the subscription. 
It can be either 'latest', 'earliest' or (ledgerId:entryId)|latest| +|`-s`, `--subscription`|Subscription to reset position on|| + +### `subscriptions` +Get the list of subscriptions on the topic + +Usage + +```bash + +$ pulsar-admin topics subscriptions topic + +``` + +### `unsubscribe` +Delete a durable subscriber from a topic + +Usage + +```bash + +$ pulsar-admin topics unsubscribe topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to delete|| +|`-f`, `--force`|Disconnect and close all consumers and delete subscription forcefully|false| + + +### `stats` +Get the stats for the topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage + +```bash + +$ pulsar-admin topics stats topic + +``` + +:::note + +The unit of `storageSize` and `averageMsgSize` is Byte. + +::: + +### `stats-internal` +Get the internal stats for the topic + +Usage + +```bash + +$ pulsar-admin topics stats-internal topic + +``` + +### `info-internal` +Get the internal metadata info for the topic + +Usage + +```bash + +$ pulsar-admin topics info-internal topic + +``` + +### `partitioned-stats` +Get the stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage + +```bash + +$ pulsar-admin topics partitioned-stats topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--per-partition`|Get per-partition stats|false| + +### `partitioned-stats-internal` +Get the internal stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. 
+ +Usage + +```bash + +$ pulsar-admin topics partitioned-stats-internal topic + +``` + +### `skip` +Skip some messages for the subscription + +Usage + +```bash + +$ pulsar-admin topics skip topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages to skip|0| +|`-s`, `--subscription`|The subscription on which to skip messages|| + + +### `clear-backlog` +Clear backlog (skip all the messages) for the subscription + +Usage + +```bash + +$ pulsar-admin topics clear-backlog topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to clear|| + + +### `expire-messages` +Expire messages that are older than the given expiry time (in seconds) for the subscription. + +Usage + +```bash + +$ pulsar-admin topics expire-messages topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| +|`-s`, `--subscription`|The subscription to skip messages on|| + + +### `expire-messages-all-subscriptions` +Expire messages older than the given expiry time (in seconds) for all subscriptions + +Usage + +```bash + +$ pulsar-admin topics expire-messages-all-subscriptions topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| + + +### `peek-messages` +Peek some messages for the subscription. + +Usage + +```bash + +$ pulsar-admin topics peek-messages topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages|0| +|`-s`, `--subscription`|Subscription to get messages from|| + + +### `reset-cursor` +Reset position for subscription to a position that is closest to timestamp or messageId. 
+ +Usage + +```bash + +$ pulsar-admin topics reset-cursor topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|Subscription to reset position on|| +|`-t`, `--time`|The time in minutes to reset back to (or minutes, hours, days, weeks, etc.). Examples: `100m`, `3h`, `2d`, `5w`.|| +|`-m`, `--messageId`| The message ID to reset back to (`ledgerId:entryId` or earliest or latest). || + +### `get-message-by-id` +Get message by ledger id and entry id + +Usage + +```bash + +$ pulsar-admin topics get-message-by-id topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-l`, `--ledgerId`|The ledger id |0| +|`-e`, `--entryId`|The entry id |0| + +### `last-message-id` +Get the last commit message ID of the topic. + +Usage + +```bash + +$ pulsar-admin topics last-message-id persistent://tenant/namespace/topic + +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-backlog-quotas tenant/namespace/topic + +``` + +### `set-backlog-quota` +Set a backlog quota policy for a topic. + +|Flag|Description|Default| +|----|---|---| +|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)|| +|`-lt`, `--limitTime`|Time limit in second, non-positive number for disabling time limit. (for example 3600 for 1 hour)|| +|`-p`, `--policy`|The retention policy to enforce when the limit is reached. The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`| +|`-t`, `--type`|Backlog quota type to set. 
The valid options are: `destination_storage`, `message_age` |destination_storage| + +Usage + +```bash + +$ pulsar-admin topics set-backlog-quota tenant/namespace/topic options + +``` + +Example + +```bash + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns/my-topic \ +--limit 2G \ +--policy producer_request_hold + +``` + +```bash + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns/my-topic \ +--limitTime 3600 \ +--policy producer_request_hold \ +--type message_age + +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a topic. + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--type`|Backlog quota type to remove. The valid options are: `destination_storage`, `message_age` |destination_storage| + +Usage + +```bash + +$ pulsar-admin topics remove-backlog-quota tenant/namespace/topic + +``` + +### `get-persistence` +Get the persistence policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-persistence tenant/namespace/topic + +``` + +### `set-persistence` +Set the persistence policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics set-persistence tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-e`, `--bookkeeper-ensemble`|Number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-a`, `--bookkeeper-ack-quorum`|Number of acks (guaranteed copies) to wait for each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + +### `remove-persistence` +Remove the persistence policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics remove-persistence tenant/namespace/topic + +``` + +### `get-message-ttl` +Get the message TTL for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-message-ttl tenant/namespace/topic + +``` + +### `set-message-ttl` +Set the message TTL for a topic. 
+ +Usage + +```bash + +$ pulsar-admin topics set-message-ttl tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL for a topic in second, allowed range from 1 to `Integer.MAX_VALUE` |0| + +### `remove-message-ttl` +Remove the message TTL for a topic. + +Usage + +```bash + +$ pulsar-admin topics remove-message-ttl tenant/namespace/topic + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified topic.|false| +|`--disable`, `-d`|Disable message deduplication on the specified topic.|false| + +### `get-deduplication` +Get a deduplication policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-deduplication tenant/namespace/topic + +``` + +### `set-deduplication` +Set a deduplication policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics set-deduplication tenant/namespace/topic options + +``` + +### `remove-deduplication` +Remove a deduplication policy for a topic. 
+ +Usage + +```bash + +$ pulsar-admin topics remove-deduplication tenant/namespace/topic + +``` + +## `tenants` +Operations for managing tenants + +Usage + +```bash + +$ pulsar-admin tenants subcommand + +``` + +Subcommands +* `list` +* `get` +* `create` +* `update` +* `delete` + +### `list` +List the existing tenants + +Usage + +```bash + +$ pulsar-admin tenants list + +``` + +### `get` +Gets the configuration of a tenant + +Usage + +```bash + +$ pulsar-admin tenants get tenant-name + +``` + +### `create` +Creates a new tenant + +Usage + +```bash + +$ pulsar-admin tenants create tenant-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + +### `update` +Updates a tenant + +Usage + +```bash + +$ pulsar-admin tenants update tenant-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + + +### `delete` +Deletes an existing tenant + +Usage + +```bash + +$ pulsar-admin tenants delete tenant-name + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-f`, `--force`|Delete a tenant forcefully by deleting all namespaces under it.|false| + + +## `resource-quotas` +Operations for managing resource quotas + +Usage + +```bash + +$ pulsar-admin resource-quotas subcommand + +``` + +Subcommands +* `get` +* `set` +* `reset-namespace-bundle-quota` + + +### `get` +Get the resource quota for a specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage + +```bash + +$ pulsar-admin resource-quotas get options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. 
This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + +### `set` +Set the resource quota for the specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage + +```bash + +$ pulsar-admin resource-quotas set options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bi`, `--bandwidthIn`|The expected inbound bandwidth (in bytes/second)|0| +|`-bo`, `--bandwidthOut`|Expected outbound bandwidth (in bytes/second)|0| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-d`, `--dynamic`|Allow to be dynamically re-calculated (or not)|false| +|`-mem`, `--memory`|Expected memory usage (in megabytes)|0| +|`-mi`, `--msgRateIn`|Expected incoming messages per second|0| +|`-mo`, `--msgRateOut`|Expected outgoing messages per second|0| +|`-n`, `--namespace`|The namespace as tenant/namespace, for example my-tenant/my-ns. Must be specified together with -b/--bundle.|| + + +### `reset-namespace-bundle-quota` +Reset the specified namespace bundle's resource quota to a default value. + +Usage + +```bash + +$ pulsar-admin resource-quotas reset-namespace-bundle-quota options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + + +## `schemas` +Operations related to Schemas associated with Pulsar topics. + +Usage + +``` + +$ pulsar-admin schemas subcommand + +``` + +Subcommands +* `upload` +* `delete` +* `get` +* `extract` + + +### `upload` +Upload the schema definition for a topic + +Usage + +```bash + +$ pulsar-admin schemas upload persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--filename`|The path to the schema definition file. 
An example schema file is available under conf directory.|| + + +### `delete` +Delete the schema definition associated with a topic + +Usage + +```bash + +$ pulsar-admin schemas delete persistent://tenant/namespace/topic + +``` + +### `get` +Retrieve the schema definition associated with a topic (at a given version if version is supplied). + +Usage + +```bash + +$ pulsar-admin schemas get persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--version`|The version of the schema definition to retrieve for a topic.|| + +### `extract` +Provide the schema definition for a topic via Java class name contained in a JAR file + +Usage + +```bash + +$ pulsar-admin schemas extract persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--classname`|The Java class name|| +|`-j`, `--jar`|A path to the JAR file which contains the above Java class|| +|`-t`, `--type`|The type of the schema (avro or json)|| diff --git a/site2/website/versioned_docs/version-2.10.x/reference-rest-api-overview.md b/site2/website/versioned_docs/version-2.10.x/reference-rest-api-overview.md new file mode 100644 index 0000000000000..8e3d410112b87 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/reference-rest-api-overview.md @@ -0,0 +1,18 @@ +--- +id: reference-rest-api-overview +title: Pulsar REST APIs +sidebar_label: "Pulsar REST APIs" +--- + +A REST API (also known as RESTful API, REpresentational State Transfer Application Programming Interface) is a set of definitions and protocols for building and integrating application software, using HTTP requests to GET, PUT, POST, and DELETE data following the REST standards. In essence, REST API is a set of remote calls using standard methods to request and return data in a specific format between two systems. 
+ +Pulsar provides a variety of REST APIs that enable you to interact with Pulsar to retrieve information or perform an action. + +| REST API category | Description | +| --- | --- | +| [Admin](/admin-rest-api/?version=master) | REST APIs for administrative operations.| +| [Functions](/functions-rest-api/?version=master) | REST APIs for function-specific operations.| +| [Sources](/source-rest-api/?version=master) | REST APIs for source-specific operations.| +| [Sinks](/sink-rest-api/?version=master) | REST APIs for sink-specific operations.| +| [Packages](/packages-rest-api/?version=master) | REST APIs for package-specific operations. A package can be a group of functions, sources, and sinks.| + diff --git a/site2/website/versioned_docs/version-2.10.x/reference-terminology.md b/site2/website/versioned_docs/version-2.10.x/reference-terminology.md new file mode 100644 index 0000000000000..e5099141c3231 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/reference-terminology.md @@ -0,0 +1,176 @@ +--- +id: reference-terminology +title: Pulsar Terminology +sidebar_label: "Terminology" +original_id: reference-terminology +--- + +Here is a glossary of terms related to Apache Pulsar: + +### Concepts + +#### Pulsar + +Pulsar is a distributed messaging system originally created by Yahoo but now under the stewardship of the Apache Software Foundation. + +#### Message + +Messages are the basic unit of Pulsar. They're what [producers](#producer) publish to [topics](#topic) +and what [consumers](#consumer) then consume from topics. + +#### Topic + +A named channel used to pass messages published by [producers](#producer) to [consumers](#consumer) who +process those [messages](#message). + +#### Partitioned Topic + +A topic that is served by multiple Pulsar [brokers](#broker), which enables higher throughput. + +#### Namespace + +A grouping mechanism for related [topics](#topic). 
+ +#### Namespace Bundle + +A virtual group of [topics](#topic) that belong to the same [namespace](#namespace). A namespace bundle +is defined as a range between two 32-bit hashes, such as 0x00000000 and 0xffffffff. + +#### Tenant + +An administrative unit for allocating capacity and enforcing an authentication/authorization scheme. + +#### Subscription + +A lease on a [topic](#topic) established by a group of [consumers](#consumer). Pulsar has four subscription +modes (exclusive, shared, failover and key_shared). + +#### Pub-Sub + +A messaging pattern in which [producer](#producer) processes publish messages on [topics](#topic) that +are then consumed (processed) by [consumer](#consumer) processes. + +#### Producer + +A process that publishes [messages](#message) to a Pulsar [topic](#topic). + +#### Consumer + +A process that establishes a subscription to a Pulsar [topic](#topic) and processes messages published +to that topic by [producers](#producer). + +#### Reader + +Pulsar readers are message processors much like Pulsar [consumers](#consumer) but with two crucial differences: + +- you can specify *where* on a topic readers begin processing messages (consumers always begin with the latest + available unacked message); +- readers don't retain data or acknowledge messages. + +#### Cursor + +The subscription position for a [consumer](#consumer). + +#### Acknowledgment (ack) + +A message sent to a Pulsar broker by a [consumer](#consumer) that a message has been successfully processed. +An acknowledgement (ack) is Pulsar's way of knowing that the message can be deleted from the system; +if no acknowledgement, then the message will be retained until it's processed. + +#### Negative Acknowledgment (nack) + +When an application fails to process a particular message, it can send a "negative ack" to Pulsar +to signal that the message should be replayed at a later time. (By default, failed messages are +replayed after a 1 minute delay). 
Be aware that negative acknowledgment on ordered subscription types, +such as Exclusive, Failover and Key_Shared, can cause failed messages to arrive at consumers out of the original order. + +#### Unacknowledged + +A message that has been delivered to a consumer for processing but not yet confirmed as processed by the consumer. + +#### Retention Policy + +Size and time limits that you can set on a [namespace](#namespace) to configure retention of [messages](#message) +that have already been [acknowledged](#acknowledgment-ack). + +#### Multi-Tenancy + +The ability to isolate [namespaces](#namespace), specify quotas, and configure authentication and authorization +on a per-[tenant](#tenant) basis. + +#### Failure Domain + +A logical domain under a Pulsar cluster. Each logical domain contains a pre-configured list of brokers. + +#### Anti-affinity Namespaces + +A group of namespaces that have anti-affinity to each other. + +### Architecture + +#### Standalone + +A lightweight Pulsar broker in which all components run in a single Java Virtual Machine (JVM) process. Standalone +clusters can be run on a single machine and are useful for development purposes. + +#### Cluster + +A set of Pulsar [brokers](#broker) and [BookKeeper](#bookkeeper) servers (aka [bookies](#bookie)). +Clusters can reside in different geographical regions and replicate messages to one another +in a process called [geo-replication](#geo-replication). + +#### Instance + +A group of Pulsar [clusters](#cluster) that act together as a single unit. + +#### Geo-Replication + +Replication of messages across Pulsar [clusters](#cluster), potentially in different datacenters +or geographical regions. + +#### Configuration Store + +Pulsar's configuration store (previously known as configuration store) is a ZooKeeper quorum that +is used for configuration-specific tasks. A multi-cluster Pulsar installation requires just one +configuration store across all [clusters](#cluster). 
+ +#### Topic Lookup + +A service provided by Pulsar [brokers](#broker) that enables connecting clients to automatically determine +which Pulsar [cluster](#cluster) is responsible for a [topic](#topic) (and thus where message traffic for +the topic needs to be routed). + +#### Service Discovery + +A mechanism provided by Pulsar that enables connecting clients to use just a single URL to interact +with all the [brokers](#broker) in a [cluster](#cluster). + +#### Broker + +A stateless component of Pulsar [clusters](#cluster) that runs two other components: an HTTP server +exposing a REST interface for administration and topic lookup and a [dispatcher](#dispatcher) that +handles all message transfers. Pulsar clusters typically consist of multiple brokers. + +#### Dispatcher + +An asynchronous TCP server used for all data transfers in-and-out a Pulsar [broker](#broker). The Pulsar +dispatcher uses a custom binary protocol for all communications. + +### Storage + +#### BookKeeper + +[Apache BookKeeper](http://bookkeeper.apache.org/) is a scalable, low-latency persistent log storage +service that Pulsar uses to store data. + +#### Bookie + +Bookie is the name of an individual BookKeeper server. It is effectively the storage server of Pulsar. + +#### Ledger + +An append-only data structure in [BookKeeper](#bookkeeper) that is used to persistently store messages in Pulsar [topics](#topic). + +### Functions + +Pulsar Functions are lightweight functions that can consume messages from Pulsar topics, apply custom processing logic, and, if desired, publish results to topics. 
diff --git a/site2/website/versioned_docs/version-2.10.x/schema-evolution-compatibility.md b/site2/website/versioned_docs/version-2.10.x/schema-evolution-compatibility.md new file mode 100644 index 0000000000000..04bd0129a74b2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/schema-evolution-compatibility.md @@ -0,0 +1,207 @@ +--- +id: schema-evolution-compatibility +title: Schema evolution and compatibility +sidebar_label: "Schema evolution and compatibility" +original_id: schema-evolution-compatibility +--- + +Normally, schemas do not stay the same over a long period of time. Instead, they undergo evolutions to satisfy new needs. + +This chapter examines how Pulsar schema evolves and what Pulsar schema compatibility check strategies are. + +## Schema evolution + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +Each `SchemaInfo` stored with a topic has a version. The version is used to manage the schema changes happening within a topic. + +The message produced with `SchemaInfo` is tagged with a schema version. When a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and use the correct schema information to deserialize data. + +### What is schema evolution? + +Schemas store the details of attributes and types. To satisfy new business requirements, you need to update schemas inevitably over time, which is called **schema evolution**. + +Any schema changes affect downstream consumers. Schema evolution ensures that the downstream consumers can seamlessly handle data encoded with both old schemas and new schemas. + +### How Pulsar schema should evolve? + +The answer is Pulsar schema compatibility check strategy. It determines how schema compares old schemas with new schemas in topics. + +For more information, see [Schema compatibility check strategy](#schema-compatibility-check-strategy). + +### How does Pulsar support schema evolution? + +1. 
When a producer/consumer/reader connects to a broker, the broker deploys the schema compatibility checker configured by `schemaRegistryCompatibilityCheckers` to enforce schema compatibility check. + + The schema compatibility checker is one instance per schema type. + + Currently, Avro and JSON have their own compatibility checkers, while all the other schema types share the default compatibility checker which disables schema evolution. + +2. The producer/consumer/reader sends its client `SchemaInfo` to the broker. + +3. The broker knows the schema type and locates the schema compatibility checker for that type. + +4. The broker uses the checker to check if the `SchemaInfo` is compatible with the latest schema of the topic by applying its compatibility check strategy. + + Currently, the compatibility check strategy is configured at the namespace level and applied to all the topics within that namespace. + +## Schema compatibility check strategy + +Pulsar has 8 schema compatibility check strategies, which are summarized in the following table. + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Changes allowed | Check against which schema | Upgrade first | +| --- | --- | --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Disable schema compatibility check. | All changes are allowed | All previous versions | Any order | +| `ALWAYS_INCOMPATIBLE` | Disable schema evolution. | All changes are disabled | None | None | +| `BACKWARD` | Consumers using the schema V3 can process data written by producers using the schema V3 or V2. |
  • Add optional fields
  • Delete fields
  • | Latest version | Consumers | +| `BACKWARD_TRANSITIVE` | Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. |
  • Add optional fields
  • Delete fields
  • | All previous versions | Consumers | +| `FORWARD` | Consumers using the schema V3 or V2 can process data written by producers using the schema V3. |
  • Add fields
  • Delete optional fields
  • | Latest version | Producers | +| `FORWARD_TRANSITIVE` | Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. |
  • Add fields
  • Delete optional fields
  • | All previous versions | Producers | +| `FULL` | Backward and forward compatible between the schema V3 and V2. |
  • Modify optional fields
  • | Latest version | Any order | +| `FULL_TRANSITIVE` | Backward and forward compatible among the schema V3, V2, and V1. |
  • Modify optional fields
  • | All previous versions | Any order | + +### ALWAYS_COMPATIBLE and ALWAYS_INCOMPATIBLE + +| Compatibility check strategy | Definition | Note | +| --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Disable schema compatibility check. | None | +| `ALWAYS_INCOMPATIBLE` | Disable schema evolution, that is, any schema change is rejected. |
  • For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`.
  • For Avro and JSON, the default schema compatibility check strategy is `FULL`.
  • | + +#### Example + +* Example 1 + + In some situations, an application needs to store events of several different types in the same Pulsar topic. + + In particular, when developing a data model in an `Event Sourcing` style, you might have several kinds of events that affect the state of an entity. + + For example, for a user entity, there are `userCreated`, `userAddressChanged` and `userEnquiryReceived` events. The application requires that those events are always read in the same order. + + Consequently, those events need to go in the same Pulsar partition to maintain order. This application can use `ALWAYS_COMPATIBLE` to allow different kinds of events co-exist in the same topic. + +* Example 2 + + Sometimes we also make incompatible changes. + + For example, you are modifying a field type from `string` to `int`. + + In this case, you need to: + + * Upgrade all producers and consumers to the new schema versions at the same time. + + * Optionally, create a new topic and start migrating applications to use the new topic and the new schema, avoiding the need to handle two incompatible versions in the same topic. + +### BACKWARD and BACKWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`BACKWARD` | Consumers using the new schema can process data written by producers using the **last schema**. | The consumers using the schema V3 can process data written by producers using the schema V3 or V2. | +`BACKWARD_TRANSITIVE` | Consumers using the new schema can process data written by producers using **all previous schemas**. | The consumers using the schema V3 can process data written by producers using the schema V3, V2, or V1. | + +#### Example + +* Example 1 + + Remove a field. 
+ + A consumer constructed to process events without one field can process events written with the old schema containing the field, and the consumer will ignore that field. + +* Example 2 + + You want to load all Pulsar data into a Hive data warehouse and run SQL queries against the data. + + The same SQL queries must continue to work even if the data is changed. To support it, you can evolve the schemas using the `BACKWARD` strategy. + +### FORWARD and FORWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`FORWARD` | Consumers using the **last schema** can process data written by producers using a new schema, even though they may not be able to use the full capabilities of the new schema. | The consumers using the schema V3 or V2 can process data written by producers using the schema V3. | +`FORWARD_TRANSITIVE` | Consumers using **all previous schemas** can process data written by producers using a new schema. | The consumers using the schema V3, V2, or V1 can process data written by producers using the schema V3. | + +#### Example + +* Example 1 + + Add a field. + + In most data formats, consumers written to process events without new fields can continue doing so even when they receive new events containing new fields. + +* Example 2 + + If a consumer has an application logic tied to a full version of a schema, the application logic may not be updated instantly when the schema evolves. + + In this case, you need to project data with a new schema onto an old schema that the application understands. + + Consequently, you can evolve the schemas using the `FORWARD` strategy to ensure that the old schema can process data encoded with the new schema. 
+ +### FULL and FULL_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | Note | +| --- | --- | --- | --- | +| `FULL` | Schemas are both backward and forward compatible, which means: Consumers using the last schema can process data written by producers using the new schema. AND Consumers using the new schema can process data written by producers using the last schema. | Consumers using the schema V3 can process data written by producers using the schema V3 or V2. AND Consumers using the schema V3 or V2 can process data written by producers using the schema V3. |
  • For Avro and JSON, the default schema compatibility check strategy is `FULL`.
  • For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`.
  • | +| `FULL_TRANSITIVE` | The new schema is backward and forward compatible with all previously registered schemas. | Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. AND Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. | None | + +#### Example + +In some data formats, for example, Avro, you can define fields with default values. Consequently, adding or removing a field with a default value is a fully compatible change. + +:::tip + +You can set the schema compatibility check strategy at the topic, namespace or broker level. For how to set the strategy, see [here](schema-manage.md/#set-schema-compatibility-check-strategy). + +::: + +## Schema verification + +When a producer or a consumer tries to connect to a topic, a broker performs some checks to verify a schema. + +### Producer + +When a producer tries to connect to a topic (assuming that schema auto-creation is ignored), a broker does the following checks: + +* Check if the schema carried by the producer exists in the schema registry or not. + + * If the schema is already registered, then the producer is connected to a broker and produces messages with that schema. + + * If the schema is not registered, then Pulsar verifies if the schema is allowed to be registered based on the configured compatibility check strategy. + +### Consumer +When a consumer tries to connect to a topic, a broker checks if a carried schema is compatible with a registered schema based on the configured schema compatibility check strategy. 
+ +| Compatibility check strategy | Check logic | +| --- | --- | +| `ALWAYS_COMPATIBLE` | All pass | +| `ALWAYS_INCOMPATIBLE` | No pass | +| `BACKWARD` | Can read the last schema | +| `BACKWARD_TRANSITIVE` | Can read all schemas | +| `FORWARD` | Can read the last schema | +| `FORWARD_TRANSITIVE` | Can read the last schema | +| `FULL` | Can read the last schema | +| `FULL_TRANSITIVE` | Can read all schemas | + +## Order of upgrading clients + +The order of upgrading client applications is determined by the compatibility check strategy. + +For example, the producers using schemas to write data to Pulsar and the consumers using schemas to read data from Pulsar. + +| Compatibility check strategy | Upgrade first | Description | +| --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Any order | The compatibility check is disabled. Consequently, you can upgrade the producers and consumers in **any order**. | +| `ALWAYS_INCOMPATIBLE` | None | The schema evolution is disabled. | +|
  • `BACKWARD`
  • `BACKWARD_TRANSITIVE`
  • | Consumers | There is no guarantee that consumers using the old schema can read data produced using the new schema. Consequently, **upgrade all consumers first**, and then start producing new data. | +|
  • `FORWARD`
  • `FORWARD_TRANSITIVE`
  • | Producers | There is no guarantee that consumers using the new schema can read data produced using the old schema. Consequently, **upgrade all producers first**
  • to use the new schema and ensure that the data already produced using the old schemas are not available to consumers, and then upgrade the consumers.
  • | +|
  • `FULL`
  • `FULL_TRANSITIVE`
  • | Any order | It is guaranteed that consumers using the old schema can read data produced using the new schema and consumers using the new schema can read data produced using the old schema. Consequently, you can upgrade the producers and consumers in **any order**. | + + + + diff --git a/site2/website/versioned_docs/version-2.10.x/schema-get-started.md b/site2/website/versioned_docs/version-2.10.x/schema-get-started.md new file mode 100644 index 0000000000000..73a05d96d7f10 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/schema-get-started.md @@ -0,0 +1,102 @@ +--- +id: schema-get-started +title: Get started +sidebar_label: "Get started" +original_id: schema-get-started +--- + +This chapter introduces Pulsar schemas and explains why they are important. + +## Schema Registry + +Type safety is extremely important in any application built around a message bus like Pulsar. + +Producers and consumers need some kind of mechanism for coordinating types at the topic level to avoid various potential problems, for example, serialization and deserialization issues. + +Applications typically adopt one of the following approaches to guarantee type safety in messaging. Both approaches are available in Pulsar, and you're free to adopt one or the other or to mix and match on a per-topic basis. + +#### Note +> +> Currently, the Pulsar schema registry is only available for the [Java client](client-libraries-java.md), [Go client](client-libraries-go.md), [Python client](client-libraries-python.md), and [C++ client](client-libraries-cpp.md). + +### Client-side approach + +Producers and consumers are responsible for not only serializing and deserializing messages (which consist of raw bytes) but also "knowing" which types are being transmitted via which topics. + +If a producer is sending temperature sensor data on the topic `topic-1`, consumers of that topic will run into trouble if they attempt to parse that data as moisture sensor readings. 
+ +Producers and consumers can send and receive messages consisting of raw byte arrays and leave all type safety enforcement to the application on an "out-of-band" basis. + +### Server-side approach + +Producers and consumers inform the system which data types can be transmitted via the topic. + +With this approach, the messaging system enforces type safety and ensures that producers and consumers remain synced. + +Pulsar has a built-in **schema registry** that enables clients to upload data schemas on a per-topic basis. Those schemas dictate which data types are recognized as valid for that topic. + +## Why use schema + +When a schema is enabled, Pulsar does parse data, it takes bytes as inputs and sends bytes as outputs. While data has meaning beyond bytes, you need to parse data and might encounter parse exceptions which mainly occur in the following situations: + +* The field does not exist + +* The field type has changed (for example, `string` is changed to `int`) + +There are a few methods to prevent and overcome these exceptions. For example, you can catch exceptions when parsing errors occur, which makes code hard to maintain; or you can adopt a schema management system that performs schema evolution without breaking downstream applications and enforces type safety to the maximum extent in the language you are using. The solution is Pulsar Schema. + +Pulsar schema enables you to use language-specific types of data when constructing and handling messages from simple types like `string` to more complex application-specific types. + +**Example** + +You can use the _User_ class to define the messages sent to Pulsar topics. + +``` + +public class User { + String name; + int age; +} + +``` + +When constructing a producer with the _User_ class, you can specify a schema or not as below. + +### Without schema + +If you construct a producer without specifying a schema, then the producer can only produce messages of type `byte[]`. 
If you have a POJO class, you need to serialize the POJO into bytes before sending messages. + +**Example** + +``` + +Producer producer = client.newProducer() + .topic(topic) + .create(); +User user = new User("Tom", 28); +byte[] message = … // serialize the `user` by yourself; +producer.send(message); + +``` + +### With schema + +If you construct a producer with specifying a schema, then you can send a class to a topic directly without worrying about how to serialize POJOs into bytes. + +**Example** + +This example constructs a producer with the _JSONSchema_, and you can send the _User_ class to topics directly without worrying about how to serialize it into bytes. + +``` + +Producer producer = client.newProducer(JSONSchema.of(User.class)) + .topic(topic) + .create(); +User user = new User("Tom", 28); +producer.send(user); + +``` + +### Summary + +When constructing a producer with a schema, you do not need to serialize messages into bytes, instead Pulsar schema does this job in the background. diff --git a/site2/website/versioned_docs/version-2.10.x/schema-manage.md b/site2/website/versioned_docs/version-2.10.x/schema-manage.md new file mode 100644 index 0000000000000..e62818c7e823f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/schema-manage.md @@ -0,0 +1,850 @@ +--- +id: schema-manage +title: Manage schema +sidebar_label: "Manage schema" +original_id: schema-manage +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide demonstrates the ways to manage schemas: + +* Automatically + + * [Schema AutoUpdate](#schema-autoupdate) + +* Manually + + * [Schema manual management](#schema-manual-management) + + * [Custom schema storage](#custom-schema-storage) + +## Schema AutoUpdate + +If a schema passes the schema compatibility check, Pulsar producer automatically updates this schema to the topic it produces by default. 
+ +### AutoUpdate for producer + +For a producer, the `AutoUpdate` happens in the following cases: + +* If a **topic doesn’t have a schema**, Pulsar registers a schema automatically. + +* If a **topic has a schema**: + + * If a **producer doesn’t carry a schema**: + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **disabled** in the namespace to which the topic belongs, the producer is allowed to connect to the topic and produce data. + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **enabled** in the namespace to which the topic belongs, the producer is rejected and disconnected. + + * If a **producer carries a schema**: + + A broker performs the compatibility check based on the configured compatibility check strategy of the namespace to which the topic belongs. + + * If the schema is registered, a producer is connected to a broker. + + * If the schema is not registered: + + * If `isAllowAutoUpdateSchema` is set to **false**, the producer is not allowed to connect to a broker. + + * If `isAllowAutoUpdateSchema` is set to **true**: + + * If the schema passes the compatibility check, then the broker registers a new schema automatically for the topic and the producer is connected. + + * If the schema does not pass the compatibility check, then the broker does not register a schema and the producer is not allowed to connect to a broker. + +![AutoUpdate Producer](/assets/schema-producer.png) + +### AutoUpdate for consumer + +For a consumer, the `AutoUpdate` happens in the following cases: + +* If a **consumer connects to a topic without a schema** (which means the consumer receives raw bytes), the consumer can connect to the topic successfully without doing any compatibility check. + +* If a **consumer connects to a topic with a schema**: 
+ + * If a topic does not have all of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` is set to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` is set to **false**, then the consumer is not allowed to connect to a broker. + + * If a topic has one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is not allowed to connect to the broker. + +![AutoUpdate Consumer](/assets/schema-consumer.png) + + +### Manage AutoUpdate strategy + +You can use the `pulsar-admin` command to manage the `AutoUpdate` strategy as below: + +* [Enable AutoUpdate](#enable-autoupdate) + +* [Disable AutoUpdate](#disable-autoupdate) + +* [Adjust compatibility](#adjust-compatibility) + +#### Enable AutoUpdate + +To enable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --enable tenant/namespace + +``` + +#### Disable AutoUpdate + +To disable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --disable tenant/namespace + +``` + +Once the `AutoUpdate` is disabled, you can only register a new schema using the `pulsar-admin` command. + +#### Adjust compatibility + +To adjust the schema compatibility level on a namespace, you can use the `pulsar-admin` command. 
+ +```bash + +bin/pulsar-admin namespaces set-schema-compatibility-strategy --compatibility tenant/namespace + +``` + +### Schema validation + +By default, `schemaValidationEnforced` is **disabled** for producers: + +* This means a producer without a schema can produce any kind of message to a topic with schemas, which may result in producing trash data to the topic. + +* This allows non-Java clients that don’t support schemas to produce messages to a topic with schemas. + +However, if you want a stronger guarantee on the topics with schemas, you can enable `schemaValidationEnforced` across the whole cluster or on a per-namespace basis. + +#### Enable schema validation + +To enable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-schema-validation-enforce --enable tenant/namespace + +``` + +#### Disable schema validation + +To disable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-schema-validation-enforce --disable tenant/namespace + +``` + +## Schema manual management + +To manage schemas, you can use one of the following methods. + +| Method | Description | +| --- | --- | +| **Admin CLI**
  • | You can use the `pulsar-admin` tool to manage Pulsar schemas, brokers, clusters, sources, sinks, topics, tenants and so on. For more information about how to use the `pulsar-admin` tool, see [here](reference-pulsar-admin.md). | +| **REST API**
  • | Pulsar exposes schema related management API in Pulsar’s admin RESTful API. You can access the admin RESTful endpoint directly to manage schemas. For more information about how to use the Pulsar REST API, see [here](/admin-rest-api/). | +| **Java Admin API**
  • | Pulsar provides Java admin library. | + +### Upload a schema + +To upload (register) a new schema for a topic, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `upload` subcommand. + +```bash + +$ pulsar-admin schemas upload --filename + +``` + +The `schema-definition-file` is in JSON format. + +```json + +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} + +``` + +The `schema-definition-file` includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +Here are examples of the `schema-definition-file` for a JSON schema. + +**Example 1** + +```json + +{ + "type": "JSON", + "schema": "{\"type\":\"record\",\"name\":\"User\",\"namespace\":\"com.foo\",\"fields\":[{\"name\":\"file1\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"file2\",\"type\":\"string\",\"default\":null},{\"name\":\"file3\",\"type\":[\"null\",\"string\"],\"default\":\"dfdf\"}]}", + "properties": {} +} + +``` + +**Example 2** + +```json + +{ + "type": "STRING", + "schema": "", + "properties": { + "key1": "value1" + } +} + +``` + +
    + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/uploadSchema?version=@pulsar:version_number@} + +The post payload is in JSON format. + +```json + +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} + +``` + +The post payload includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
    + + +```java + +void createSchema(String topic, PostSchemaPayload schemaPayload) + +``` + +The `PostSchemaPayload` includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `PostSchemaPayload`: + +```java + +PulsarAdmin admin = …; + +PostSchemaPayload payload = new PostSchemaPayload(); +payload.setType("INT8"); +payload.setSchema(""); + +admin.createSchema("my-tenant/my-ns/my-topic", payload); + +``` + +
    + +
    +```` + +### Get a schema (latest) + +To get the latest schema for a topic, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `get` subcommand. + +```bash + +$ pulsar-admin schemas get + +{ + "version": 0, + "type": "String", + "timestamp": 0, + "data": "string", + "properties": { + "property1": "string", + "property2": "string" + } +} + +``` + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/getSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} + +``` + +The response includes the following fields: + +| Field | Description | +| --- | --- | +| `version` | The schema version, which is a long number. | +| `type` | The schema type. | +| `timestamp` | The timestamp of creating this version of schema. | +| `data` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
    + + +```java + +SchemaInfo getSchema(String topic) + +``` + +The `SchemaInfo` includes the following fields: + +| Field | Description | +| --- | --- | +| `name` | The schema name. | +| `type` | The schema type. | +| `schema` | A byte array of the schema definition data, which is encoded in UTF-8 charset.
  • If the schema is a
  • **primitive**
  • schema, this byte array should be empty.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition converted to a byte array.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `SchemaInfo`: + +```java + +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic"); + +``` + +
    + +
    +```` + +### Get a schema (specific) + +To get a specific version of a schema, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `get` subcommand. + +```bash + +$ pulsar-admin schemas get --version= + +``` + + + + +Send a `GET` request to a schema endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema/:version|operation/getSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} + +``` + +The response includes the following fields: + +| Field | Description | +| --- | --- | +| `version` | The schema version, which is a long number. | +| `type` | The schema type. | +| `timestamp` | The timestamp of creating this version of schema. | +| `data` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
    + + +```java + +SchemaInfo getSchema(String topic, long version) + +``` + +The `SchemaInfo` includes the following fields: + +| Field | Description | +| --- | --- | +| `name` | The schema name. | +| `type` | The schema type. | +| `schema` | A byte array of the schema definition data, which is encoded in UTF-8.
  • If the schema is a
  • **primitive**
  • schema, this byte array should be empty.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition converted to a byte array.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `SchemaInfo`: + +```java + +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic", 1L); + +``` + +
    + +
    +```` + +### Extract a schema + +To provide a schema via a topic, you can use the following method. + +````mdx-code-block + + + + +Use the `extract` subcommand. + +```bash + +$ pulsar-admin schemas extract --classname --jar --type + +``` + + + + +```` + +### Delete a schema + +To delete a schema for a topic, you can use one of the following methods. + +:::note + +In any case, the **delete** action deletes **all versions** of a schema registered for a topic. + +::: + +````mdx-code-block + + + + +Use the `delete` subcommand. + +```bash + +$ pulsar-admin schemas delete + +``` + + + + +Send a `DELETE` request to a schema endpoint: {@inject: endpoint|DELETE|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/deleteSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", +} + +``` + +The response includes the following field: + +Field | Description | +---|---| +`version` | The schema version, which is a long number. | + + + + +```java + +void deleteSchema(String topic) + +``` + +Here is an example of deleting a schema. + +```java + +PulsarAdmin admin = …; + +admin.deleteSchema("my-tenant/my-ns/my-topic"); + +``` + + + + +```` + +## Custom schema storage + +By default, Pulsar stores various data types of schemas in [Apache BookKeeper](https://bookkeeper.apache.org) deployed alongside Pulsar. + +However, you can use another storage system if needed. 
+ +### Implement + +To use a non-default (non-BookKeeper) storage system for Pulsar schemas, you need to implement the following Java interfaces: + +* [SchemaStorage interface](#schemastorage-interface) + +* [SchemaStorageFactory interface](#schemastoragefactory-interface) + +#### SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java + +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} + +``` + +:::tip + +For a complete example of **schema storage** implementation, see [BookKeeperSchemaStorage](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. + +::: + +#### SchemaStorageFactory interface + +The `SchemaStorageFactory` interface has the following method: + +```java + +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} + +``` + +:::tip + +For a complete example of **schema storage factory** implementation, see [BookKeeperSchemaStorageFactory](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +::: + +### Deploy + +To use your custom schema storage implementation, perform the following steps. + +1. 
Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. + +2. Add the JAR file to the `lib` folder in your Pulsar binary or source distribution. + +3. Change the `schemaRegistryStorageClassName` configuration in `broker.conf` to your custom factory class. + +4. Start Pulsar. + +## Set schema compatibility check strategy + +You can set [schema compatibility check strategy](schema-evolution-compatibility.md#schema-compatibility-check-strategy) at the topic, namespace or broker level. + +The schema compatibility check strategy set at different levels has priority: topic level > namespace level > broker level. + +- If you set the strategy at both topic and namespace level, it uses the topic-level strategy. + +- If you set the strategy at both namespace and broker level, it uses the namespace-level strategy. + +- If you do not set the strategy at any level, it uses the `FULL` strategy. For all available values, see [here](schema-evolution-compatibility.md#schema-compatibility-check-strategy). + + +### Topic level + +To set a schema compatibility check strategy at the topic level, use one of the following methods. + +````mdx-code-block + + + + +Use the [`pulsar-admin topicPolicies set-schema-compatibility-strategy`](/tools/pulsar-admin/) command. + +```shell + +pulsar-admin topicPolicies set-schema-compatibility-strategy + +``` + + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v2/topics/:tenant/:namespace/:topic|operation/schemaCompatibilityStrategy?version=@pulsar:version_number@} + + + + +```java + +void setSchemaCompatibilityStrategy(String topic, SchemaCompatibilityStrategy strategy) + +``` + +Here is an example of setting a schema compatibility check strategy at the topic level. + +```java + +PulsarAdmin admin = …; + +admin.topicPolicies().setSchemaCompatibilityStrategy("my-tenant/my-ns/my-topic", SchemaCompatibilityStrategy.ALWAYS_INCOMPATIBLE); + +``` + + + + +```` +
    +To get the topic-level schema compatibility check strategy, use one of the following methods. + +````mdx-code-block + + + + +Use the [`pulsar-admin topicPolicies get-schema-compatibility-strategy`](/tools/pulsar-admin/) command. + +```shell + +pulsar-admin topicPolicies get-schema-compatibility-strategy + +``` + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic|operation/schemaCompatibilityStrategy?version=@pulsar:version_number@} + + + + +```java + +SchemaCompatibilityStrategy getSchemaCompatibilityStrategy(String topic, boolean applied) + +``` + +Here is an example of getting the topic-level schema compatibility check strategy. + +```java + +PulsarAdmin admin = …; + +// get the current applied schema compatibility strategy +admin.topicPolicies().getSchemaCompatibilityStrategy("my-tenant/my-ns/my-topic", true); + +// only get the schema compatibility strategy from topic policies +admin.topicPolicies().getSchemaCompatibilityStrategy("my-tenant/my-ns/my-topic", false); + +``` + + + + +```` +
    +To remove the topic-level schema compatibility check strategy, use one of the following methods. + +````mdx-code-block + + + + +Use the [`pulsar-admin topicPolicies remove-schema-compatibility-strategy`](/tools/pulsar-admin/) command. + +```shell + +pulsar-admin topicPolicies remove-schema-compatibility-strategy + +``` + + + + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic|operation/schemaCompatibilityStrategy?version=@pulsar:version_number@} + + + + +```java + +void removeSchemaCompatibilityStrategy(String topic) + +``` + +Here is an example of removing the topic-level schema compatibility check strategy. + +```java + +PulsarAdmin admin = …; + +admin.topicPolicies().removeSchemaCompatibilityStrategy("my-tenant/my-ns/my-topic"); + +``` + + + + +```` + + +### Namespace level + +You can set schema compatibility check strategy at namespace level using one of the following methods. + +````mdx-code-block + + + + +Use the [`pulsar-admin namespaces set-schema-compatibility-strategy`](/tools/pulsar-admin/) command. + +```shell + +pulsar-admin namespaces set-schema-compatibility-strategy options + +``` + + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace|operation/schemaCompatibilityStrategy?version=@pulsar:version_number@} + + + + +Use the [`setSchemaCompatibilityStrategy`](/api/admin/) method. + +```java + +admin.namespaces().setSchemaCompatibilityStrategy("test", SchemaCompatibilityStrategy.FULL); + +``` + + + + +```` + +### Broker level + +You can set schema compatibility check strategy at broker level by setting `schemaCompatibilityStrategy` in [`broker.conf`](https://github.com/apache/pulsar/blob/f24b4890c278f72a67fe30e7bf22dc36d71aac6a/conf/broker.conf#L1240) or [`standalone.conf`](https://github.com/apache/pulsar/blob/master/conf/standalone.conf) file. 
+ +**Example** + +``` + +schemaCompatibilityStrategy=ALWAYS_INCOMPATIBLE + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/schema-understand.md b/site2/website/versioned_docs/version-2.10.x/schema-understand.md new file mode 100644 index 0000000000000..55bc662c66633 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/schema-understand.md @@ -0,0 +1,576 @@ +--- +id: schema-understand +title: Understand schema +sidebar_label: "Understand schema" +original_id: schema-understand +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This chapter explains the basic concepts of Pulsar schema, focuses on the topics of particular importance, and provides additional background. + +## SchemaInfo + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +The `SchemaInfo` is stored and enforced on a per-topic basis and cannot be stored at the namespace or tenant level. + +A `SchemaInfo` consists of the following fields: + +| Field | Description | +| --- | --- | +| `name` | Schema name (a string). | +| `type` | Schema type, which determines how to interpret the schema data.
  • Predefined schema: see [here](schema-understand.md#schema-type).
  • Customized schema: it is left as an empty string.
  • | +| `schema`(`payload`) | Schema data, which is a sequence of 8-bit unsigned bytes and schema-type specific. | +| `properties` | User-defined properties stored as a string/string map. Applications can use this bag to carry any application-specific logic. Possible properties might be the Git hash associated with the schema, an environment string like `dev` or `prod`. | + +**Example** + +This is the `SchemaInfo` of a string. + +```json + +{ + "name": "test-string-schema", + "type": "STRING", + "schema": "", + "properties": {} +} + +``` + +## Schema type + +Pulsar supports various schema types, which are mainly divided into two categories: + +* Primitive type + +* Complex type + +### Primitive type + +Currently, Pulsar supports the following primitive types: + +| Primitive Type | Description | +|---|---| +| `BOOLEAN` | A binary value | +| `INT8` | An 8-bit signed integer | +| `INT16` | A 16-bit signed integer | +| `INT32` | A 32-bit signed integer | +| `INT64` | A 64-bit signed integer | +| `FLOAT` | A single precision (32-bit) IEEE 754 floating-point number | +| `DOUBLE` | A double-precision (64-bit) IEEE 754 floating-point number | +| `BYTES` | A sequence of 8-bit unsigned bytes | +| `STRING` | A Unicode character sequence | +| `TIMESTAMP` (`DATE`, `TIME`) | A logical type that represents a specific instant in time with millisecond precision.
    It stores the number of milliseconds since `January 1, 1970, 00:00:00 GMT` as an `INT64` value | +| INSTANT | A single instantaneous point on the time-line with nanoseconds precision| +| LOCAL_DATE | An immutable date-time object that represents a date, often viewed as year-month-day| +| LOCAL_TIME | An immutable date-time object that represents a time, often viewed as hour-minute-second. Time is represented to nanosecond precision.| +| LOCAL_DATE_TIME | An immutable date-time object that represents a date-time, often viewed as year-month-day-hour-minute-second | + +For primitive types, Pulsar does not store any schema data in `SchemaInfo`. The `type` in `SchemaInfo` is used to determine how to serialize and deserialize the data. + +Some of the primitive schema implementations can use `properties` to store implementation-specific tunable settings. For example, a `string` schema can use `properties` to store the encoding charset to serialize and deserialize strings. + +The conversions between **Pulsar schema types** and **language-specific primitive types** are as below. + +| Schema Type | Java Type| Python Type | Go Type | +|---|---|---|---| +| BOOLEAN | boolean | bool | bool | +| INT8 | byte | | int8 | +| INT16 | short | | int16 | +| INT32 | int | | int32 | +| INT64 | long | | int64 | +| FLOAT | float | float | float32 | +| DOUBLE | double | float | float64| +| BYTES | byte[], ByteBuffer, ByteBuf | bytes | []byte | +| STRING | String | str | string| +| TIMESTAMP | java.sql.Timestamp | | | +| TIME | java.sql.Time | | | +| DATE | java.util.Date | | | +| INSTANT | java.time.Instant | | | +| LOCAL_DATE | java.time.LocalDate | | | +| LOCAL_TIME | java.time.LocalTime | | | +| LOCAL_DATE_TIME | java.time.LocalDateTime | | | + +**Example** + +This example demonstrates how to use a string schema. + +1. Create a producer with a string schema and send messages. 
+ + ```java + + Producer producer = client.newProducer(Schema.STRING).create(); + producer.newMessage().value("Hello Pulsar!").send(); + + ``` + +2. Create a consumer with a string schema and receive messages. + + ```java + + Consumer consumer = client.newConsumer(Schema.STRING).subscribe(); + consumer.receive(); + + ``` + +### Complex type + +Currently, Pulsar supports the following complex types: + +| Complex Type | Description | +|---|---| +| `keyvalue` | Represents a complex type of a key/value pair. | +| `struct` | Handles structured data. It supports `AvroBaseStructSchema` and `ProtobufNativeSchema`. | + +#### keyvalue + +`Keyvalue` schema helps applications define schemas for both key and value. + +For `SchemaInfo` of `keyvalue` schema, Pulsar stores the `SchemaInfo` of key schema and the `SchemaInfo` of value schema together. + +Pulsar provides the following methods to encode a key/value pair in messages: + +* `INLINE` + +* `SEPARATED` + +You can choose the encoding type when constructing the key/value schema. + +````mdx-code-block + + + + +Key/value pairs are encoded together in the message payload. + + + + +Key is encoded in the message key and the value is encoded in the message payload. + +**Example** + +This example shows how to construct a key/value schema and then use it to produce and consume messages. + +1. Construct a key/value schema with `INLINE` encoding type. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.INLINE + ); + + ``` + +2. Optionally, construct a key/value schema with `SEPARATED` encoding type. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + ``` + +3. Produce messages using a key/value schema. 
+ + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Producer> producer = client.newProducer(kvSchema) + .topic(TOPIC) + .create(); + + final int key = 100; + final String value = "value-100"; + + // send the key/value message + producer.newMessage() + .value(new KeyValue(key, value)) + .send(); + + ``` + +4. Consume messages using a key/value schema. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Consumer> consumer = client.newConsumer(kvSchema) + ... + .topic(TOPIC) + .subscriptionName(SubscriptionName).subscribe(); + + // receive key/value pair + Message> msg = consumer.receive(); + KeyValue kv = msg.getValue(); + + ``` + + + + +```` + +#### struct + +This section describes the details of type and usage of the `struct` schema. + +##### Type + +`struct` schema supports `AvroBaseStructSchema` and `ProtobufNativeSchema`. + +|Type|Description| +---|---| +`AvroBaseStructSchema`|Pulsar uses [Avro Specification](http://avro.apache.org/docs/current/spec.html) to declare the schema definition for `AvroBaseStructSchema`, which supports `AvroSchema`, `JsonSchema`, and `ProtobufSchema`.

    This allows Pulsar:
    - to use the same tools to manage schema definitions
    - to use different serialization or deserialization methods to handle data| +`ProtobufNativeSchema`|`ProtobufNativeSchema` is based on protobuf native Descriptor.

    This allows Pulsar:
    - to use native protobuf-v3 to serialize or deserialize data
    - to use `AutoConsume` to deserialize data. + +##### Usage + +Pulsar provides the following methods to use the `struct` schema: + +* `static` + +* `generic` + +* `SchemaDefinition` + +````mdx-code-block + + + + +You can predefine the `struct` schema, which can be a POJO in Java, a `struct` in Go, or classes generated by Avro or Protobuf tools. + +**Example** + +Pulsar gets the schema definition from the predefined `struct` using an Avro library. The schema definition is the schema data stored as a part of the `SchemaInfo`. + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```java + + @Builder + @AllArgsConstructor + @NoArgsConstructor + public static class User { + String name; + int age; + } + + ``` + +2. Create a producer with a `struct` schema and send messages. + + ```java + + Producer producer = client.newProducer(Schema.AVRO(User.class)).create(); + producer.newMessage().value(User.builder().name("pulsar-user").age(1).build()).send(); + + ``` + +3. Create a consumer with a `struct` schema and receive messages + + ```java + + Consumer consumer = client.newConsumer(Schema.AVRO(User.class)).subscribe(); + User user = consumer.receive(); + + ``` + + + + +Sometimes applications do not have pre-defined structs, and you can use this method to define schema and access data. + +You can define the `struct` schema using the `GenericSchemaBuilder`, generate a generic struct using `GenericRecordBuilder` and consume messages into `GenericRecord`. + +**Example** + +1. Use `RecordSchemaBuilder` to build a schema. + + ```java + + RecordSchemaBuilder recordSchemaBuilder = SchemaBuilder.record("schemaName"); + recordSchemaBuilder.field("intField").type(SchemaType.INT32); + SchemaInfo schemaInfo = recordSchemaBuilder.build(SchemaType.AVRO); + + Producer producer = client.newProducer(Schema.generic(schemaInfo)).create(); + + ``` + +2. Use `RecordBuilder` to build the struct records. 
+ + ```java + + producer.newMessage().value(schema.newRecordBuilder() + .set("intField", 32) + .build()).send(); + + ``` + + + + +You can define the `schemaDefinition` to generate a `struct` schema. + +**Example** + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```java + + @Builder + @AllArgsConstructor + @NoArgsConstructor + public static class User { + String name; + int age; + } + + ``` + +2. Create a producer with a `SchemaDefinition` and send messages. + + ```java + + SchemaDefinition schemaDefinition = SchemaDefinition.builder().withPojo(User.class).build(); + Producer producer = client.newProducer(Schema.AVRO(schemaDefinition)).create(); + producer.newMessage().value(User.builder().name("pulsar-user").age(1).build()).send(); + + ``` + +3. Create a consumer with a `SchemaDefinition` schema and receive messages + + ```java + + SchemaDefinition schemaDefinition = SchemaDefinition.builder().withPojo(User.class).build(); + Consumer consumer = client.newConsumer(Schema.AVRO(schemaDefinition)).subscribe(); + User user = consumer.receive().getValue(); + + ``` + + + + +```` + +### Auto Schema + +If you don't know the schema type of a Pulsar topic in advance, you can use AUTO schema to produce or consume generic records to or from brokers. + +| Auto Schema Type | Description | +|---|---| +| `AUTO_PRODUCE` | This is useful for transferring data **from a producer to a Pulsar topic that has a schema**. | +| `AUTO_CONSUME` | This is useful for transferring data **from a Pulsar topic that has a schema to a consumer**. | + +#### AUTO_PRODUCE + +`AUTO_PRODUCE` schema helps a producer validate whether the bytes sent by the producer is compatible with the schema of a topic. + +**Example** + +Suppose that: + +* You have a producer processing messages from a Kafka topic _K_. + +* You have a Pulsar topic _P_, and you do not know its schema type. + +* Your application reads the messages from _K_ and writes the messages to _P_. 
+ +In this case, you can use `AUTO_PRODUCE` to verify whether the bytes produced by _K_ can be sent to _P_ or not. + +```java + +Producer<byte[]> pulsarProducer = client.newProducer(Schema.AUTO_PRODUCE()) + … + .create(); + +byte[] kafkaMessageBytes = … ; + +pulsarProducer.send(kafkaMessageBytes); + +``` + +#### AUTO_CONSUME + +`AUTO_CONSUME` schema helps a Pulsar topic validate whether the bytes sent by a Pulsar topic are compatible with a consumer, that is, the Pulsar topic deserializes messages into language-specific objects using the `SchemaInfo` retrieved from broker-side. + +Currently, `AUTO_CONSUME` supports AVRO, JSON and ProtobufNativeSchema schemas. It deserializes messages into `GenericRecord`. + +**Example** + +Suppose that: + +* You have a Pulsar topic _P_. + +* You have a consumer (for example, MySQL) receiving messages from the topic _P_. + +* Your application reads the messages from _P_ and writes the messages to MySQL. + +In this case, you can use `AUTO_CONSUME` to verify whether the bytes produced by _P_ can be sent to MySQL or not. + +```java + +Consumer<GenericRecord> pulsarConsumer = client.newConsumer(Schema.AUTO_CONSUME()) + … + .subscribe(); + +Message<GenericRecord> msg = pulsarConsumer.receive(); +GenericRecord record = msg.getValue(); + +``` + +### Native Avro Schema + +When migrating or ingesting event or message data from external systems (such as Kafka and Cassandra), the events are often already serialized in Avro format. The applications producing the data typically have validated the data against their schemas (including compatibility checks) and stored them in a database or a dedicated service (such as a schema registry). The schema of each serialized data record is usually retrievable by some metadata attached to that record. In such cases, a Pulsar producer doesn't need to repeat the schema validation step when sending the ingested events to a topic. All it needs to do is to pass each message or event with its schema to Pulsar.
+ +Hence, we provide `Schema.NATIVE_AVRO` to wrap a native Avro schema of type `org.apache.avro.Schema`. The result is a schema instance of Pulsar that accepts a serialized Avro payload without validating it against the wrapped Avro schema. + +**Example** + +```java + +org.apache.avro.Schema nativeAvroSchema = … ; + +Producer producer = pulsarClient.newProducer().topic("ingress").create(); + +byte[] content = … ; + +producer.newMessage(Schema.NATIVE_AVRO(nativeAvroSchema)).value(content).send(); + +``` + +## Schema version + +Each `SchemaInfo` stored with a topic has a version. Schema version manages schema changes happening within a topic. + +Messages produced with a given `SchemaInfo` is tagged with a schema version, so when a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and then use the `SchemaInfo` to deserialize data. + +Schemas are versioned in succession. Schema storage happens in a broker that handles the associated topics so that version assignments can be made. + +Once a version is assigned/fetched to/for a schema, all subsequent messages produced by that producer are tagged with the appropriate version. + +**Example** + +The following example illustrates how the schema version works. + +Suppose that a Pulsar [Java client](client-libraries-java.md) created using the code below attempts to connect to Pulsar and begins to send messages: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-data") + .sendTimeout(3, TimeUnit.SECONDS) + .create(); + +``` + +The table below lists the possible scenarios when this connection attempt occurs and what happens in each scenario: + +| Scenario | What happens | +| --- | --- | +|
  • No schema exists for the topic.
  • | (1) The producer is created using the given schema. (2) Since no existing schema is compatible with the `SensorReading` schema, the schema is transmitted to the broker and stored. (3) Any consumer created using the same schema or topic can consume messages from the `sensor-data` topic. | +|
  • A schema already exists.
  • The producer connects using the same schema that is already stored.
  • | (1) The schema is transmitted to the broker. (2) The broker determines that the schema is compatible. (3) The broker attempts to store the schema in [BookKeeper](concepts-architecture-overview.md#persistent-storage) but then determines that it's already stored, so it is used to tag produced messages. |
  • A schema already exists.
  • The producer connects using a new schema that is compatible.
  • | (1) The schema is transmitted to the broker. (2) The broker determines that the schema is compatible and stores the new schema as the current version (with a new version number). | + +## How does schema work + +Pulsar schemas are applied and enforced at the **topic** level (schemas cannot be applied at the namespace or tenant level). + +Producers and consumers upload schemas to brokers, so Pulsar schemas work on the producer side and the consumer side. + +### Producer side + +This diagram illustrates how does schema work on the Producer side. + +![Schema works at the producer side](/assets/schema-producer.png) + +1. The application uses a schema instance to construct a producer instance. + + The schema instance defines the schema for the data being produced using the producer instance. + + Take AVRO as an example, Pulsar extracts schema definition from the POJO class and constructs the `SchemaInfo` that the producer needs to pass to a broker when it connects. + +2. The producer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker looks up the schema in the schema storage to check if it is already a registered schema. + +4. If yes, the broker skips the schema validation since it is a known schema, and returns the schema version to the producer. + +5. If no, the broker verifies whether a schema can be automatically created in this namespace: + + * If `isAllowAutoUpdateSchema` sets to **true**, then a schema can be created, and the broker validates the schema based on the schema compatibility check strategy defined for the topic. + + * If `isAllowAutoUpdateSchema` sets to **false**, then a schema can not be created, and the producer is rejected to connect to the broker. + +**Tip**: + +`isAllowAutoUpdateSchema` can be set via **Pulsar admin API** or **REST API.** + +For how to set `isAllowAutoUpdateSchema` via Pulsar admin API, see [Manage AutoUpdate Strategy](schema-manage.md/#manage-autoupdate-strategy). 
+ +6. If the schema is allowed to be updated, then the compatible strategy check is performed. + + * If the schema is compatible, the broker stores it and returns the schema version to the producer. + + All the messages produced by this producer are tagged with the schema version. + + * If the schema is incompatible, the broker rejects it. + +### Consumer side + +This diagram illustrates how does Schema work on the consumer side. + +![Schema works at the consumer side](/assets/schema-consumer.png) + +1. The application uses a schema instance to construct a consumer instance. + + The schema instance defines the schema that the consumer uses for decoding messages received from a broker. + +2. The consumer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker determines whether the topic has one of them (a schema/data/a local consumer and a local producer). + +4. If a topic does not have all of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` sets to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` sets to **false**, then the consumer is rejected to connect to a broker. + +5. If a topic has one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is rejected to connect to the broker. + +6. The consumer receives messages from the broker. + + If the schema used by the consumer supports schema versioning (for example, AVRO schema), the consumer fetches the `SchemaInfo` of the version tagged in messages and uses the passed-in schema and the schema tagged in messages to decode the messages. 
+ diff --git a/site2/website/versioned_docs/version-2.10.x/security-athenz.md b/site2/website/versioned_docs/version-2.10.x/security-athenz.md new file mode 100644 index 0000000000000..8a39fe25316d0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-athenz.md @@ -0,0 +1,98 @@ +--- +id: security-athenz +title: Authentication using Athenz +sidebar_label: "Authentication using Athenz" +original_id: security-athenz +--- + +[Athenz](https://github.com/AthenZ/athenz) is a role-based authentication/authorization system. In Pulsar, you can use Athenz role tokens (also known as *z-tokens*) to establish the identity of the client. + +## Athenz authentication settings + +A [decentralized Athenz system](https://github.com/AthenZ/athenz/blob/master/docs/decent_authz_flow.md) contains an [authori**Z**ation **M**anagement **S**ystem](https://github.com/AthenZ/athenz/blob/master/docs/setup_zms.md) (ZMS) server and an [authori**Z**ation **T**oken **S**ystem](https://github.com/AthenZ/athenz/blob/master/docs/setup_zts) (ZTS) server. + +To begin, you need to set up Athenz service access control. You need to create domains for the *provider* (which provides some resources to other services with some authentication/authorization policies) and the *tenant* (which is provisioned to access some resources in a provider). In this case, the provider corresponds to the Pulsar service itself and the tenant corresponds to each application using Pulsar (typically, a [tenant](reference-terminology.md#tenant) in Pulsar). + +### Create the tenant domain and service + +On the [tenant](reference-terminology.md#tenant) side, you need to do the following things: + +1. Create a domain, such as `shopping` +2. Generate a private/public key pair +3.
Create a service, such as `some_app`, on the domain with the public key + +Note that you need to specify the private key generated in step 2 when the Pulsar client connects to the [broker](reference-terminology.md#broker) (see client configuration examples for [Java](client-libraries-java.md#tls-authentication) and [C++](client-libraries-cpp.md#tls-authentication)). + +For more specific steps involving the Athenz UI, refer to [Example Service Access Control Setup](https://github.com/AthenZ/athenz/blob/master/docs/example_service_athenz_setup.md#client-tenant-domain). + +### Create the provider domain and add the tenant service to some role members + +On the provider side, you need to do the following things: + +1. Create a domain, such as `pulsar` +2. Create a role +3. Add the tenant service to members of the role + +Note that you can specify any action and resource in step 2 since they are not used on Pulsar. In other words, Pulsar uses the Athenz role token only for authentication, *not* for authorization. + +For more specific steps involving UI, refer to [Example Service Access Control Setup](https://github.com/AthenZ/athenz/blob/master/docs/example_service_athenz_setup.md#server-provider-domain). + +## Configure the broker for Athenz + +> ### TLS encryption +> +> Note that when you are using Athenz as an authentication provider, you had better use TLS encryption +> as it can protect role tokens from being intercepted and reused. (for more details involving TLS encryption see [Architecture - Data Model](https://github.com/AthenZ/athenz/blob/master/docs/data_model)). + +In the `conf/broker.conf` configuration file in your Pulsar installation, you need to provide the class name of the Athenz authentication provider as well as a comma-separated list of provider domain names. 
+ +```properties + +# Add the Athenz auth provider +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderAthenz +athenzDomainNames=pulsar + +# Enable TLS +tlsEnabled=true +tlsCertificateFilePath=/path/to/broker-cert.pem +tlsKeyFilePath=/path/to/broker-key.pem + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +brokerClientAuthenticationParameters={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} + +``` + +> A full listing of parameters is available in the `conf/broker.conf` file, you can also find the default +> values for those parameters in [Broker Configuration](reference-configuration.md#broker). + +## Configure clients for Athenz + +For more information on Pulsar client authentication using Athenz, see the following language-specific docs: + +* [Java client](client-libraries-java.md#athenz) + +## Configure CLI tools for Athenz + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ +You need to add the following authentication parameters to the `conf/client.conf` config file to use Athenz with CLI tools of Pulsar: + +```properties + +# URL for the broker +serviceUrl=https://broker.example.com:8443/ + +# Set Athenz auth plugin and its parameters +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +authParams={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} + +# Enable TLS +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/cacert.pem + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/security-authorization.md b/site2/website/versioned_docs/version-2.10.x/security-authorization.md new file mode 100644 index 0000000000000..9cfd7c8c203f6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-authorization.md @@ -0,0 +1,130 @@ +--- +id: security-authorization +title: Authentication and authorization in Pulsar +sidebar_label: "Authorization and ACLs" +original_id: security-authorization +--- + + +In Pulsar, the [authentication provider](security-overview.md#authentication-providers) is responsible for properly identifying clients and associating the clients with [role tokens](security-overview.md#role-tokens). If you only enable authentication, an authenticated role token has the ability to access all resources in the cluster. *Authorization* is the process that determines *what* clients are able to do. + +The role tokens with the most privileges are the *superusers*. The *superusers* can create and destroy tenants, along with having full access to all tenant resources. + +When a superuser creates a [tenant](reference-terminology.md#tenant), that tenant is assigned an admin role. A client with the admin role token can then create, modify and destroy namespaces, and grant and revoke permissions to *other role tokens* on those namespaces. 
+ +## Broker and Proxy Setup + +### Enable authorization and assign superusers +You can enable the authorization and assign the superusers in the broker ([`conf/broker.conf`](reference-configuration.md#broker)) configuration files. + +```properties + +authorizationEnabled=true +superUserRoles=my-super-user-1,my-super-user-2 + +``` + +> A full list of parameters is available in the `conf/broker.conf` file. +> You can also find the default values for those parameters in [Broker Configuration](reference-configuration.md#broker). + +Typically, you use superuser roles for administrators, clients as well as broker-to-broker authorization. When you use [geo-replication](concepts-replication.md), every broker needs to be able to publish to all the other topics of clusters. + +You can also enable the authorization for the proxy in the proxy configuration file (`conf/proxy.conf`). Once you enable the authorization on the proxy, the proxy does an additional authorization check before forwarding the request to a broker. +If you enable authorization on the broker, the broker checks the authorization of the request when the broker receives the forwarded request. + +### Proxy Roles + +By default, the broker treats the connection between a proxy and the broker as a normal user connection. The broker authenticates the user as the role configured in `proxy.conf`(see ["Enable TLS Authentication on Proxies"](security-tls-authentication.md#enable-tls-authentication-on-proxies)). However, when the user connects to the cluster through a proxy, the user rarely requires the authentication. The user expects to be able to interact with the cluster as the role for which they have authenticated with the proxy. + +Pulsar uses *Proxy roles* to enable the authentication. Proxy roles are specified in the broker configuration file, [`conf/broker.conf`](reference-configuration.md#broker). 
If a client that is authenticated with a broker is one of its ```proxyRoles```, all requests from that client must also carry information about the role of the client that is authenticated with the proxy. This information is called the *original principal*. If the *original principal* is absent, the client is not able to access anything. + +You must authorize both the *proxy role* and the *original principal* to access a resource to ensure that the resource is accessible via the proxy. Administrators can take two approaches to authorize the *proxy role* and the *original principal*. + +The more secure approach is to grant access to the proxy roles each time you grant access to a resource. For example, if you have a proxy role named `proxy1`, when the superuser creates a tenant, you should specify `proxy1` as one of the admin roles. When a role is granted permissions to produce or consume from a namespace, if that client wants to produce or consume through a proxy, you should also grant `proxy1` the same permissions. + +Another approach is to make the proxy role a superuser. This allows the proxy to access all resources. The client still needs to authenticate with the proxy, and all requests made through the proxy have their role downgraded to the *original principal* of the authenticated client. However, if the proxy is compromised, a bad actor could get full access to your cluster. + +You can specify the roles as proxy roles in [`conf/broker.conf`](reference-configuration.md#broker). + +```properties + +proxyRoles=my-proxy-role + +# if you want to allow superusers to use the proxy (see above) +superUserRoles=my-super-user-1,my-super-user-2,my-proxy-role + +``` + +## Administer tenants + +Pulsar [instance](reference-terminology.md#instance) administrators or some kind of self-service portal typically provisions a Pulsar [tenant](reference-terminology.md#tenant). + +You can manage tenants using the [`pulsar-admin`](reference-pulsar-admin.md) tool. 
+ +### Create a new tenant + +The following is an example tenant creation command: + +```shell + +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east + +``` + +This command creates a new tenant `my-tenant` that is allowed to use the clusters `us-west` and `us-east`. + +A client that successfully identifies itself as having the role `my-admin-role` is allowed to perform all administrative tasks on this tenant. + +The structure of topic names in Pulsar reflects the hierarchy between tenants, clusters, and namespaces: + +```shell + +persistent://tenant/namespace/topic + +``` + +### Manage permissions + +You can use [Pulsar Admin Tools](admin-api-permissions.md) for managing permission in Pulsar. + +### Pulsar admin authentication + +```java + +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("http://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .build(); + +``` + +To use TLS: + +```java + +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("https://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .tlsTrustCertsFilePath("/path/to/trust/cert") + .build(); + +``` + +## Authorize an authenticated client with multiple roles + +When a client is identified with multiple roles in a token (the type of role claim in the token is an array) during the authentication process, Pulsar supports to check the permissions of all the roles and further authorize the client as long as one of its roles has the required permissions. + +> **Note**
    +> This authorization method is only compatible with [JWT authentication](security-jwt.md). + +To enable this authorization method, configure the authorization provider as `MultiRolesTokenAuthorizationProvider` in the `conf/broker.conf` file. + + ```properties + + # Authorization provider fully qualified class-name + authorizationProvider=org.apache.pulsar.broker.authorization.MultiRolesTokenAuthorizationProvider + + ``` + diff --git a/site2/website/versioned_docs/version-2.10.x/security-basic-auth.md b/site2/website/versioned_docs/version-2.10.x/security-basic-auth.md new file mode 100644 index 0000000000000..5cce10fdc3fb0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-basic-auth.md @@ -0,0 +1,155 @@ +--- +id: security-basic-auth +title: Authentication using HTTP basic +sidebar_label: "Authentication using HTTP basic" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + +[Basic authentication](https://en.wikipedia.org/wiki/Basic_access_authentication) is a simple authentication scheme built into the HTTP protocol, which uses base64-encoded username and password pairs as credentials. + +## Prerequisites + +Install [`htpasswd`](https://httpd.apache.org/docs/2.4/programs/htpasswd.html) in your environment to create a password file for storing username-password pairs. + +* For Ubuntu/Debian, run the following command to install `htpasswd`. + + ``` + apt install apache2-utils + ``` + +* For CentOS/RHEL, run the following command to install `htpasswd`. + + ``` + yum install httpd-tools + ``` + +## Create your authentication file + +:::note + +Currently, you can use MD5 (recommended) and CRYPT encryption to authenticate your password. 
+ +::: + +Create a password file named `.htpasswd` with a user account `superuser/admin`: +* Use MD5 encryption (recommended): + + ``` + htpasswd -cmb /path/to/.htpasswd superuser admin + ``` + +* Use CRYPT encryption: + + ``` + htpasswd -cdb /path/to/.htpasswd superuser admin + ``` + +You can preview the content of your password file by running the following command: + +``` +cat /path/to/.htpasswd +superuser:$apr1$GBIYZYFZ$MzLcPrvoUky16mLcK6UtX/ +``` + +## Enable basic authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to the `conf/broker.conf` file. If you use a standalone Pulsar, you need to add these parameters to the `conf/standalone.conf` file. + +```conf +# Configuration to enable Basic authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderBasic +basicAuthConf=file:///path/to/.htpasswd +# basicAuthConf=/path/to/.htpasswd +# When using the base64 format, you need to encode the .htpasswd content to base64 +# basicAuthConf=data:;base64,YOUR-BASE64 +# basicAuthConf=YOUR-BASE64 +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +brokerClientAuthenticationParameters={"userId":"superuser","password":"admin"} +# If this flag is set then the broker authenticates the original Auth data +# else it just accepts the originalPrincipal and authorizes it (if required). +authenticateOriginalAuthData=true +``` + +:::note + +You can also set an environment variable named `PULSAR_EXTRA_OPTS` with the value `-Dpulsar.auth.basic.conf=/path/to/.htpasswd`. Pulsar reads this environment variable to implement HTTP basic authentication. + +::: + +## Enable basic authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to the `conf/proxy.conf` file.
+ +```conf +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderBasic +basicAuthConf=file:///path/to/.htpasswd +# basicAuthConf=/path/to/.htpasswd +# When using the base64 format, you need to encode the .htpasswd content to base64 +# basicAuthConf=data:;base64,YOUR-BASE64 +# basicAuthConf=YOUR-BASE64 +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +brokerClientAuthenticationParameters={"userId":"superuser","password":"admin"} +# Whether client authorization credentials are forwarded to the broker for re-authorization. +# Authentication must be enabled via authenticationEnabled=true for this to take effect. +forwardAuthorizationCredentials=true +``` + +:::note + +You can also set an environment variable named `PULSAR_EXTRA_OPTS` with the value `-Dpulsar.auth.basic.conf=/path/to/.htpasswd`. Pulsar reads this environment variable to implement HTTP basic authentication. + +::: + +## Configure basic authentication in CLI tools + +[Command-line tools](/docs/next/reference-cli-tools), such as [Pulsar-admin](/tools/pulsar-admin/), [Pulsar-perf](/tools/pulsar-perf/) and [Pulsar-client](/tools/pulsar-client/), use the `conf/client.conf` file in your Pulsar installation. To configure basic authentication in Pulsar CLI tools, you need to add the following parameters to the `conf/client.conf` file. + +```conf +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +authParams={"userId":"superuser","password":"admin"} +``` + + +## Configure basic authentication in Pulsar clients + +The following example shows how to configure basic authentication when using Pulsar clients.
+ + + + + ```java + AuthenticationBasic auth = new AuthenticationBasic(); + auth.configure("{\"userId\":\"superuser\",\"password\":\"admin\"}"); + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650") + .authentication(auth) + .build(); + ``` + + + + + ```c++ + #include <pulsar/Client.h> + + int main() { + pulsar::ClientConfiguration config; + pulsar::AuthenticationPtr auth = pulsar::AuthBasic::create("superuser", "admin"); + config.setAuth(auth); + pulsar::Client client("pulsar://broker.example.com:6650/", config); + + return 0; + } + ``` + + + diff --git a/site2/website/versioned_docs/version-2.10.x/security-bouncy-castle.md b/site2/website/versioned_docs/version-2.10.x/security-bouncy-castle.md new file mode 100644 index 0000000000000..be937055d8e31 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-bouncy-castle.md @@ -0,0 +1,157 @@ +--- +id: security-bouncy-castle +title: Bouncy Castle Providers +sidebar_label: "Bouncy Castle Providers" +original_id: security-bouncy-castle +--- + +## BouncyCastle Introduce + +`Bouncy Castle` is a Java library that complements the default Java Cryptographic Extension (JCE), +and it provides more cipher suites and algorithms than the default JCE provided by Sun. + +In addition to that, `Bouncy Castle` has lots of utilities for reading arcane formats like PEM and ASN.1 that no sane person would want to rewrite themselves. + +In Pulsar, security and crypto have dependencies on BouncyCastle Jars. For the detailed installing and configuring Bouncy Castle FIPS, see [BC FIPS Documentation](https://www.bouncycastle.org/documentation.html), especially the **User Guides** and **Security Policy** PDFs. + +`Bouncy Castle` provides both [FIPS](https://www.bouncycastle.org/fips_faq.html) and non-FIPS versions. However, you cannot include both versions in the same JVM; you need to exclude the current version before including the other.
+ +In Pulsar, the security and crypto methods also depends on `Bouncy Castle`, especially in [TLS Authentication](security-tls-authentication.md) and [Transport Encryption](security-encryption.md). This document contains the configuration between BouncyCastle FIPS(BC-FIPS) and non-FIPS(BC-non-FIPS) version while using Pulsar. + +## How BouncyCastle modules packaged in Pulsar + +In Pulsar's `bouncy-castle` module, We provide 2 sub modules: `bouncy-castle-bc`(for non-FIPS version) and `bouncy-castle-bcfips`(for FIPS version), to package BC jars together to make the include and exclude of `Bouncy Castle` easier. + +To achieve this goal, we will need to package several `bouncy-castle` jars together into `bouncy-castle-bc` or `bouncy-castle-bcfips` jar. +Each of the original bouncy-castle jar is related with security, so BouncyCastle dutifully supplies signed of each JAR. +But when we do the re-package, Maven shade explodes the BouncyCastle jar file which puts the signatures into META-INF, +these signatures aren't valid for this new, uber-jar (signatures are only for the original BC jar). +Usually, You will meet error like `java.lang.SecurityException: Invalid signature file digest for Manifest main attributes`. + +You could exclude these signatures in mvn pom file to avoid above error, by + +```access transformers + +META-INF/*.SF +META-INF/*.DSA +META-INF/*.RSA + +``` + +But it can also lead to new, cryptic errors, e.g. `java.security.NoSuchAlgorithmException: PBEWithSHA256And256BitAES-CBC-BC SecretKeyFactory not available` +By explicitly specifying where to find the algorithm like this: `SecretKeyFactory.getInstance("PBEWithSHA256And256BitAES-CBC-BC","BC")` +It will get the real error: `java.security.NoSuchProviderException: JCE cannot authenticate the provider BC` + +So, we used a [executable packer plugin](https://github.com/nthuemmel/executable-packer-maven-plugin) that uses a jar-in-jar approach to preserve the BouncyCastle signature in a single, executable jar. 
+ +### Include dependencies of BC-non-FIPS + +Pulsar module `bouncy-castle-bc`, which defined by `bouncy-castle/bc/pom.xml` contains the needed non-FIPS jars for Pulsar, and packaged as a jar-in-jar(need to provide `pkg`). + +```xml + + + org.bouncycastle + bcpkix-jdk15on + ${bouncycastle.version} + + + + org.bouncycastle + bcprov-ext-jdk15on + ${bouncycastle.version} + + +``` + +By using this `bouncy-castle-bc` module, you can easily include and exclude BouncyCastle non-FIPS jars. + +### Modules that include BC-non-FIPS module (`bouncy-castle-bc`) + +For Pulsar client, user need the bouncy-castle module, so `pulsar-client-original` will include the `bouncy-castle-bc` module, and have `pkg` set to reference the `jar-in-jar` package. +It is included as following example: + +```xml + + + org.apache.pulsar + bouncy-castle-bc + ${pulsar.version} + pkg + + +``` + +By default `bouncy-castle-bc` already included in `pulsar-client-original`, And `pulsar-client-original` has been included in a lot of other modules like `pulsar-client-admin`, `pulsar-broker`. +But for the above shaded jar and signatures reason, we should not package Pulsar's `bouncy-castle` module into `pulsar-client-all` other shaded modules directly, such as `pulsar-client-shaded`, `pulsar-client-admin-shaded` and `pulsar-broker-shaded`. +So in the shaded modules, we will exclude the `bouncy-castle` modules. + +```xml + + + + org.apache.pulsar:pulsar-client-original + + ** + + + org/bouncycastle/** + + + + +``` + +That means, `bouncy-castle` related jars are not shaded in these fat jars. + +### Module BC-FIPS (`bouncy-castle-bcfips`) + +Pulsar module `bouncy-castle-bcfips`, which defined by `bouncy-castle/bcfips/pom.xml` contains the needed FIPS jars for Pulsar. +Similar to `bouncy-castle-bc`, `bouncy-castle-bcfips` also packaged as a `jar-in-jar` package for easy include/exclude. 
+ +```xml + + + org.bouncycastle + bc-fips + ${bouncycastlefips.version} + + + + org.bouncycastle + bcpkix-fips + ${bouncycastlefips.version} + + +``` + +### Exclude BC-non-FIPS and include BC-FIPS + +If you want to switch from BC-non-FIPS to BC-FIPS version, Here is an example for `pulsar-broker` module: + +```xml + + + org.apache.pulsar + pulsar-broker + ${pulsar.version} + + + org.apache.pulsar + bouncy-castle-bc + + + + + + org.apache.pulsar + bouncy-castle-bcfips + ${pulsar.version} + pkg + + +``` + + +For more example, you can reference module `bcfips-include-test`. + diff --git a/site2/website/versioned_docs/version-2.10.x/security-encryption.md b/site2/website/versioned_docs/version-2.10.x/security-encryption.md new file mode 100644 index 0000000000000..793233f0181fe --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-encryption.md @@ -0,0 +1,335 @@ +--- +id: security-encryption +title: Pulsar Encryption +sidebar_label: "End-to-End Encryption" +original_id: security-encryption +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Applications can use Pulsar encryption to encrypt messages on the producer side and decrypt messages on the consumer side. You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. + +## Asymmetric and symmetric encryption + +Pulsar uses a dynamically generated symmetric AES key to encrypt messages(data). You can use the application-provided ECDSA (Elliptic Curve Digital Signature Algorithm) or RSA (Rivest–Shamir–Adleman) key pair to encrypt the AES key(data key), so you do not have to share the secret with everyone. + +Key is a public and private key pair used for encryption or decryption. The producer key is the public key of the key pair, and the consumer key is the private key of the key pair. 
+ +The application configures the producer with the public key. You can use this key to encrypt the AES data key. The encrypted data key is sent as part of message header. Only entities with the private key (in this case the consumer) are able to decrypt the data key which is used to decrypt the message. + +You can encrypt a message with more than one key. Any one of the keys used for encrypting the message is sufficient to decrypt the message. + +Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose or delete the private key, your message is irretrievably lost, and is unrecoverable. + +## Producer +![alt text](/assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer") + +## Consumer +![alt text](/assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer") + +## Get started + +1. Create your ECDSA or RSA public and private key pair by using the following commands. + * ECDSA(for Java clients only) + + ```shell + + openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem + openssl ec -in test_ecdsa_privkey.pem -pubout -outform pem -out test_ecdsa_pubkey.pem + + ``` + + * RSA (for C++, Python and Node.js clients) + + ```shell + + openssl genrsa -out test_rsa_privkey.pem 2048 + openssl rsa -in test_rsa_privkey.pem -pubout -outform pkcs8 -out test_rsa_pubkey.pem + + ``` + +2. Add the public and private key to the key management and configure your producers to retrieve public keys and consumers clients to retrieve private keys. + +3. Implement the `CryptoKeyReader` interface, specifically `CryptoKeyReader.getPublicKey()` for producer and `CryptoKeyReader.getPrivateKey()` for consumer, which Pulsar client invokes to load the key. + +4. Add the encryption key name to the producer builder: PulsarClient.newProducer().addEncryptionKey("myapp.key"). + +5. Configure a `CryptoKeyReader` to a producer, consumer or reader. 
+ +````mdx-code-block + + + +```java + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); +String topic = "persistent://my-tenant/my-ns/my-topic"; +// RawFileKeyReader is just an example implementation that's not provided by Pulsar +CryptoKeyReader keyReader = new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem"); + +Producer producer = pulsarClient.newProducer() + .topic(topic) + .cryptoKeyReader(keyReader) + .addEncryptionKey("myappkey") + .create(); + +Consumer consumer = pulsarClient.newConsumer() + .topic(topic) + .subscriptionName("my-subscriber-name") + .cryptoKeyReader(keyReader) + .subscribe(); + +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.earliest) + .cryptoKeyReader(keyReader) + .create(); + +``` + + + + +```c++ + +Client client("pulsar://localhost:6650"); +std::string topic = "persistent://my-tenant/my-ns/my-topic"; +// DefaultCryptoKeyReader is a built-in implementation that reads public key and private key from files +auto keyReader = std::make_shared("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem"); + +Producer producer; +ProducerConfiguration producerConf; +producerConf.setCryptoKeyReader(keyReader); +producerConf.addEncryptionKey("myappkey"); +client.createProducer(topic, producerConf, producer); + +Consumer consumer; +ConsumerConfiguration consumerConf; +consumerConf.setCryptoKeyReader(keyReader); +client.subscribe(topic, "my-subscriber-name", consumerConf, consumer); + +Reader reader; +ReaderConfiguration readerConf; +readerConf.setCryptoKeyReader(keyReader); +client.createReader(topic, MessageId::earliest(), readerConf, reader); + +``` + + + + +```python + +from pulsar import Client, CryptoKeyReader + +client = Client('pulsar://localhost:6650') +topic = 'persistent://my-tenant/my-ns/my-topic' +# CryptoKeyReader is a built-in implementation that reads public key and private key from files +key_reader = 
CryptoKeyReader('test_ecdsa_pubkey.pem', 'test_ecdsa_privkey.pem') + +producer = client.create_producer( + topic=topic, + encryption_key='myappkey', + crypto_key_reader=key_reader +) + +consumer = client.subscribe( + topic=topic, + subscription_name='my-subscriber-name', + crypto_key_reader=key_reader +) + +reader = client.create_reader( + topic=topic, + start_message_id=MessageId.earliest, + crypto_key_reader=key_reader +) + +client.close() + +``` + + + + +```nodejs + +const Pulsar = require('pulsar-client'); + +(async () => { +// Create a client +const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, +}); + +// Create a producer +const producer = await client.createProducer({ + topic: 'persistent://public/default/my-topic', + sendTimeoutMs: 30000, + batchingEnabled: true, + publicKeyPath: "public-key.client-rsa.pem", + encryptionKey: "encryption-key" +}); + +// Create a consumer +const consumer = await client.subscribe({ + topic: 'persistent://public/default/my-topic', + subscription: 'sub1', + subscriptionType: 'Shared', + ackTimeoutMs: 10000, + privateKeyPath: "private-key.client-rsa.pem" +}); + +// Send messages +for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); +} +await producer.flush(); + +// Receive messages +for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); +} + +await consumer.close(); +await producer.close(); +await client.close(); +})(); + +``` + + + + +```` + +6. Below is an example of a **customized** `CryptoKeyReader` implementation. 
+ +````mdx-code-block + + + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +``` + + + + +```c++ + +class CustomCryptoKeyReader : public CryptoKeyReader { + public: + Result getPublicKey(const std::string& keyName, std::map& metadata, + EncryptionKeyInfo& encKeyInfo) const override { + // TODO: + return ResultOk; + } + + Result getPrivateKey(const std::string& keyName, std::map& metadata, + EncryptionKeyInfo& encKeyInfo) const override { + // TODO: + return ResultOk; + } +}; + +auto keyReader = std::make_shared(/* ... */); +// TODO: create producer, consumer or reader based on keyReader here + +``` + +Besides, you can use the **default** implementation of `CryptoKeyReader` by specifying the paths of `private key` and `public key`. + + + + +Currently, **customized** `CryptoKeyReader` implementation is not supported in Python. However, you can use the **default** implementation by specifying the path of `private key` and `public key`. 
+ + + + +Currently, **customized** `CryptoKeyReader` implementation is not supported in Node.js. However, you can use the **default** implementation by specifying the path of `private key` and `public key`. + + + + +```` + +## Key rotation +Pulsar generates a new AES data key every 4 hours or after publishing a certain number of messages. A producer fetches the asymmetric public key every 4 hours by calling CryptoKeyReader.getPublicKey() to retrieve the latest version. + +## Enable encryption at the producer application +If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. You can do this in two ways: +1. The consumer application provides you access to their public key, which you add to your producer keys. +2. You grant access to one of the private keys from the pairs that the producer uses. + +When producers want to encrypt the messages with multiple keys, producers add all such keys to the config. A consumer can decrypt the message as long as the consumer has access to at least one of the keys. + +If you need to encrypt the messages using 2 keys (`myapp.messagekey1` and `myapp.messagekey2`), refer to the following example. + +```java + +PulsarClient.newProducer().addEncryptionKey("myapp.messagekey1").addEncryptionKey("myapp.messagekey2"); + +``` + +## Decrypt encrypted messages at the consumer application +Consumers require access to one of the private keys to decrypt messages that the producer produces. If you want to receive encrypted messages, create a public/private key pair and give your public key to the producer application to encrypt messages using your public key. + +## Handle failures +* Producer/Consumer loses access to the key + * Producer action fails, indicating the cause of the failure. Application has the option to proceed with sending unencrypted messages in such cases.
Call `PulsarClient.newProducer().cryptoFailureAction(ProducerCryptoFailureAction)` to control the producer behavior. The default behavior is to fail the request. + * If consumption fails due to decryption failure or missing keys in consumer, the application has the option to consume the encrypted message or discard it. Call `PulsarClient.newConsumer().cryptoFailureAction(ConsumerCryptoFailureAction)` to control the consumer behavior. The default behavior is to fail the request. Application is never able to decrypt the messages if the private key is permanently lost. +* Batch messaging + * If decryption fails and the message contains batch messages, client is not able to retrieve individual messages in the batch, hence message consumption fails even if cryptoFailureAction() is set to `ConsumerCryptoFailureAction.CONSUME`. +* If decryption fails, the message consumption stops and the application notices backlog growth in addition to decryption failure messages in the client log. If the application does not have access to the private key to decrypt the message, the only option is to skip or discard backlogged messages. diff --git a/site2/website/versioned_docs/version-2.10.x/security-extending.md b/site2/website/versioned_docs/version-2.10.x/security-extending.md new file mode 100644 index 0000000000000..9c641623f8348 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-extending.md @@ -0,0 +1,83 @@ +--- +id: security-extending +title: Extend Authentication and Authorization in Pulsar +sidebar_label: "Extend Authentication and Authorization" +original_id: security-extending +--- + +Pulsar provides a way to use custom authentication and authorization mechanisms. + +## Authentication + +You can use a custom authentication mechanism by providing the implementation in the form of two plugins. 
+* Client authentication plugin +* Proxy/Broker authentication plugin + +### Client authentication plugin + +For the client library, you need to implement `org.apache.pulsar.client.api.Authentication`. By entering the command below, you can pass this class when you create a Pulsar client. + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .authentication(new MyAuthentication()) + .build(); + +``` + +You can implement 2 interfaces on the client side: + * [`Authentication`](/api/client/org/apache/pulsar/client/api/Authentication.html) + * [`AuthenticationDataProvider`](/api/client/org/apache/pulsar/client/api/AuthenticationDataProvider.html) + +This in turn requires you to provide the client credentials in the form of `org.apache.pulsar.client.api.AuthenticationDataProvider` and also leaves the chance to return different kinds of authentication token for different types of connection or by passing a certificate chain to use for TLS. + +You can find the following examples for different client authentication plugins: + * [Mutual TLS](https://github.com/apache/pulsar/blob/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth/AuthenticationTls.java) + * [Athenz](https://github.com/apache/pulsar/blob/master/pulsar-client-auth-athenz/src/main/java/org/apache/pulsar/client/impl/auth/AuthenticationAthenz.java) + * [Kerberos](https://github.com/apache/pulsar/blob/master/pulsar-client-auth-sasl/src/main/java/org/apache/pulsar/client/impl/auth/AuthenticationSasl.java) + * [JSON Web Token (JWT)](https://github.com/apache/pulsar/blob/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth/AuthenticationToken.java) + * [OAuth 2.0](https://github.com/apache/pulsar/blob/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth/oauth2/AuthenticationOAuth2.java) + * [Basic 
auth](https://github.com/apache/pulsar/blob/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth/AuthenticationBasic.java) + +### Proxy/Broker authentication plugin + +On the proxy/broker side, you need to configure the corresponding plugin to validate the credentials that the client sends. The proxy and broker can support multiple authentication providers at the same time. + +In `conf/broker.conf`, you can choose to specify a list of valid providers: + +```properties + +# Authentication provider name list, which is comma separated list of class names +authenticationProviders= + +``` + +For the implementation of the `org.apache.pulsar.broker.authentication.AuthenticationProvider` interface, refer to [here](https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProvider.java). + +You can find the following examples for different broker authentication plugins: + + * [Mutual TLS](https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderTls.java) + * [Athenz](https://github.com/apache/pulsar/blob/master/pulsar-broker-auth-athenz/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderAthenz.java) + * [Kerberos](https://github.com/apache/pulsar/blob/master/pulsar-broker-auth-sasl/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderSasl.java) + * [JSON Web Token (JWT)](https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderToken.java) + * [Basic auth](https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderBasic.java) + +## Authorization + +Authorization is the operation that checks whether a particular "role" or "principal" has permission to perform a certain operation.
+ +By default, you can use the embedded authorization provider provided by Pulsar. You can also configure a different authorization provider through a plugin. Note that although the Authentication plugin is designed for use in both the proxy and broker, the Authorization plugin is designed only for use on the broker. + +### Broker authorization plugin + +To provide a custom authorization provider, you need to implement the `org.apache.pulsar.broker.authorization.AuthorizationProvider` interface, put this class in the Pulsar broker classpath and configure the class in `conf/broker.conf`: + + ```properties + + # Authorization provider fully qualified class-name + authorizationProvider=org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider + + ``` + +For the implementation of the `org.apache.pulsar.broker.authorization.AuthorizationProvider` interface, refer to [here](https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authorization/AuthorizationProvider.java). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/security-jwt.md b/site2/website/versioned_docs/version-2.10.x/security-jwt.md new file mode 100644 index 0000000000000..d5cbf1553b92e --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-jwt.md @@ -0,0 +1,331 @@ +--- +id: security-jwt +title: Client authentication using tokens based on JSON Web Tokens +sidebar_label: "Authentication using JWT" +original_id: security-jwt +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +## Token authentication overview + +Pulsar supports authenticating clients using security tokens that are based on [JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). 
+ +You can use tokens to identify a Pulsar client and associate with some "principal" (or "role") that +is permitted to do some actions (eg: publish to a topic or consume from a topic). + +A user typically gets a token string from the administrator (or some automated service). + +The compact representation of a signed JWT is a string that looks like the following: + +``` + +eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +Application specifies the token when you create the client instance. An alternative is to pass a "token supplier" (a function that returns the token when the client library needs one). + +> #### Always use TLS transport encryption +> Sending a token is equivalent to sending a password over the wire. You had better use TLS encryption all the time when you connect to the Pulsar service. See +> [Transport Encryption using TLS](security-tls-transport.md) for more details. + +### CLI Tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use the token authentication with CLI tools of Pulsar: + +```properties + +webServiceUrl=http://broker.example.com:8080/ +brokerServiceUrl=pulsar://broker.example.com:6650/ +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +authParams=token:eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +The token string can also be read from a file, for example: + +``` + +authParams=file:///path/to/token/file + +``` + +### Pulsar client + +You can use tokens to authenticate the following Pulsar clients. 
+ +````mdx-code-block + + + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")) + .build(); + +``` + +Similarly, you can also pass a `Supplier`: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token(() -> { + // Read token from custom source + return readToken(); + })) + .build(); + +``` + + + + +```python + +from pulsar import Client, AuthenticationToken + +client = Client('pulsar://broker.example.com:6650/', + authentication=AuthenticationToken('eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY')) + +``` + +Alternatively, you can also pass a `Supplier`: + +```python + +def read_token(): + with open('/path/to/token.txt') as tf: + return tf.read().strip() + +client = Client('pulsar://broker.example.com:6650/', + authentication=AuthenticationToken(read_token)) + +``` + + + + +```go + +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication: NewAuthenticationToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY"), +}) + +``` + +Similarly, you can also pass a `Supplier`: + +```go + +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication: NewAuthenticationTokenSupplier(func () string { + // Read token from custom source + return readToken() + }), +}) + +``` + + + + +```c++ + +#include + +pulsar::ClientConfiguration config; +config.setAuth(pulsar::AuthToken::createWithToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")); + +pulsar::Client client("pulsar://broker.example.com:6650/", config); + +``` + + + + +```c# + +var client = PulsarClient.Builder() + 
.AuthenticateUsingToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY") + .Build(); + +``` + + + + +```` + +## Enable token authentication + +On how to enable token authentication on a Pulsar cluster, you can refer to the guide below. + +JWT supports two different kinds of keys in order to generate and validate the tokens: + + * Symmetric : + - You can use a single ***Secret*** key to generate and validate tokens. + * Asymmetric: A pair of keys consists of the Private key and the Public key. + - You can use ***Private*** key to generate tokens. + - You can use ***Public*** key to validate tokens. + +### Create a secret key + +When you use a secret key, the administrator creates the key and uses the key to generate the client tokens. You can also configure this key to brokers in order to validate the clients. + +The output file is generated in the root of your Pulsar installation directory. You can also provide an absolute path for the output file using the command below. + +```shell + +$ bin/pulsar tokens create-secret-key --output my-secret.key + +``` + +Enter this command to generate a base64 encoded private key. + +```shell + +$ bin/pulsar tokens create-secret-key --output /opt/my-secret.key --base64 + +``` + +### Create a key pair + +With Public and Private keys, you need to create a pair of keys. Pulsar supports all algorithms that the Java JWT library (shown [here](https://github.com/jwtk/jjwt#signature-algorithms-keys)) supports. + +The output file is generated in the root of your Pulsar installation directory. You can also provide an absolute path for the output file using the command below. + +```shell + +$ bin/pulsar tokens create-key-pair --output-private-key my-private.key --output-public-key my-public.key + +``` + + * Store `my-private.key` in a safe location and only administrator can use `my-private.key` to generate new tokens. + * `my-public.key` is distributed to all Pulsar brokers. 
You can publicly share this file without any security concern. + +### Generate tokens + +A token is a credential associated with a user. The association is done through the "principal" or "role". In the case of JWT tokens, this field is typically referred to as **subject**, though they are exactly the same concept. + +Use the following command to create a token. The generated token is required to have a **subject** field set. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user + +``` + +This command prints the token string on stdout. + +Similarly, you can create a token by passing the "private" key using the command below: + +```shell + +$ bin/pulsar tokens create --private-key file:///path/to/my-private.key \ + --subject test-user + +``` + +Finally, you can enter the following command to create a token with a pre-defined TTL. After the TTL expires, the token is automatically invalidated. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user \ + --expiry-time 1y + +``` + +### Authorization + +The token itself does not have any permission associated. The authorization engine determines whether the token should have permissions or not. Once you have created the token, you can grant permission for this token to do certain actions. The following is an example. + +```shell + +$ bin/pulsar-admin namespaces grant-permission my-tenant/my-namespace \ + --role test-user \ + --actions produce,consume + +``` + +### Enable token authentication on Brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`: + +```properties + +# Configuration to enable authentication and authorization +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken + +# Authentication settings of the broker itself.
Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Either configure the token string or specify to read it from a file. The following three available formats are all valid: +# brokerClientAuthenticationParameters={"token":"your-token-string"} +# brokerClientAuthenticationParameters=token:your-token-string +# brokerClientAuthenticationParameters=file:///path/to/token +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem + +# If this flag is set then the broker authenticates the original Auth data +# else it just accepts the originalPrincipal and authorizes it (if required). +authenticateOriginalAuthData=true + +# If using secret key (Note: key files must be DER-encoded) +tokenSecretKey=file:///path/to/secret.key +# The key can also be passed inline: +# tokenSecretKey=data:;base64,FLFyW0oLJ2Fi22KKCm21J18mbAdztfSHN/lAT5ucEKU= + +# If using public/private (Note: key files must be DER-encoded) +# tokenPublicKey=file:///path/to/public.key + +``` + +### Enable token authentication on Proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`: + +The proxy uses its own token when connecting to brokers. You need to configure the role token for this key pair in the `proxyRoles` of the brokers. For more details, see the [authorization guide](security-authorization.md). 
+ +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken +tokenSecretKey=file:///path/to/secret.key + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Either configure the token string or specify to read it from a file. The following three available formats are all valid: +# brokerClientAuthenticationParameters={"token":"your-token-string"} +# brokerClientAuthenticationParameters=token:your-token-string +# brokerClientAuthenticationParameters=file:///path/to/token + +# Whether client authorization credentials are forwarded to the broker for re-authorization. +# Authentication must be enabled via authenticationEnabled=true for this to take effect. +forwardAuthorizationCredentials=true + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/security-kerberos.md b/site2/website/versioned_docs/version-2.10.x/security-kerberos.md new file mode 100644 index 0000000000000..c49fa3bea1fce --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-kerberos.md @@ -0,0 +1,443 @@ +--- +id: security-kerberos +title: Authentication using Kerberos +sidebar_label: "Authentication using Kerberos" +original_id: security-kerberos +--- + +[Kerberos](https://web.mit.edu/kerberos/) is a network authentication protocol. By using secret-key cryptography, [Kerberos](https://web.mit.edu/kerberos/) is designed to provide strong authentication for client applications and server applications. + +In Pulsar, you can use Kerberos with [SASL](https://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer) as a choice for authentication. 
And Pulsar uses the [Java Authentication and Authorization Service (JAAS)](https://en.wikipedia.org/wiki/Java_Authentication_and_Authorization_Service) for SASL configuration. You need to provide JAAS configurations for Kerberos authentication. + +This document introduces how to configure `Kerberos` with `SASL` between Pulsar clients and brokers and how to configure Kerberos for Pulsar proxy in detail. + +## Configuration for Kerberos between Client and Broker + +### Prerequisites + +To begin, you need to set up (or already have) a [Key Distribution Center (KDC)](https://en.wikipedia.org/wiki/Key_distribution_center). Also you need to configure and run the [Key Distribution Center (KDC)](https://en.wikipedia.org/wiki/Key_distribution_center) in advance. + +If your organization already uses a Kerberos server (for example, by using `Active Directory`), you do not have to install a new server for Pulsar. If your organization does not use a Kerberos server, you need to install one. Your Linux vendor might have packages for `Kerberos`. On how to install and configure Kerberos, refer to [Ubuntu](https://help.ubuntu.com/community/Kerberos), +[Redhat](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Managing_Smart_Cards/installing-kerberos.html). + +Note that if you use Oracle Java, you need to download JCE policy files for your Java version and copy them to the `$JAVA_HOME/jre/lib/security` directory. + +#### Kerberos principals + +If you use the existing Kerberos system, ask your Kerberos administrator for a principal for each broker in your cluster and for every operating system user that accesses Pulsar with Kerberos authentication (via clients and tools). 
+ +If you have installed your own Kerberos system, you can create these principals with the following commands: + +```shell + +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" + +``` + +Note that *Kerberos* requires that all your hosts can be resolved with their FQDNs. + +The first part of Broker principal (for example, `broker` in `broker/{hostname}@{REALM}`) is the `serverType` of each host. The suggested values of `serverType` are `broker` (host machine runs service Pulsar Broker) and `proxy` (host machine runs service Pulsar Proxy). + +#### Configure how to connect to KDC + +You need to enter the command below to specify the path to the `krb5.conf` file for the client side and the broker side. The content of `krb5.conf` file indicates the default Realm and KDC information. See [JDK’s Kerberos Requirements](https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/KerberosReq.html) for more details. + +```shell + +-Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +Here is an example of the krb5.conf file: + +In the configuration file, `EXAMPLE.COM` is the default realm; `kdc = localhost:62037` is the kdc server url for realm `EXAMPLE.COM `: + +``` + +[libdefaults] + default_realm = EXAMPLE.COM + +[realms] + EXAMPLE.COM = { + kdc = localhost:62037 + } + +``` + +Usually machines configured with kerberos already have a system wide configuration and this configuration is optional. + +#### JAAS configuration file + +You need JAAS configuration file for the client side and the broker side. 
JAAS configuration file provides the section of information that is used to connect to the KDC. Here is an example named `pulsar_jaas.conf`: + +``` + + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; + +``` + +You need to set the `JAAS` configuration file path as JVM parameter for client and broker. For example: + +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf + +``` + +In the `pulsar_jaas.conf` file above: + +1. `PulsarBroker` is a section name in the JAAS file that each broker uses. This section tells the broker which principal to use inside Kerberos and the location of the keytab where the principal is stored. `PulsarBroker` allows the broker to use the keytab specified in this section. +2. `PulsarClient` is a section name in the JAAS file that each client uses. This section tells the client which principal to use inside Kerberos and the location of the keytab where the principal is stored. `PulsarClient` allows the client to use the keytab specified in this section. + The following example also reuses this `PulsarClient` section in both the Pulsar internal admin configuration and in the CLI commands `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`. You can also add different sections for different use cases. + +You can have 2 separate JAAS configuration files: +* the file for a broker that has sections of both `PulsarBroker` and `PulsarClient`; +* the file for a client that only has a `PulsarClient` section. 
+ + +### Kerberos configuration for Brokers + +#### Configure the `broker.conf` file + + In the `broker.conf` file, set Kerberos related configurations. + + - Set `authenticationEnabled` to `true`; + - Set `authenticationProviders` to choose `AuthenticationProviderSasl`; + - Set `saslJaasClientAllowedIds` regex for principal that is allowed to connect to broker; + - Set `saslJaasBrokerSectionName` that corresponds to the section in JAAS configuration file for broker; + + To make Pulsar internal admin client work properly, you need to set the configuration in the `broker.conf` file as below: + - Set `brokerClientAuthenticationPlugin` to client plugin `AuthenticationSasl`; + - Set `brokerClientAuthenticationParameters` to value in JSON string `{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}`, in which `PulsarClient` is the section name in the `pulsar_jaas.conf` file, and `"serverType":"broker"` indicates that the internal admin client connects to a Pulsar Broker; + + Here is an example: + +``` + +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker + +## Authentication settings of the broker itself. Used when the broker connects to other brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} + +``` + +#### Set Broker JVM parameter + + Set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. 
+ +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_env.sh) + +You must ensure that the operating system user who starts broker can reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. + +### Kerberos configuration for clients + +#### Java Client and Java Admin Client + +In client application, include `pulsar-client-auth-sasl` in your project dependency. + +``` + + + org.apache.pulsar + pulsar-client-auth-sasl + ${pulsar.version} + + +``` + +Configure the authentication type to use `AuthenticationSasl`, and also provide the authentication parameters to it. + +You need 2 parameters: +- `saslJaasClientSectionName`. This parameter corresponds to the section in JAAS configuration file for client; +- `serverType`. This parameter stands for whether this client connects to broker or proxy. And client uses this parameter to know which server side principal should be used. + +When you authenticate between client and broker with the setting in above JAAS configuration file, we need to set `saslJaasClientSectionName` to `PulsarClient` and set `serverType` to `broker`. 
+ +The following is an example of creating a Java client: + + ```java + + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "broker"); + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + + ``` + +> The first two lines in the example above are hard coded, alternatively, you can set additional JVM parameters for JAAS and krb5 configuration file when you run the application like below: + +``` + +java -cp -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $APP-jar-with-dependencies.jar $CLASSNAME + +``` + +You must ensure that the operating system user who starts pulsar client can reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. + +#### Configure CLI tools + +If you use a command-line tool (such as `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`), you need to perform the following steps: + +Step 1. Enter the command below to configure your `client.conf`. + +```shell + +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +authParams={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} + +``` + +Step 2. Enter the command below to set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. 
+ +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_tools_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_tools_env.sh), +or add this line `OPTS="$OPTS -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf "` directly to the CLI tool script. + +The meaning of configurations is the same as the meaning of configurations in Java client section. + +## Kerberos configuration for working with Pulsar Proxy + +With the above configuration, client and broker can do authentication using Kerberos. + +A client that connects to Pulsar Proxy is a little different. Pulsar Proxy (as a SASL Server in Kerberos) authenticates Client (as a SASL client in Kerberos) first; and then Pulsar broker authenticates Pulsar Proxy. + +Now in comparison with the above configuration between client and broker, we show you how to configure Pulsar Proxy as follows. + +### Create principal for Pulsar Proxy in Kerberos + +You need to add new principals for Pulsar Proxy comparing with the above configuration. If you already have principals for client and broker, you only need to add the proxy principal here. 
+ +```shell + +### add Principals for Pulsar Proxy +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey proxy/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{proxy-keytabname}.keytab proxy/{hostname}@{REALM}" +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" + +``` + +### Add a section in JAAS configuration file for Pulsar Proxy + +In comparison with the above configuration, add a new section for Pulsar Proxy in JAAS configuration file. + +Here is an example named `pulsar_jaas.conf`: + +``` + + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarProxy { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarproxy.keytab" + principal="proxy/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; + +``` + +### Proxy client configuration + +Pulsar client configuration is similar with client and broker configuration, except that you need to set `serverType` to `proxy` instead of `broker`, for the reason that you need to do the Kerberos authentication between client and proxy. 
+ + ```java + + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "proxy"); // ** here is the different ** + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + + ``` + +> The first two lines in the example above are hard coded, alternatively, you can set additional JVM parameters for JAAS and krb5 configuration file when you run the application like below: + +``` + +java -cp -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $APP-jar-with-dependencies.jar $CLASSNAME + +``` + +### Kerberos configuration for Pulsar proxy service + +In the `proxy.conf` file, set Kerberos related configuration. Here is an example: + +```shell + +## related to authenticate client. +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarProxy + +## related to be authenticated by broker +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarProxy", "serverType":"broker"} +forwardAuthorizationCredentials=true + +``` + +The first part relates to authenticating between client and Pulsar Proxy. In this phase, client works as SASL client, while Pulsar Proxy works as SASL server. + +The second part relates to authenticating between Pulsar Proxy and Pulsar Broker. 
In this phase, Pulsar Proxy works as SASL client, while Pulsar Broker works as SASL server. + +### Broker side configuration. + +The broker side configuration file is the same with the above `broker.conf`, you do not need special configuration for Pulsar Proxy. + +``` + +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker + +``` + +## Regarding authorization and role token + +For Kerberos authentication, we usually use the authenticated principal as the role token for Pulsar authorization. For more information of authorization in Pulsar, see [security authorization](security-authorization.md). + +If you enable 'authorizationEnabled', you need to set `superUserRoles` in `broker.conf` that corresponds to the name registered in kdc. + +For example: + +```bash + +superUserRoles=client/{clientIp}@EXAMPLE.COM + +``` + +## Regarding authentication between ZooKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Zookeeper. According to [ZooKeeper document](https://cwiki.apache.org/confluence/display/ZOOKEEPER/Client-Server+mutual+authentication), you need these settings in `conf/zookeeper.conf`: + +``` + +authProvider.1=org.apache.zookeeper.server.auth.SASLAuthenticationProvider +requireClientAuthScheme=sasl + +``` + +Enter the following commands to add a section of `Client` configurations in the file `pulsar_jaas.conf`, which Pulsar Broker uses: + +``` + + Client { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with ZooKeeper. 
+ +## Regarding authentication between BookKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Bookie. According to [BookKeeper document](http://bookkeeper.apache.org/docs/latest/security/sasl/), you need to add `bookkeeperClientAuthenticationPlugin` parameter in `broker.conf`: + +``` + +bookkeeperClientAuthenticationPlugin=org.apache.bookkeeper.sasl.SASLClientProviderFactory + +``` + +In this setting, `SASLClientProviderFactory` creates a BookKeeper SASL client in a Broker, and the Broker uses the created SASL client to authenticate with a Bookie node. + +Enter the following commands to add a section of `BookKeeper` configurations in the `pulsar_jaas.conf` that Pulsar Broker uses: + +``` + + BookKeeper { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with Bookie. diff --git a/site2/website/versioned_docs/version-2.10.x/security-oauth2.md b/site2/website/versioned_docs/version-2.10.x/security-oauth2.md new file mode 100644 index 0000000000000..d32568c8c1f2c --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-oauth2.md @@ -0,0 +1,282 @@ +--- +id: security-oauth2 +title: Client authentication using OAuth 2.0 access tokens +sidebar_label: "Authentication using OAuth 2.0 access tokens" +original_id: security-oauth2 +--- + +Pulsar supports authenticating clients using OAuth 2.0 access tokens. You can use OAuth 2.0 access tokens to identify a Pulsar client and associate the Pulsar client with some "principal" (or "role"), which is permitted to do some actions, such as publishing messages to a topic or consume messages from a topic. 
+ +This module is used to support the [Pulsar client authentication plugin](security-extending.md/#client-authentication-plugin) for OAuth 2.0. After communicating with the OAuth 2.0 server, the Pulsar client gets an `access token` from the OAuth 2.0 server, and passes this `access token` to the Pulsar broker to do the authentication. The broker can use the `org.apache.pulsar.broker.authentication.AuthenticationProviderToken`. Or, you can add your own `AuthenticationProvider` to make it with this module. + +## Authentication provider configuration + +This library allows you to authenticate the Pulsar client by using an access token that is obtained from an OAuth 2.0 authorization service, which acts as a _token issuer_. + +### Authentication types + +The authentication type determines how to obtain an access token through an OAuth 2.0 authorization flow. + +:::note + +Currently, the Pulsar Java client only supports the `client_credentials` authentication type. + +::: + +#### Client credentials + +The following table lists parameters supported for the `client credentials` authentication type. + +| Parameter | Description | Example | Required or not | +| --- | --- | --- | --- | +| `type` | OAuth 2.0 authentication type. | `client_credentials` (default) | Optional | +| `issuerUrl` | URL of the authentication provider which allows the Pulsar client to obtain an access token | `https://accounts.google.com` | Required | +| `privateKey` | URL to a JSON credentials file | Support the following pattern formats:
  • `file:///path/to/file`
  • `file:/path/to/file`
  • `data:application/json;base64,<base64-encoded value>`
  • | Required | +| `audience` | An OAuth 2.0 "resource server" identifier for the Pulsar cluster | `https://broker.example.com` | Optional | +| `scope` | Scope of an access request.
    For more information, see [access token scope](https://datatracker.ietf.org/doc/html/rfc6749#section-3.3). | api://pulsar-cluster-1/.default | Optional | + +The credentials file contains service account credentials used with the client authentication type. The following shows an example of a credentials file `credentials_file.json`. + +```json + +{ + "type": "client_credentials", + "client_id": "d9ZyX97q1ef8Cr81WHVC4hFQ64vSlDK3", + "client_secret": "on1uJ...k6F6R", + "client_email": "1234567890-abcdefghijklmnopqrstuvwxyz@developer.gserviceaccount.com", + "issuer_url": "https://accounts.google.com" +} + +``` + +In the above example, the authentication type is set to `client_credentials` by default. And the fields "client_id" and "client_secret" are required. + +### Typical original OAuth2 request mapping + +The following shows a typical original OAuth2 request, which is used to obtain the access token from the OAuth2 server. + +```bash + +curl --request POST \ + --url https://dev-kt-aa9ne.us.auth0.com/oauth/token \ + --header 'content-type: application/json' \ + --data '{ + "client_id":"Xd23RHsUnvUlP7wchjNYOaIfazgeHd9x", + "client_secret":"rT7ps7WY8uhdVuBTKWZkttwLdQotmdEliaM5rLfmgNibvqziZ-g07ZH52N_poGAb", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "grant_type":"client_credentials"}' + +``` + +In the above example, the mapping relationship is shown as below. + +- The `issuerUrl` parameter in this plugin is mapped to `--url https://dev-kt-aa9ne.us.auth0.com`. +- The `privateKey` file parameter in this plugin should contain at least the `client_id` and `client_secret` fields. +- The `audience` parameter in this plugin is mapped to `"audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"`. This field is only used by some identity providers. + +## Client Configuration + +You can use the OAuth2 authentication provider with the following Pulsar clients. 
+ +### Java client + +You can use the factory method to configure authentication for Pulsar Java client. + +```java + +import org.apache.pulsar.client.impl.auth.oauth2.AuthenticationFactoryOAuth2; + +URL issuerUrl = new URL("https://dev-kt-aa9ne.us.auth0.com"); +URL credentialsUrl = new URL("file:///path/to/KeyFile.json"); +String audience = "https://dev-kt-aa9ne.us.auth0.com/api/v2/"; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactoryOAuth2.clientCredentials(issuerUrl, credentialsUrl, audience)) + .build(); + +``` + +In addition, you can also use the encoded parameters to configure authentication for Pulsar Java client. + +```java + +Authentication auth = AuthenticationFactory + .create(AuthenticationOAuth2.class.getName(), "{"type":"client_credentials","privateKey":"./key/path/..","issuerUrl":"...","audience":"..."}"); +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication(auth) + .build(); + +``` + +### C++ client + +The C++ client is similar to the Java client. You need to provide the parameters of `issuerUrl`, `private_key` (the credentials file path), and `audience`. + +```c++ + +#include + +pulsar::ClientConfiguration config; +std::string params = R"({ + "issuer_url": "https://dev-kt-aa9ne.us.auth0.com", + "private_key": "../../pulsar-broker/src/test/resources/authentication/token/cpp_credentials_file.json", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/"})"; + +config.setAuth(pulsar::AuthOauth2::create(params)); + +pulsar::Client client("pulsar://broker.example.com:6650/", config); + +``` + +### Go client + +To enable OAuth2 authentication in Go client, you need to configure OAuth2 authentication. +This example shows how to configure OAuth2 authentication in Go client. 
+ +```go + +oauth := pulsar.NewAuthenticationOAuth2(map[string]string{ + "type": "client_credentials", + "issuerUrl": "https://dev-kt-aa9ne.us.auth0.com", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "privateKey": "/path/to/privateKey", + "clientId": "0Xx...Yyxeny", + }) +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://my-cluster:6650", + Authentication: oauth, +}) + +``` + +### Python client + +To enable OAuth2 authentication in Python client, you need to configure OAuth2 authentication. +This example shows how to configure OAuth2 authentication in Python client. + +```python + +from pulsar import Client, AuthenticationOauth2 + +params = ''' +{ + "issuer_url": "https://dev-kt-aa9ne.us.auth0.com", + "private_key": "/path/to/privateKey", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/" +} +''' + +client = Client("pulsar://my-cluster:6650", authentication=AuthenticationOauth2(params)) + +``` + +### Node.js client + +To enable OAuth2 authentication in Node.js client, you need to configure OAuth2 authentication. +This example shows how to configure OAuth2 authentication in Node.js client. 
+ +```JavaScript + + const Pulsar = require('pulsar-client'); + const issuer_url = process.env.ISSUER_URL; + const private_key = process.env.PRIVATE_KEY; + const audience = process.env.AUDIENCE; + const scope = process.env.SCOPE; + const service_url = process.env.SERVICE_URL; + const client_id = process.env.CLIENT_ID; + const client_secret = process.env.CLIENT_SECRET; + (async () => { + const params = { + issuer_url: issuer_url + } + if (private_key.length > 0) { + params['private_key'] = private_key + } else { + params['client_id'] = client_id + params['client_secret'] = client_secret + } + if (audience.length > 0) { + params['audience'] = audience + } + if (scope.length > 0) { + params['scope'] = scope + } + const auth = new Pulsar.AuthenticationOauth2(params); + // Create a client + const client = new Pulsar.Client({ + serviceUrl: service_url, + tlsAllowInsecureConnection: true, + authentication: auth, + }); + await client.close(); + })(); + +``` + +:::note + +The support for OAuth2 authentication is only available in Node.js client 1.6.2 and later versions. + +::: + +## CLI configuration + +This section describes how to use Pulsar CLI tools to connect a cluster through OAuth2 authentication plugin. + +### pulsar-admin + +This example shows how to use pulsar-admin to connect to a cluster through OAuth2 authentication plugin. + +```shell script + +bin/pulsar-admin --admin-url https://streamnative.cloud:443 \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +tenants list + +``` + +Set the `admin-url` parameter to the Web service URL. A Web service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. 
For details, see [authentication types](#authentication-types). + +### pulsar-client + +This example shows how to use pulsar-client to connect to a cluster through OAuth2 authentication plugin. + +```shell script + +bin/pulsar-client \ +--url SERVICE_URL \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +produce test-topic -m "test-message" -n 10 + +``` + +Set the `url` parameter to the broker service URL. A broker service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types). + +### pulsar-perf + +This example shows how to use pulsar-perf to connect to a cluster through OAuth2 authentication plugin. + +```shell script + +bin/pulsar-perf produce --service-url pulsar+ssl://streamnative.cloud:6651 \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +-r 1000 -s 1024 test-topic + +``` + +Set the `service-url` parameter to the broker service URL. A broker service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types). 
diff --git a/site2/website/versioned_docs/version-2.10.x/security-overview.md b/site2/website/versioned_docs/version-2.10.x/security-overview.md new file mode 100644 index 0000000000000..d03b8c85c38f1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-overview.md @@ -0,0 +1,37 @@ +--- +id: security-overview +title: Pulsar security overview +sidebar_label: "Overview" +original_id: security-overview +--- + +As the central message bus for a business, Apache Pulsar is frequently used for storing mission-critical data. Therefore, enabling security features in Pulsar is crucial. + +By default, Pulsar configures no encryption, authentication, or authorization. Any client can communicate to Apache Pulsar via plain text service URLs. So we must ensure that Pulsar accessing via these plain text service URLs is restricted to trusted clients only. In such cases, you can use Network segmentation and/or authorization ACLs to restrict access to trusted IPs. If you use neither, the state of cluster is wide open and anyone can access the cluster. + +Pulsar supports a pluggable authentication mechanism. And Pulsar clients use this mechanism to authenticate with brokers and proxies. You can also configure Pulsar to support multiple authentication sources. + +The Pulsar broker validates the authentication credentials when a connection is established. After the initial connection is authenticated, the "principal" token is stored for authorization though the connection is not re-authenticated. The broker periodically checks the expiration status of every `ServerCnx` object. You can set the `authenticationRefreshCheckSeconds` on the broker to control the frequency to check the expiration status. By default, the `authenticationRefreshCheckSeconds` is set to 60s. When the authentication is expired, the broker forces to re-authenticate the connection. If the re-authentication fails, the broker disconnects the client. 
+ +The broker supports learning whether a particular client supports authentication refreshing. If a client supports authentication refreshing and the credential is expired, the authentication provider calls the `refreshAuthentication` method to initiate the refreshing process. If a client does not support authentication refreshing and the credential is expired, the broker disconnects the client. + +You should secure the service components in your Apache Pulsar deployment. + +## Role tokens + +In Pulsar, a *role* is a string, like `admin` or `app1`, which can represent a single client or multiple clients. You can use roles to control permission for clients to produce or consume from certain topics, administer the configuration for tenants, and so on. + +Apache Pulsar uses an [Authentication Provider](#authentication-providers) to establish the identity of a client and then assigns a *role token* to that client. This role token is then used for [Authorization and ACLs](security-authorization.md) to determine what the client is authorized to do. 
+ +## Authentication providers + +Currently Pulsar supports the following authentication providers: + +- [TLS authentication](security-tls-authentication.md) +- [Athenz authentication](security-athenz.md) +- [Kerberos authentication](security-kerberos.md) +- [JSON Web Token (JWT) authentication](security-jwt.md) +- [OAuth 2.0 authentication](security-oauth2.md) +- [HTTP basic authentication](security-basic-auth.md) + + diff --git a/site2/website/versioned_docs/version-2.10.x/security-policy-and-supported-versions.md b/site2/website/versioned_docs/version-2.10.x/security-policy-and-supported-versions.md new file mode 100644 index 0000000000000..31f8cf061b805 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-policy-and-supported-versions.md @@ -0,0 +1,63 @@ +--- +id: security-policy-and-supported-versions +title: Security Policy and Supported Versions +sidebar_label: "Security Policy and Supported Versions" +original_id: security-policy-and-supported-versions +--- + +## Using Pulsar's Security Features + +You can find documentation on Pulsar's available security features and how to use them here: +https://pulsar.apache.org/docs/en/security-overview/. + +## Security Vulnerability Announcements + +The Pulsar community will announce security vulnerabilities and how to mitigate them on the [users@pulsar.apache.org](mailto:users@pulsar.apache.org). +For instructions on how to subscribe, please see https://pulsar.apache.org/contact/. + +## Versioning Policy + +The Pulsar project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). Existing releases can expect +patches for bugs and security vulnerabilities. New features will target minor releases. + +When upgrading an existing cluster, it is important to upgrade components linearly through each minor version. For +example, when upgrading from 2.8.x to 2.10.x, it is important to upgrade to 2.9.x before going to 2.10.x. 
+ +## Supported Versions + +Feature release branches will be maintained with security fix and bug fix releases for a period of at least 12 months +after initial release. For example, branch 2.5.x is no longer considered maintained as of January 2021, 12 months after +the release of 2.5.0 in January 2020. No more 2.5.x releases should be expected at this point, even to fix security +vulnerabilities. + +Note that a minor version can be maintained past its 12-month initial support period. For example, version 2.7 is still +actively maintained. + +Security fixes will be given priority when it comes to back porting fixes to older versions that are within the +supported time window. It is challenging to decide which bug fixes to back port to old versions. As such, the latest +versions will have the most bug fixes. + +When 3.0.0 is released, the community will decide how to continue supporting 2.x. It is possible that the last minor +release within 2.x will be maintained for longer as an “LTS” release, but it has not been officially decided. + +The following table shows version support timelines and will be updated with each release. + +| Version | Supported | Initial Release | At Least Until | +|:-------:|:------------------:|:---------------:|:--------------:| +| 2.10.x | :white_check_mark: | April 2022 | April 2023 | +| 2.9.x | :white_check_mark: | November 2021 | November 2022 | +| 2.8.x | :white_check_mark: | June 2021 | June 2022 | +| 2.7.x | :white_check_mark: | November 2020 | November 2021 | +| 2.6.x | :x: | June 2020 | June 2021 | +| 2.5.x | :x: | January 2020 | January 2021 | +| 2.4.x | :x: | July 2019 | July 2020 | +| < 2.3.x | :x: | - | - | + +If there is ambiguity about which versions of Pulsar are actively supported, please ask on the [users@pulsar.apache.org](mailto:users@pulsar.apache.org) +mailing list. 
+ +## Release Frequency + +With the acceptance of [PIP-47 - A Time Based Release Plan](https://github.com/apache/pulsar/wiki/PIP-47%3A-Time-Based-Release-Plan), +the Pulsar community aims to complete 4 minor releases each year. Patch releases are completed based on demand as well +as need, in the event of security fixes. diff --git a/site2/website/versioned_docs/version-2.10.x/security-tls-authentication.md b/site2/website/versioned_docs/version-2.10.x/security-tls-authentication.md new file mode 100644 index 0000000000000..17da5aab34d47 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-tls-authentication.md @@ -0,0 +1,222 @@ +--- +id: security-tls-authentication +title: Authentication using TLS +sidebar_label: "Authentication using TLS" +original_id: security-tls-authentication +--- + +## TLS authentication overview + +TLS authentication is an extension of [TLS transport encryption](security-tls-transport.md). Not only servers have keys and certs that the client uses to verify the identity of servers, clients also have keys and certs that the server uses to verify the identity of clients. You must have TLS transport encryption configured on your cluster before you can use TLS authentication. This guide assumes you already have TLS transport encryption configured. + +`Bouncy Castle Provider` provides TLS related cipher suites and algorithms in Pulsar. If you need [FIPS](https://www.bouncycastle.org/fips_faq.html) version of `Bouncy Castle Provider`, please reference [Bouncy Castle page](security-bouncy-castle.md). + +### Create client certificates + +Client certificates are generated using the certificate authority. Server certificates are also generated with the same certificate authority. + +The biggest difference between client certs and server certs is that the **common name** for the client certificate is the **role token** which that client is authenticated as. 
+ +To use client certificates, you need to set `tlsRequireTrustedClientCertOnConnect=true` at the broker side. For details, refer to [TLS broker configuration](security-tls-transport.md#configure-broker). + +First, you need to enter the following command to generate the key: + +```bash + +$ openssl genrsa -out admin.key.pem 2048 + +``` + +Similar to the broker, the client expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so you need to convert it by entering the following command: + +```bash + +$ openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in admin.key.pem -out admin.key-pk8.pem -nocrypt + +``` + +Next, enter the command below to generate the certificate request. When you are asked for a **common name**, enter the **role token** that you want this key pair to authenticate a client as. + +```bash + +$ openssl req -config openssl.cnf \ + -key admin.key.pem -new -sha256 -out admin.csr.pem + +``` + +:::note + +If openssl.cnf is not specified, read [Certificate authority](security-tls-transport.md#certificate-authority) to get the openssl.cnf. + +::: + +Then, enter the command below to sign the request with the certificate authority. Note that the client certs use the **usr_cert** extension, which allows the cert to be used for client authentication. + +```bash + +$ openssl ca -config openssl.cnf -extensions usr_cert \ + -days 1000 -notext -md sha256 \ + -in admin.csr.pem -out admin.cert.pem + +``` + +You can get a cert, `admin.cert.pem`, and a key, `admin.key-pk8.pem` from this command. With `ca.cert.pem`, clients can use this cert and this key to authenticate themselves to brokers and proxies as the role token ``admin``. + +:::note + +If the "unable to load CA private key" error occurs in this step and the reason for this error is "No such file or directory: /etc/pki/CA/private/cakey.pem", try the command below: + +```bash + +$ cd /etc/pki/tls/misc/CA +$ ./CA -newca + +``` + +to generate `cakey.pem`. 
+ +::: + +## Enable TLS authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#broker-configuration): + +```properties + +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# operations and publish/consume from all topics +superUserRoles=admin + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters={"tlsCertFile":"/path/my-ca/admin.cert.pem","tlsKeyFile":"/path/my-ca/admin.key-pk8.pem"} +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem + +``` + +## Enable TLS authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#proxy-configuration): + +The proxy should have its own client key pair for connecting to brokers. You need to configure the role token for this key pair in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. + +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters=tlsCertFile:/path/to/proxy.cert.pem,tlsKeyFile:/path/to/proxy.key-pk8.pem + +``` + +## Client configuration + +When you use TLS authentication, client connects via TLS transport. 
You need to configure the client to use ```https://``` and 8443 port for the web service URL, ```pulsar+ssl://``` and 6651 port for the broker service URL. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use TLS authentication with the CLI tools of Pulsar: + +```properties + +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +authParams=tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem + +``` + +### Java client + +```java + +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .authentication("org.apache.pulsar.client.impl.auth.AuthenticationTls", + "tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem") + .build(); + +``` + +### Python client + +```python + +from pulsar import Client, AuthenticationTLS + +auth = AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) + +``` + +### C++ client + +```c++ + +#include + +pulsar::ClientConfiguration config; +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/ca.cert.pem"); +config.setTlsAllowInsecureConnection(false); + +pulsar::AuthenticationPtr auth = 
pulsar::AuthTls::create("/path/to/my-role.cert.pem", + "/path/to/my-role.key-pk8.pem") +config.setAuth(auth); + +pulsar::Client client("pulsar+ssl://broker.example.com:6651/", config); + +``` + +### Node.js client + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const auth = new Pulsar.AuthenticationTls({ + certificatePath: '/path/to/my-role.cert.pem', + privateKeyPath: '/path/to/my-role.key-pk8.pem', + }); + + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + authentication: auth, + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + }); +})(); + +``` + +### C# client + +```c# + +var clientCertificate = new X509Certificate2("admin.pfx"); +var client = PulsarClient.Builder() + .AuthenticateUsingClientCertificate(clientCertificate) + .Build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/security-tls-keystore.md b/site2/website/versioned_docs/version-2.10.x/security-tls-keystore.md new file mode 100644 index 0000000000000..8a4654a0c33ae --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-tls-keystore.md @@ -0,0 +1,345 @@ +--- +id: security-tls-keystore +title: Using TLS with KeyStore configure +sidebar_label: "Using TLS with KeyStore configure" +original_id: security-tls-keystore +--- + +## Overview + +Apache Pulsar supports [TLS encryption](security-tls-transport.md) and [TLS authentication](security-tls-authentication.md) between clients and Apache Pulsar service. +By default it uses PEM format file configuration. This page tries to describe use [KeyStore](https://en.wikipedia.org/wiki/Java_KeyStore) type configure for TLS. + + +## TLS encryption with KeyStore configure + +### Generate TLS key and certificate + +The first step of deploying TLS is to generate the key and the certificate for each machine in the cluster. +You can use Java’s `keytool` utility to accomplish this task. 
We will generate the key into a temporary keystore +initially for the broker, so that we can export and sign it later with the CA. + +```shell + +keytool -keystore broker.keystore.jks -alias localhost -validity {validity} -genkeypair -keyalg RSA + +``` + +You need to specify two parameters in the above command: + +1. `keystore`: the keystore file that stores the certificate. The *keystore* file contains the private key of + the certificate; hence, it needs to be kept safely. +2. `validity`: the valid time of the certificate in days. + +> Ensure that the common name (CN) matches exactly with the fully qualified domain name (FQDN) of the server. +The client compares the CN with the DNS domain name to ensure that it is indeed connecting to the desired server, not a malicious one. + +### Creating your own CA + +After the first step, each broker in the cluster has a public-private key pair, and a certificate to identify the machine. +The certificate, however, is unsigned, which means that an attacker can create such a certificate to pretend to be any machine. + +Therefore, it is important to prevent forged certificates by signing them for each machine in the cluster. +A `certificate authority (CA)` is responsible for signing certificates. A CA works like a government that issues passports — +the government stamps (signs) each passport so that the passport becomes difficult to forge. Other governments verify the stamps +to ensure the passport is authentic. Similarly, the CA signs the certificates, and the cryptography guarantees that a signed +certificate is computationally difficult to forge. Thus, as long as the CA is a genuine and trusted authority, the clients have +high assurance that they are connecting to the authentic machines. + +```shell + +openssl req -new -x509 -keyout ca-key -out ca-cert -days 365 + +``` + +The generated CA is simply a *public-private* key pair and certificate, and it is intended to sign other certificates. 
+ +The next step is to add the generated CA to the clients' truststore so that the clients can trust this CA: + +```shell + +keytool -keystore client.truststore.jks -alias CARoot -import -file ca-cert + +``` + +NOTE: If you configure the brokers to require client authentication by setting `tlsRequireTrustedClientCertOnConnect` to `true` on the +broker configuration, then you must also provide a truststore for the brokers and it should have all the CA certificates that client keys were signed by. + +```shell + +keytool -keystore broker.truststore.jks -alias CARoot -import -file ca-cert + +``` + +In contrast to the keystore, which stores each machine’s own identity, the truststore of a client stores all the certificates +that the client should trust. Importing a certificate into one’s truststore also means trusting all certificates that are signed +by that certificate. As in the analogy above, trusting the government (CA) also means trusting all passports (certificates) that +it has issued. This attribute is called the chain of trust, and it is particularly useful when deploying TLS on a large BookKeeper cluster. +You can sign all certificates in the cluster with a single CA, and have all machines share the same truststore that trusts the CA. +That way all machines can authenticate all other machines. + + +### Signing the certificate + +The next step is to sign all certificates in the keystore with the CA we generated. 
First, you need to export the certificate from the keystore: + +```shell + +keytool -keystore broker.keystore.jks -alias localhost -certreq -file cert-file + +``` + +Then sign it with the CA: + +```shell + +openssl x509 -req -CA ca-cert -CAkey ca-key -in cert-file -out cert-signed -days {validity} -CAcreateserial -passin pass:{ca-password} + +``` + +Finally, you need to import both the certificate of the CA and the signed certificate into the keystore: + +```shell + +keytool -keystore broker.keystore.jks -alias CARoot -import -file ca-cert +keytool -keystore broker.keystore.jks -alias localhost -import -file cert-signed + +``` + +The definitions of the parameters are the following: + +1. `keystore`: the location of the keystore +2. `ca-cert`: the certificate of the CA +3. `ca-key`: the private key of the CA +4. `ca-password`: the passphrase of the CA +5. `cert-file`: the exported, unsigned certificate of the broker +6. `cert-signed`: the signed certificate of the broker + +### Configuring brokers + +Brokers enable TLS by providing valid `brokerServicePortTls` and `webServicePortTls` values, and also need to set `tlsEnabledWithKeyStore` to `true` to use KeyStore type configuration. +Besides this, the KeyStore path, KeyStore password, TrustStore path, and TrustStore password need to be provided. +Since the broker creates an internal client/admin client to communicate with other brokers, you also need to provide configuration for them, similar to how you configure the outside client/admin-client. +If `tlsRequireTrustedClientCertOnConnect` is `true`, the broker rejects the connection if the client certificate is not trusted. 
+ +The following TLS configs are needed on the broker side: + +```properties + +tlsEnabledWithKeyStore=true +# key store +tlsKeyStoreType=JKS +tlsKeyStore=/var/private/tls/broker.keystore.jks +tlsKeyStorePassword=brokerpw + +# trust store +tlsTrustStoreType=JKS +tlsTrustStore=/var/private/tls/broker.truststore.jks +tlsTrustStorePassword=brokerpw + +# internal client/admin-client config +brokerClientTlsEnabled=true +brokerClientTlsEnabledWithKeyStore=true +brokerClientTlsTrustStoreType=JKS +brokerClientTlsTrustStore=/var/private/tls/client.truststore.jks +brokerClientTlsTrustStorePassword=clientpw + +``` + +NOTE: it is important to restrict access to the store files via filesystem permissions. + +If you have configured TLS on the broker, to disable non-TLS ports, you can set the values of the following configurations to empty as below. + +``` + +brokerServicePort= +webServicePort= + +``` + +In this case, you need to set the following configurations. + +```conf + +brokerClientTlsEnabled=true // Set this to true +brokerClientTlsEnabledWithKeyStore=true // Set this to true +brokerClientTlsTrustStore= // Set this to your desired value +brokerClientTlsTrustStorePassword= // Set this to your desired value + +``` + +Optional settings that may be worth considering: + +1. tlsClientAuthentication=false: Enable/Disable using TLS for authentication. This config when enabled will authenticate the other end + of the communication channel. It should be enabled on both brokers and clients for mutual TLS. +2. tlsCiphers=[TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256]: A cipher suite is a named combination of authentication, encryption, MAC and key exchange + algorithm used to negotiate the security settings for a network connection using the TLS network protocol. By default, + it is null. [OpenSSL Ciphers](https://www.openssl.org/docs/man1.0.2/apps/ciphers.html) + [JDK Ciphers](http://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#ciphersuites) +3. 
tlsProtocols=[TLSv1.3,TLSv1.2] (list out the TLS protocols that you are going to accept from clients). + By default, it is not set. +### Configuring Clients + +This is similar to [TLS encryption configuration for clients with PEM type](security-tls-transport.md#client-configuration). +For a minimal configuration, you need to provide the TrustStore information. + +For example: +1. for [Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools#pulsar-admin), [`pulsar-perf`](reference-cli-tools#pulsar-perf), and [`pulsar-client`](reference-cli-tools#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + + ```properties + + webServiceUrl=https://broker.example.com:8443/ + brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ + useKeyStoreTls=true + tlsTrustStoreType=JKS + tlsTrustStorePath=/var/private/tls/client.truststore.jks + tlsTrustStorePassword=clientpw + + ``` + +1. for java client + + ```java + + import org.apache.pulsar.client.api.PulsarClient; + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .build(); + + ``` + +1. for java admin client + + ```java + + PulsarAdmin amdin = PulsarAdmin.builder().serviceHttpUrl("https://broker.example.com:8443") + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .build(); + + ``` + +> **Note:** Please configure `tlsTrustStorePath` when you set `useKeyStoreTls` to `true`. 
+ +## TLS authentication with KeyStore configure + +This is similar to [TLS authentication with PEM type](security-tls-authentication.md). + +### broker authentication config + +`broker.conf` + +```properties + +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# this should be the CN for one of client keystore. +superUserRoles=admin + +# Enable KeyStore type +tlsEnabledWithKeyStore=true +requireTrustedClientCertOnConnect=true + +# key store +tlsKeyStoreType=JKS +tlsKeyStore=/var/private/tls/broker.keystore.jks +tlsKeyStorePassword=brokerpw + +# trust store +tlsTrustStoreType=JKS +tlsTrustStore=/var/private/tls/broker.truststore.jks +tlsTrustStorePassword=brokerpw + +# internal client/admin-client config +brokerClientTlsEnabled=true +brokerClientTlsEnabledWithKeyStore=true +brokerClientTlsTrustStoreType=JKS +brokerClientTlsTrustStore=/var/private/tls/client.truststore.jks +brokerClientTlsTrustStorePassword=clientpw +# internal auth config +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls +brokerClientAuthenticationParameters={"keyStoreType":"JKS","keyStorePath":"/var/private/tls/client.keystore.jks","keyStorePassword":"clientpw"} +# currently websocket not support keystore type +webSocketServiceEnabled=false + +``` + +### client authentication configuring + +Besides the TLS encryption configuration, the main work is configuring the KeyStore, which contains a valid CN as the client role, for the client. + +For example: +1. for [Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools#pulsar-admin), [`pulsar-perf`](reference-cli-tools#pulsar-perf), and [`pulsar-client`](reference-cli-tools#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ + ```properties + + webServiceUrl=https://broker.example.com:8443/ + brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ + useKeyStoreTls=true + tlsTrustStoreType=JKS + tlsTrustStorePath=/var/private/tls/client.truststore.jks + tlsTrustStorePassword=clientpw + authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls + authParams={"keyStoreType":"JKS","keyStorePath":"/path/to/keystorefile","keyStorePassword":"keystorepw"} + + ``` + +1. for java client + + ```java + + import org.apache.pulsar.client.api.PulsarClient; + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .authentication( + "org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls", + "keyStoreType:JKS,keyStorePath:/var/private/tls/client.keystore.jks,keyStorePassword:clientpw") + .build(); + + ``` + +1. for java admin client + + ```java + + PulsarAdmin amdin = PulsarAdmin.builder().serviceHttpUrl("https://broker.example.com:8443") + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .authentication( + "org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls", + "keyStoreType:JKS,keyStorePath:/var/private/tls/client.keystore.jks,keyStorePassword:clientpw") + .build(); + + ``` + +> **Note:** Please configure `tlsTrustStorePath` when you set `useKeyStoreTls` to `true`. + +## Enabling TLS Logging + +You can enable TLS debug logging at the JVM level by starting the brokers and/or clients with `javax.net.debug` system property. 
For example: + +```shell + +-Djavax.net.debug=all + +``` + +You can find more details on this in [Oracle documentation](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/ReadDebug.html) on [debugging SSL/TLS connections](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/ReadDebug.html). diff --git a/site2/website/versioned_docs/version-2.10.x/security-tls-transport.md b/site2/website/versioned_docs/version-2.10.x/security-tls-transport.md new file mode 100644 index 0000000000000..c3fc81e7393ad --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-tls-transport.md @@ -0,0 +1,313 @@ +--- +id: security-tls-transport +title: Transport Encryption using TLS +sidebar_label: "Transport Encryption using TLS" +original_id: security-tls-transport +--- + +## TLS overview + +By default, Apache Pulsar clients communicate with the Apache Pulsar service in plain text. This means that all data is sent in the clear. You can use TLS to encrypt this traffic to protect the traffic from the snooping of a man-in-the-middle attacker. + +You can also configure TLS for both encryption and authentication. Use this guide to configure just TLS transport encryption and refer to [here](security-tls-authentication.md) for TLS authentication configuration. Alternatively, you can use [another authentication mechanism](security-athenz.md) on top of TLS transport encryption. + +> Note that enabling TLS may impact the performance due to encryption overhead. + +## TLS concepts + +TLS is a form of [public key cryptography](https://en.wikipedia.org/wiki/Public-key_cryptography). Encryption is performed using key pairs, each consisting of a public key and a private key. The public key encrypts the messages and the private key decrypts the messages. + +To use TLS transport encryption, you need two kinds of key pairs, **server key pairs** and a **certificate authority**. 
+ +You can use a third kind of key pair, **client key pairs**, for [client authentication](security-tls-authentication.md). + +You should store the **certificate authority** private key in a very secure location (a fully encrypted, disconnected, air gapped computer). As for the certificate authority public key, the **trust cert**, you can freely share it. + +For both client and server key pairs, the administrator first generates a private key and a certificate request, then uses the certificate authority private key to sign the certificate request, and finally generates a certificate. This certificate is the public key for the server/client key pair. + +For TLS transport encryption, the clients can use the **trust cert** to verify that the server has a key pair that the certificate authority signed when the clients are talking to the server. A man-in-the-middle attacker does not have access to the certificate authority, so they couldn't create a server with such a key pair. + +For TLS authentication, the server uses the **trust cert** to verify that the client has a key pair that the certificate authority signed. The common name of the **client cert** is then used as the client's role token (see [Overview](security-overview.md)). + +`Bouncy Castle Provider` provides cipher suites and algorithms in Pulsar. If you need [FIPS](https://www.bouncycastle.org/fips_faq.html) version of `Bouncy Castle Provider`, please reference [Bouncy Castle page](security-bouncy-castle.md). + +## Create TLS certificates + +Creating TLS certificates for Pulsar involves creating a [certificate authority](#certificate-authority) (CA), [server certificate](#server-certificate), and [client certificate](#client-certificate). + +Follow the guide below to set up a certificate authority. You can also refer to plenty of resources on the internet for more details. We recommend [this guide](https://jamielinux.com/docs/openssl-certificate-authority/index.html) for your detailed reference. 
+ +### Certificate authority + +1. Create the certificate for the CA. You can use CA to sign both the broker and client certificates. This ensures that each party will trust the others. You should store CA in a very secure location (ideally completely disconnected from networks, air gapped, and fully encrypted). + +2. Enter the following command to create a directory for your CA, and place [this openssl configuration file](https://github.com/apache/pulsar/tree/master/site2/website/static/examples/openssl.cnf) in the directory. You may want to modify the default answers for company name and department in the configuration file. Export the location of the CA directory to the environment variable, CA_HOME. The configuration file uses this environment variable to find the rest of the files and directories that the CA needs. + +```bash + +mkdir my-ca +cd my-ca +wget https://raw.githubusercontent.com/apache/pulsar-site/main/site2/website/static/examples/openssl.cnf +export CA_HOME=$(pwd) + +``` + +3. Enter the commands below to create the necessary directories, keys and certs. + +```bash + +mkdir certs crl newcerts private +chmod 700 private/ +touch index.txt +echo 1000 > serial +openssl genrsa -aes256 -out private/ca.key.pem 4096 +# You need enter a password in the command above +chmod 400 private/ca.key.pem +openssl req -config openssl.cnf -key private/ca.key.pem \ + -new -x509 -days 7300 -sha256 -extensions v3_ca \ + -out certs/ca.cert.pem +# You must enter the same password in the previous openssl command +chmod 444 certs/ca.cert.pem + +``` + +:::tip + +The default `openssl` on macOS doesn't work for the commands above. You must upgrade the `openssl` via Homebrew: + +```bash + +brew install openssl +export PATH="/usr/local/Cellar/openssl@3/3.0.1/bin:$PATH" + +``` + +The version `3.0.1` might change in the future. Use the actual path from the output of `brew install` command. + +::: + +4. 
After you answer the question prompts, CA-related files are stored in the `./my-ca` directory. Within that directory: + +* `certs/ca.cert.pem` is the public certificate. This public certificate is meant to be distributed to all parties involved. +* `private/ca.key.pem` is the private key. You only need it when you are signing a new certificate for either broker or clients and you must safely guard this private key. + +### Server certificate + +Once you have created a CA certificate, you can create certificate requests and sign them with the CA. + +The following commands ask you a few questions and then create the certificates. When you are asked for the common name, you should match the hostname of the broker. You can also use a wildcard to match a group of broker hostnames, for example, `*.broker.usw.example.com`. This ensures that multiple machines can reuse the same certificate. + +:::tip + +Sometimes matching the hostname is not possible or makes no sense, +such as when you create the brokers with random hostnames, or you +plan to connect to the hosts via their IP. In these cases, you +should configure the client to disable TLS hostname verification. For more +details, you can see [the host verification section in client configuration](#hostname-verification). + +::: + +1. Enter the command below to generate the key. + +```bash + +openssl genrsa -out broker.key.pem 2048 + +``` + +The broker expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so enter the following command to convert it. + +```bash + +openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in broker.key.pem -out broker.key-pk8.pem -nocrypt + +``` + +2. Enter the following command to generate the certificate request. + +```bash + +openssl req -config openssl.cnf \ + -key broker.key.pem -new -sha256 -out broker.csr.pem + +``` + +3. Sign it with the certificate authority by entering the command below. 
+ +```bash + +openssl ca -config openssl.cnf -extensions server_cert \ + -days 1000 -notext -md sha256 \ + -in broker.csr.pem -out broker.cert.pem + +``` + +At this point, you have a cert, `broker.cert.pem`, and a key, `broker.key-pk8.pem`, which you can use along with `ca.cert.pem` to configure TLS transport encryption for your broker and proxy nodes. + +## Configure broker + +To configure a Pulsar [broker](reference-terminology.md#broker) to use TLS transport encryption, you need to make some changes to `broker.conf`, which is located in the `conf` directory of your [Pulsar installation](getting-started-standalone.md). + +Add these values to the configuration file (substituting the appropriate certificate paths where necessary): + +```properties + +brokerServicePortTls=6651 +webServicePortTls=8081 +tlsRequireTrustedClientCertOnConnect=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +> You can find a full list of parameters available in the `conf/broker.conf` file, +> as well as the default values for those parameters, in [Broker Configuration](reference-configuration.md#broker) +> +### TLS Protocol Version and Cipher + +You can configure the broker (and proxy) to require specific TLS protocol versions and ciphers for TLS negotiation. You can use the TLS protocol versions and ciphers to stop clients from requesting downgraded TLS protocol versions or ciphers that may have weaknesses. + +Both the TLS protocol versions and cipher properties can take multiple values, separated by commas. The possible values for protocol version and ciphers depend on the TLS provider that you are using. Pulsar uses OpenSSL if it is available; otherwise, Pulsar falls back to the JDK implementation. 
+ +```properties + +tlsProtocols=TLSv1.3,TLSv1.2 +tlsCiphers=TLS_DH_RSA_WITH_AES_256_GCM_SHA384,TLS_DH_RSA_WITH_AES_256_CBC_SHA + +``` + +OpenSSL currently supports ```TLSv1.1```, ```TLSv1.2``` and ```TLSv1.3``` for the protocol version. You can acquire a list of supported ciphers from the openssl ciphers command, i.e. ```openssl ciphers -tls1_3```. + +For JDK 11, you can obtain a list of supported values from the documentation: +- [TLS protocol](https://docs.oracle.com/en/java/javase/11/security/oracle-providers.html#GUID-7093246A-31A3-4304-AC5F-5FB6400405E2__SUNJSSEPROVIDERPROTOCOLPARAMETERS-BBF75009) +- [Ciphers](https://docs.oracle.com/en/java/javase/11/security/oracle-providers.html#GUID-7093246A-31A3-4304-AC5F-5FB6400405E2__SUNJSSE_CIPHER_SUITES) + +## Proxy Configuration + +Proxies need to configure TLS in two directions, for clients connecting to the proxy, and for the proxy connecting to brokers. + +```properties + +# For clients connecting to the proxy +tlsEnabledInProxy=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +# For the proxy to connect to brokers +tlsEnabledWithBroker=true +brokerClientTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +## Client configuration + +When you enable the TLS transport encryption, you need to configure the client to use ```https://``` and port 8443 for the web service URL, and ```pulsar+ssl://``` and port 6651 for the broker service URL. + +As the server certificate that you generated above does not belong to any of the default trust chains, you also need to either specify the path to the **trust cert** (recommended), or tell the client to allow untrusted server certs. + +### Hostname verification + +Hostname verification is a TLS security feature whereby a client can refuse to connect to a server if the "CommonName" does not match the hostname to which the client is connecting. 
By default, Pulsar clients disable hostname verification, as it requires that each broker has a DNS record and a unique cert. + +Moreover, as the administrator has full control of the certificate authority, a bad actor is unlikely to be able to pull off a man-in-the-middle attack. "allowInsecureConnection" allows the client to connect to servers whose cert has not been signed by an approved CA. The client disables "allowInsecureConnection" by default, and you should always disable "allowInsecureConnection" in production environments. As long as you disable "allowInsecureConnection", a man-in-the-middle attack requires that the attacker has access to the CA. + +One scenario where you may want to enable hostname verification is where you have multiple proxy nodes behind a VIP, and the VIP has a DNS record, for example, pulsar.mycompany.com. In this case, you can generate a TLS cert with pulsar.mycompany.com as the "CommonName," and then enable hostname verification on the client. + +The examples below show that hostname verification is disabled for the CLI tools/Java/Python/C++/Node.js/C# clients by default. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools.md#pulsar-admin), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ +You need to add the following parameters to that file to use TLS transport with the CLI tools of Pulsar: + +```properties + +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +tlsEnableHostnameVerification=false + +``` + +#### Java client + +```java + +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .enableTlsHostnameVerification(false) // false by default, in any case + .allowTlsInsecureConnection(false) // false by default, in any case + .build(); + +``` + +#### Python client + +```python + +from pulsar import Client + +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_hostname_verification=False, + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False) # defaults to false from v2.2.0 onwards + +``` + +#### C++ client + +```c++ + +#include <pulsar/Client.h> + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); // shouldn't be needed soon +config.setTlsTrustCertsFilePath(caPath); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create(clientPublicKeyPath, clientPrivateKeyPath)); +config.setValidateHostName(false); + +``` + +#### Node.js client + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + useTls: true, + tlsValidateHostname: false, + tlsAllowInsecureConnection: false, + }); +})(); + +``` + +#### C# client + +```c# + +var certificate = new X509Certificate2("ca.cert.pem"); +var client = PulsarClient.Builder() + .TrustedCertificateAuthority(certificate) //If the CA is not trusted on the host, 
you can add it explicitly. + .VerifyCertificateAuthority(true) //Default is 'true' + .VerifyCertificateName(false) //Default is 'false' + .Build(); + +``` + +> Note that `VerifyCertificateName` refers to the configuration of hostname verification in the C# client. diff --git a/site2/website/versioned_docs/version-2.10.x/security-token-admin.md b/site2/website/versioned_docs/version-2.10.x/security-token-admin.md new file mode 100644 index 0000000000000..a265f6320d28f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/security-token-admin.md @@ -0,0 +1,183 @@ +--- +id: security-token-admin +title: Token authentication admin +sidebar_label: "Token authentication admin" +original_id: security-token-admin +--- + +## Token Authentication Overview + +Pulsar supports authenticating clients using security tokens that are based on [JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). + +Tokens are used to identify a Pulsar client and associate with some "principal" (or "role") which +will be then granted permissions to do some actions (eg: publish or consume from a topic). + +A user will typically be given a token string by an administrator (or some automated service). + +The compact representation of a signed JWT is a string that looks like: + +``` + + eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +Application will specify the token when creating the client instance. An alternative is to pass +a "token supplier", that is to say a function that returns the token when the client library +will need one. + +> #### Always use TLS transport encryption +> Sending a token is equivalent to sending a password over the wire. It is strongly recommended to +> always use TLS encryption when talking to the Pulsar service. 
See +> [Transport Encryption using TLS](security-tls-transport.md) + +## Secret vs Public/Private keys + +JWT supports two different kinds of keys in order to generate and validate the tokens: + + * Symmetric : + - there is a single ***Secret*** key that is used both to generate and validate + * Asymmetric: there is a pair of keys. + - ***Private*** key is used to generate tokens + - ***Public*** key is used to validate tokens + +### Secret key + +When using a secret key, the administrator creates the key and +uses it to generate the client tokens. This key is also configured on +the brokers to allow them to validate the clients. + +#### Creating a secret key + +> Output file will be generated in the root of your pulsar installation directory. You can also provide an absolute path for the output file. + +```shell + +$ bin/pulsar tokens create-secret-key --output my-secret.key + +``` + +To generate a base64-encoded secret key: + +```shell + +$ bin/pulsar tokens create-secret-key --output /opt/my-secret.key --base64 + +``` + +### Public/Private keys + +With public/private, we need to create a pair of keys. Pulsar supports all algorithms supported by the Java JWT library shown [here](https://github.com/jwtk/jjwt#signature-algorithms-keys) + +#### Creating a key pair + +> Output file will be generated in the root of your pulsar installation directory. You can also provide an absolute path for the output file. + +```shell + +$ bin/pulsar tokens create-key-pair --output-private-key my-private.key --output-public-key my-public.key + +``` + + * `my-private.key` will be stored in a safe location and only used by the administrator to generate + new tokens. + * `my-public.key` will be distributed to all Pulsar brokers. This file can be publicly shared without + any security concern. + +## Generating tokens + +A token is the credential associated with a user. The association is done through the "principal", +or "role". 
In the case of JWT tokens, this field is typically referred to as **subject**, though +it's exactly the same concept. + +The generated token is then required to have a **subject** field set. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user + +``` + +This will print the token string on stdout. + +Similarly, one can create a token by passing the "private" key: + +```shell + +$ bin/pulsar tokens create --private-key file:///path/to/my-private.key \ + --subject test-user + +``` + +Finally, a token can also be created with a pre-defined TTL. After that time, +the token will be automatically invalidated. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user \ + --expiry-time 1y + +``` + +## Authorization + +The token itself doesn't have any permission associated. That will be determined by the +authorization engine. Once the token is created, one can grant permission for this token to do certain +actions. For example: + +```shell + +$ bin/pulsar-admin namespaces grant-permission my-tenant/my-namespace \ + --role test-user \ + --actions produce,consume + +``` + +## Enabling Token Authentication ... + +### ... on Brokers + +To configure brokers to authenticate clients, put the following in `broker.conf`: + +```properties + +# Configuration to enable authentication and authorization +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken + +# If using secret key (Note: key files must be DER-encoded) +tokenSecretKey=file:///path/to/secret.key +# The key can also be passed inline: +# tokenSecretKey=data:;base64,FLFyW0oLJ2Fi22KKCm21J18mbAdztfSHN/lAT5ucEKU= + +# If using public/private (Note: key files must be DER-encoded) +# tokenPublicKey=file:///path/to/public.key + +``` + +### ... 
on Proxies + +To configure proxies to authenticate clients, put the following in `proxy.conf`: + +The proxy will have its own token used when talking to brokers. The role token for this +key pair should be configured in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. + +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken +tokenSecretKey=file:///path/to/secret.key + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Or, alternatively, read token from file +# brokerClientAuthenticationParameters=file:///path/to/proxy-token.txt + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/sql-deployment-configurations.md b/site2/website/versioned_docs/version-2.10.x/sql-deployment-configurations.md new file mode 100644 index 0000000000000..0178ca41e017c --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/sql-deployment-configurations.md @@ -0,0 +1,277 @@ +--- +id: sql-deployment-configurations +title: Pulsar SQL configuration and deployment +sidebar_label: "Configuration and deployment" +original_id: sql-deployment-configurations +--- + +You can configure Presto Pulsar connector and deploy a cluster with the following instruction. + +## Configure Presto Pulsar Connector +You can configure Presto Pulsar Connector in the `${project.root}/conf/presto/catalog/pulsar.properties` properties file. The configuration for the connector and the default values are as follows. 
+ +```properties + +# name of the connector to be displayed in the catalog +connector.name=pulsar + +# the url of Pulsar broker service +pulsar.web-service-url=http://localhost:8080 + +# URI of Zookeeper cluster +pulsar.zookeeper-uri=localhost:2181 + +# minimum number of entries to read at a single time +pulsar.entry-read-batch-size=100 + +# default number of splits to use per query +pulsar.target-num-splits=4 + +# max size of one batch message (default value is 5MB) +pulsar.max-message-size=5242880 + +# number of split used when querying data from pulsar +pulsar.target-num-splits=2 + +# size of queue to buffer entry read from pulsar +pulsar.max-split-entry-queue-size=1000 + +# size of queue to buffer message extract from entries +pulsar.max-split-message-queue-size=10000 + +# status provider to record connector metrics +pulsar.stats-provider=org.apache.bookkeeper.stats.NullStatsProvider + +# config in map format for stats provider e.g. {"key1":"val1","key2":"val2"} +pulsar.stats-provider-configs={} + +# whether to rewrite Pulsar's default topic delimiter '/' +pulsar.namespace-delimiter-rewrite-enable=false + +# delimiter used to rewrite Pulsar's default delimiter '/', use if default is causing incompatibility with other system like Superset +pulsar.rewrite-namespace-delimiter=“/” + +# maximum number of thread pool size for ledger offloader. +pulsar.managed-ledger-offload-max-threads=2 + +# driver used to offload or read cold data to or from long-term storage +pulsar.managed-ledger-offload-driver=null + +# directory to load offloaders nar file. +pulsar.offloaders-directory="./offloaders" + +# properties and configurations related to specific offloader implementation as map e.g. {"key1":"val1","key2":"val2"} +pulsar.offloader-properties={} + +# authentication plugin used to authenticate to Pulsar cluster +pulsar.auth-plugin=null + +# authentication parameter used to authenticate to the Pulsar cluster as a string e.g. "key1:val1,key2:val2". 
+pulsar.auth-params=null + +# whether the Pulsar client accept an untrusted TLS certificate from broker +pulsar.tls-allow-insecure-connection=null + +# whether to allow hostname verification when a client connects to broker over TLS. +pulsar.tls-hostname-verification-enable=null + +# path for the trusted TLS certificate file of Pulsar broker +pulsar.tls-trust-cert-file-path=null + +# set the threshold for BookKeeper request throttle, default is disabled +pulsar.bookkeeper-throttle-value=0 + +# set the number of IO thread +pulsar.bookkeeper-num-io-threads=2 * Runtime.getRuntime().availableProcessors() + +# set the number of worker thread +pulsar.bookkeeper-num-worker-threads=Runtime.getRuntime().availableProcessors() + +# whether to use BookKeeper V2 wire protocol +pulsar.bookkeeper-use-v2-protocol=true + +# interval to check the need for sending an explicit LAC, default is disabled +pulsar.bookkeeper-explicit-interval=0 + +# size for managed ledger entry cache (in MB). +pulsar.managed-ledger-cache-size-MB=0 + +# number of threads to be used for managed ledger tasks dispatching +pulsar.managed-ledger-num-worker-threads=Runtime.getRuntime().availableProcessors() + +# number of threads to be used for managed ledger scheduled tasks +pulsar.managed-ledger-num-scheduler-threads=Runtime.getRuntime().availableProcessors() + +# directory used to store extraction NAR file +pulsar.nar-extraction-directory=System.getProperty("java.io.tmpdir") + +``` + +You can connect Presto to a Pulsar cluster with multiple hosts. To configure multiple hosts for brokers, add multiple URLs to `pulsar.web-service-url`. To configure multiple hosts for ZooKeeper, add multiple URIs to `pulsar.zookeeper-uri`. The following is an example. + +``` + +pulsar.web-service-url=http://localhost:8080,localhost:8081,localhost:8082 +pulsar.zookeeper-uri=localhost1,localhost2:2181 + +``` + +**Note: by default, Pulsar SQL does not get the last message in a topic**. It is by design and controlled by settings. 
By default, BookKeeper LAC only advances when subsequent entries are added. If there is no subsequent entry added, the last written entry is not visible to readers until the ledger is closed. This is not a problem for Pulsar which uses managed ledger, but Pulsar SQL directly reads from BookKeeper ledger. + +If you want to get the last message in a topic, set the following configurations: + +1. For the broker configuration, set `bookkeeperExplicitLacIntervalInMills` > 0 in `broker.conf` or `standalone.conf`. + +2. For the Presto configuration, set `pulsar.bookkeeper-explicit-interval` > 0 and `pulsar.bookkeeper-use-v2-protocol=false`. + +However, using BookKeeper V3 protocol introduces additional GC overhead to BK as it uses Protobuf. + +## Query data from existing Presto clusters + +If you already have a Presto cluster, you can copy the Presto Pulsar connector plugin to your existing cluster. Download the archived plugin package with the following command. + +```bash + +$ wget pulsar:binary_release_url + +``` + +## Deploy a new cluster + +Since Pulsar SQL is powered by [Trino (formerly Presto SQL)](https://trino.io), the configuration for deployment is the same for the Pulsar SQL worker. + +:::note + +For how to set up a standalone single node environment, refer to [Query data](sql-getting-started.md). + +::: + +You can use the same CLI args as the Presto launcher. 
+ +```bash + +$ ./bin/pulsar sql-worker --help +Usage: launcher [options] command + +Commands: run, start, stop, restart, kill, status + +Options: + -h, --help show this help message and exit + -v, --verbose Run verbosely + --etc-dir=DIR Defaults to INSTALL_PATH/etc + --launcher-config=FILE + Defaults to INSTALL_PATH/bin/launcher.properties + --node-config=FILE Defaults to ETC_DIR/node.properties + --jvm-config=FILE Defaults to ETC_DIR/jvm.config + --config=FILE Defaults to ETC_DIR/config.properties + --log-levels-file=FILE + Defaults to ETC_DIR/log.properties + --data-dir=DIR Defaults to INSTALL_PATH + --pid-file=FILE Defaults to DATA_DIR/var/run/launcher.pid + --launcher-log-file=FILE + Defaults to DATA_DIR/var/log/launcher.log (only in + daemon mode) + --server-log-file=FILE + Defaults to DATA_DIR/var/log/server.log (only in + daemon mode) + -D NAME=VALUE Set a Java system property + +``` + +The default configuration for the cluster is located in `${project.root}/conf/presto`. You can customize your deployment by modifying the default configuration. + +You can set the worker to read from a different configuration directory, or set a different directory to write data. + +```bash + +$ ./bin/pulsar sql-worker run --etc-dir /tmp/incubator-pulsar/conf/presto --data-dir /tmp/presto-1 + +``` + +You can start the worker as daemon process. + +```bash + +$ ./bin/pulsar sql-worker start + +``` + +### Deploy a cluster on multiple nodes + +You can deploy a Pulsar SQL cluster or Presto cluster on multiple nodes. The following example shows how to deploy a cluster on three-node cluster. + +1. Copy the Pulsar binary distribution to three nodes. + +The first node runs as Presto coordinator. The minimal configuration requirement in the `${project.root}/conf/presto/config.properties` file is as follows. 
+ +```properties + +coordinator=true +node-scheduler.include-coordinator=true +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery-server.enabled=true +discovery.uri= + +``` + +The other two nodes serve as worker nodes. You can use the following configuration for worker nodes. + +```properties + +coordinator=false +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery.uri= + +``` + +2. Modify `pulsar.web-service-url` and `pulsar.zookeeper-uri` configuration in the `${project.root}/conf/presto/catalog/pulsar.properties` file accordingly for the three nodes. + +3. Start the coordinator node. + +``` + +$ ./bin/pulsar sql-worker run + +``` + +4. Start worker nodes. + +``` + +$ ./bin/pulsar sql-worker run + +``` + +5. Start the SQL CLI and check the status of your cluster. + +```bash + +$ ./bin/pulsar sql --server + +``` + +6. Check the status of your nodes. + +```bash + +presto> SELECT * FROM system.runtime.nodes; + node_id | http_uri | node_version | coordinator | state +---------+-------------------------+--------------+-------------+-------- + 1 | http://192.168.2.1:8081 | testversion | true | active + 3 | http://192.168.2.2:8081 | testversion | false | active + 2 | http://192.168.2.3:8081 | testversion | false | active + +``` + +For more information about deployment in Presto, refer to [Presto deployment](https://trino.io/docs/current/installation/deployment.html). + +:::note + +The broker does not advance LAC, so when Pulsar SQL bypasses the broker to query data, it can only read entries up to the LAC that all the bookies learned. You can enable periodic LAC writes on the broker by setting "bookkeeperExplicitLacIntervalInMills" in the broker.conf. 
+ +::: + diff --git a/site2/website/versioned_docs/version-2.10.x/sql-getting-started.md b/site2/website/versioned_docs/version-2.10.x/sql-getting-started.md new file mode 100644 index 0000000000000..8a5cd7199b365 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/sql-getting-started.md @@ -0,0 +1,187 @@ +--- +id: sql-getting-started +title: Query data with Pulsar SQL +sidebar_label: "Query data" +original_id: sql-getting-started +--- + +Before querying data in Pulsar, you need to install Pulsar and built-in connectors. + +## Requirements +1. Install [Pulsar](getting-started-standalone.md#install-pulsar-standalone). +2. Install Pulsar [built-in connectors](getting-started-standalone.md#install-builtin-connectors-optional). + +## Query data in Pulsar +To query data in Pulsar with Pulsar SQL, complete the following steps. + +1. Start a Pulsar standalone cluster. + +```bash + +./bin/pulsar standalone + +``` + +2. Start a Pulsar SQL worker. + +```bash + +./bin/pulsar sql-worker run + +``` + +3. After initializing Pulsar standalone cluster and the SQL worker, run SQL CLI. + +```bash + +./bin/pulsar sql + +``` + +4. Test with SQL commands. + +```bash + +presto> show catalogs; + Catalog +--------- + pulsar + system +(2 rows) + +Query 20180829_211752_00004_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + + +presto> show schemas in pulsar; + Schema +----------------------- + information_schema + public/default + public/functions + sample/standalone/ns1 +(4 rows) + +Query 20180829_211818_00005_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [4 rows, 89B] [21 rows/s, 471B/s] + + +presto> show tables in pulsar."public/default"; + Table +------- +(0 rows) + +Query 20180829_211839_00006_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + +``` + +Since there is no data in Pulsar, no records is returned. + +5. 
Start the built-in connector _DataGeneratorSource_ and ingest some mock data. + +```bash + +./bin/pulsar-admin sources create --name generator --destinationTopicName generator_test --source-type data-generator + +``` + +And then you can query a topic in the namespace "public/default". + +```bash + +presto> show tables in pulsar."public/default"; + Table +---------------- + generator_test +(1 row) + +Query 20180829_213202_00000_csyeu, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:02 [1 rows, 38B] [0 rows/s, 17B/s] + +``` + +You can now query the data within the topic "generator_test". + +```bash + +presto> select * from pulsar."public/default".generator_test; + + firstname | middlename | lastname | email | username | password | telephonenumber | age | companyemail | nationalidentitycardnumber | +-------------+-------------+-------------+----------------------------------+--------------+----------+-----------------+-----+-----------------------------------------------+----------------------------+ + Genesis | Katherine | Wiley | genesis.wiley@gmail.com | genesisw | y9D2dtU3 | 959-197-1860 | 71 | genesis.wiley@interdemconsulting.eu | 880-58-9247 | + Brayden | | Stanton | brayden.stanton@yahoo.com | braydens | ZnjmhXik | 220-027-867 | 81 | brayden.stanton@supermemo.eu | 604-60-7069 | + Benjamin | Julian | Velasquez | benjamin.velasquez@yahoo.com | benjaminv | 8Bc7m3eb | 298-377-0062 | 21 | benjamin.velasquez@hostesltd.biz | 213-32-5882 | + Michael | Thomas | Donovan | donovan@mail.com | michaeld | OqBm9MLs | 078-134-4685 | 55 | michael.donovan@memortech.eu | 443-30-3442 | + Brooklyn | Avery | Roach | brooklynroach@yahoo.com | broach | IxtBLafO | 387-786-2998 | 68 | brooklyn.roach@warst.biz | 085-88-3973 | + Skylar | | Bradshaw | skylarbradshaw@yahoo.com | skylarb | p6eC6cKy | 210-872-608 | 96 | skylar.bradshaw@flyhigh.eu | 453-46-0334 | +. +. +. + +``` + +You can query the mock data. 
+ +## Query your own data +If you want to query your own data, you need to ingest your own data first. You can write a simple producer and write custom defined data to Pulsar. The following is an example. + +```java + +public class TestProducer { + + public static class Foo { + private int field1 = 1; + private String field2; + private long field3; + + public Foo() { + } + + public int getField1() { + return field1; + } + + public void setField1(int field1) { + this.field1 = field1; + } + + public String getField2() { + return field2; + } + + public void setField2(String field2) { + this.field2 = field2; + } + + public long getField3() { + return field3; + } + + public void setField3(long field3) { + this.field3 = field3; + } + } + + public static void main(String[] args) throws Exception { + PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); + Producer producer = pulsarClient.newProducer(AvroSchema.of(Foo.class)).topic("test_topic").create(); + + for (int i = 0; i < 1000; i++) { + Foo foo = new Foo(); + foo.setField1(i); + foo.setField2("foo" + i); + foo.setField3(System.currentTimeMillis()); + producer.newMessage().value(foo).send(); + } + producer.close(); + pulsarClient.close(); + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/sql-overview.md b/site2/website/versioned_docs/version-2.10.x/sql-overview.md new file mode 100644 index 0000000000000..8ba19d053003d --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/sql-overview.md @@ -0,0 +1,18 @@ +--- +id: sql-overview +title: Pulsar SQL Overview +sidebar_label: "Overview" +original_id: sql-overview +--- + +Apache Pulsar is used to store streams of event data, and the event data is structured with predefined fields. With the implementation of the [Schema Registry](schema-get-started.md), you can store structured data in Pulsar and query the data by using [Trino (formerly Presto SQL)](https://trino.io/). 
+ +As the core of Pulsar SQL, the Presto Pulsar connector enables Presto workers within a Presto cluster to query data from Pulsar. + +![The Pulsar consumer and reader interfaces](/assets/pulsar-sql-arch-2.png) + +The query performance is efficient and highly scalable, because Pulsar adopts a [two-level, segment-based architecture](concepts-architecture-overview.md#apache-bookkeeper). + +Topics in Pulsar are stored as segments in [Apache BookKeeper](https://bookkeeper.apache.org/). Each topic segment is replicated to some BookKeeper nodes, which enables concurrent reads and high read throughput. You can configure the number of BookKeeper nodes, and the default number is `3`. In the Presto Pulsar connector, data is read directly from BookKeeper, so Presto workers can read concurrently from a horizontally scalable number of BookKeeper nodes. + +![The Pulsar consumer and reader interfaces](/assets/pulsar-sql-arch-1.png) diff --git a/site2/website/versioned_docs/version-2.10.x/sql-rest-api.md b/site2/website/versioned_docs/version-2.10.x/sql-rest-api.md new file mode 100644 index 0000000000000..c92fd62f7d870 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/sql-rest-api.md @@ -0,0 +1,192 @@ +--- +id: sql-rest-api +title: Pulsar SQL REST APIs +sidebar_label: "REST APIs" +original_id: sql-rest-api +--- + +This section lists resources that make up the Presto REST API v1. + +## Request for Presto services + +All requests for Presto services should use version 1 of the Presto REST API. + +To request services, use the explicit URL `http://presto.service:8081/v1`. You need to replace `presto.service:8081` with your real Presto address before sending requests. + +`POST` requests require the `X-Presto-User` header. If you use authentication, you must use the same `username` that is specified in the authentication configuration. If you do not use authentication, you can specify anything for `username`. 
+ +```properties + +X-Presto-User: username + +``` + +For more information about headers, refer to [PrestoHeaders](https://github.com/trinodb/trino). + +## Schema + +You can use statement in the HTTP body. All data is received as JSON document that might contain a `nextUri` link. If the received JSON document contains a `nextUri` link, the request continues with the `nextUri` link until the received data does not contain a `nextUri` link. If no error is returned, the query completes successfully. If an `error` field is displayed in `stats`, it means the query fails. + +The following is an example of `show catalogs`. The query continues until the received JSON document does not contain a `nextUri` link. Since no `error` is displayed in `stats`, it means that the query completes successfully. + +```powershell + +➜ ~ curl --header "X-Presto-User: test-user" --request POST --data 'show catalogs' http://localhost:8081/v1/statement +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "stats" : { + "queued" : true, + "nodes" : 0, + "userTimeMillis" : 0, + "cpuTimeMillis" : 0, + "wallTimeMillis" : 0, + "processedBytes" : 0, + "processedRows" : 0, + "runningSplits" : 0, + "queuedTimeMillis" : 0, + "queuedSplits" : 0, + "completedSplits" : 0, + "totalSplits" : 0, + "scheduled" : false, + "peakMemoryBytes" : 0, + "state" : "QUEUED", + "elapsedTimeMillis" : 0 + }, + "id" : "20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1" +} + +➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1 +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2", + "id" : "20191113_033653_00006_dg6hb", + "stats" : { + "state" : "PLANNING", + "totalSplits" : 0, + "queued" : false, + "userTimeMillis" : 0, + "completedSplits" : 0, + "scheduled" : false, + "wallTimeMillis" : 0, + 
"runningSplits" : 0, + "queuedSplits" : 0, + "cpuTimeMillis" : 0, + "processedRows" : 0, + "processedBytes" : 0, + "nodes" : 0, + "queuedTimeMillis" : 1, + "elapsedTimeMillis" : 2, + "peakMemoryBytes" : 0 + } +} + +➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2 +{ + "id" : "20191113_033653_00006_dg6hb", + "data" : [ + [ + "pulsar" + ], + [ + "system" + ] + ], + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "columns" : [ + { + "typeSignature" : { + "rawType" : "varchar", + "arguments" : [ + { + "kind" : "LONG_LITERAL", + "value" : 6 + } + ], + "literalArguments" : [], + "typeArguments" : [] + }, + "name" : "Catalog", + "type" : "varchar(6)" + } + ], + "stats" : { + "wallTimeMillis" : 104, + "scheduled" : true, + "userTimeMillis" : 14, + "progressPercentage" : 100, + "totalSplits" : 19, + "nodes" : 1, + "cpuTimeMillis" : 16, + "queued" : false, + "queuedTimeMillis" : 1, + "state" : "FINISHED", + "peakMemoryBytes" : 0, + "elapsedTimeMillis" : 111, + "processedBytes" : 0, + "processedRows" : 0, + "queuedSplits" : 0, + "rootStage" : { + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1, + "subStages" : [ + { + "cpuTimeMillis" : 14, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 17, + "subStages" : [ + { + "wallTimeMillis" : 7, + "subStages" : [], + "stageId" : "2", + "done" : true, + "nodes" : 1, + "totalSplits" : 1, + "processedBytes" : 22, + "processedRows" : 2, + "queuedSplits" : 0, + "userTimeMillis" : 1, + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1 + } + ], + "wallTimeMillis" : 92, + "nodes" : 1, + "done" : true, + "stageId" : "1", + "userTimeMillis" : 12, + "processedRows" : 2, + "processedBytes" : 51, + "queuedSplits" : 0, + "totalSplits" : 17 + } + ], + "wallTimeMillis" : 5, + "done" : true, + "nodes" : 1, + "stageId" : "0", + "userTimeMillis" : 1, + "processedRows" : 2, + "processedBytes" 
: 22, + "totalSplits" : 1, + "queuedSplits" : 0 + }, + "runningSplits" : 0, + "completedSplits" : 19 + } +} + +``` + +:::note + +Since the response data is not in sync with the query state from the perspective of clients, you cannot rely on the response data to determine whether the query completes. + +::: + +For more information about Presto REST API, refer to [Presto HTTP Protocol](https://github.com/prestosql/presto/wiki/HTTP-Protocol). diff --git a/site2/website/versioned_docs/version-2.10.x/standalone.md b/site2/website/versioned_docs/version-2.10.x/standalone.md new file mode 100644 index 0000000000000..0a10f3ae21369 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/standalone.md @@ -0,0 +1,268 @@ +--- +id: standalone +title: Set up a standalone Pulsar locally +sidebar_label: "Run Pulsar locally" +original_id: standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. The standalone mode includes a Pulsar broker, the necessary [RocksDB](http://rocksdb.org/) and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> **Pulsar in production?** +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of installing Pulsar locally. + +### System requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions. + +:::tip + +By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in the `conf/pulsar_env.sh` file under `PULSAR_MEM`. These are extra options passed into the JVM. + +::: + +:::note + +Broker is only supported on 64-bit JVM. 
+ +::: + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar @pulsar:version@ binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:binary_release_url + + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](/tools/pulsar-admin/). +`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker) and more.
    **Note:** Pulsar standalone uses RocksDB as the local metadata store and its configuration file path [`metadataStoreConfigPath`](reference-configuration.md) is configurable in the `standalone.conf` file. For more information about the configurations of RocksDB, see [here](https://github.com/facebook/rocksdb/blob/main/examples/rocksdb_option_file_example.ini) and related [documentation](https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide). +`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) examples. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the `.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by RocksDB and BookKeeper. +`logs` | Logs created by the installation. + +:::tip + +If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +* [Install builtin connectors (optional)](#install-builtin-connectors-optional) +* [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing builtin connectors and tiered storage offloaders. + +::: + +### Install builtin connectors (optional) + +Since `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. 
+To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. +For example, if you download the `pulsar-io-aerospike-@pulsar:version@.nar` connector file, enter the following commands: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure `connectors` tarball is unzipped in every pulsar directory of the broker (or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. The `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +::: + +### Install tiered storage offloaders (optional) + +:::tip + +- Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +- To enable tiered storage feature, follow the instructions below; otherwise skip this section. 
+ +::: + +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or DC/OS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +::: + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. 
+ +```bash + +$ bin/pulsar standalone + +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash + +21:59:29.327 [DLM-/stream/storage-OrderedScheduler-3-0] INFO org.apache.bookkeeper.stream.storage.impl.sc.StorageContainerImpl - Successfully started storage container (0). +21:59:34.576 [main] INFO org.apache.pulsar.broker.authentication.AuthenticationService - Authentication is disabled +21:59:34.576 [main] INFO org.apache.pulsar.websocket.WebSocketService - Pulsar WebSocket Service started + +``` + +:::tip + +* The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. + +::: + +You can also run the service as a background process using the `bin/pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](reference-cli-tools.md#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). + +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. 
+ +### Consume a message + +The following command consumes a message with the subscription name `first-subscription` from the `my-topic` topic: + +```bash + +$ bin/pulsar-client consume my-topic -s "first-subscription" + +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:17:16.781 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully consumed + +``` + +:::tip + +As you may have noticed, the `my-topic` topic, from which the message is consumed, was never explicitly created. When you consume a message from a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +::: + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" + +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:21:08.693 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced + +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +:::tip + +If the service runs as a background process using the `bin/pulsar-daemon start standalone` command, then use the `bin/pulsar-daemon stop standalone` command to stop the service. +For more information, see [pulsar-daemon](reference-cli-tools.md#pulsar-daemon). 
+ +::: + diff --git a/site2/website/versioned_docs/version-2.10.x/tiered-storage-aliyun.md b/site2/website/versioned_docs/version-2.10.x/tiered-storage-aliyun.md new file mode 100644 index 0000000000000..89dc53cda7604 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/tiered-storage-aliyun.md @@ -0,0 +1,257 @@ +--- +id: tiered-storage-aliyun +title: Use Aliyun OSS offloader with Pulsar +sidebar_label: "Aliyun OSS offloader" +original_id: tiered-storage-aliyun +--- + +This chapter guides you through every step of installing and configuring the Aliyun Object Storage Service (OSS) offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the Aliyun OSS offloader. + +### Prerequisite + +- Pulsar: 2.8.0 or later versions + +### Step + +This example uses Pulsar 2.8.0. + +1. Download the Pulsar tarball, see [here](standalone.md#install-pulsar-using-binary-release). + +2. Download and untar the Pulsar offloaders package, then copy the Pulsar offloaders as `offloaders` in the Pulsar directory, see [here](standalone.md#install-tiered-storage-offloaders-optional). + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/), [GCS](https://cloud.google.com/storage/), [Azure](https://portal.azure.com/#home), and [Aliyun OSS](https://www.aliyun.com/product/oss) for long-term storage. + + ``` + + tiered-storage-file-system-2.8.0.nar + tiered-storage-jcloud-2.8.0.nar + + ``` + + :::note + + * If you are running Pulsar in a bare-metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image. The `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. 
+ + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to Aliyun OSS, you need to configure some properties of the Aliyun OSS offload driver. + +::: + +Besides, you can also configure the Aliyun OSS offloader to run it automatically or trigger it manually. + +### Configure Aliyun OSS offloader driver + +You can configure the Aliyun OSS offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + | Required configuration | Description | Example value | + | --- | --- |--- | + | `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | aliyun-oss | + | `offloadersDirectory` | Offloader directory | offloaders | + | `managedLedgerOffloadBucket` | Bucket | pulsar-topic-offload | + | `managedLedgerOffloadServiceEndpoint` | Endpoint | http://oss-cn-hongkong.aliyuncs.com | + +- **Optional** configurations are as below. + + | Optional | Description | Example value | + | --- | --- | --- | + | `managedLedgerOffloadReadBufferSizeInBytes` | Size of block read | 1 MB | + | `managedLedgerOffloadMaxBlockSizeInBytes` | Size of block write | 64 MB | + | `managedLedgerMinLedgerRolloverTimeMinutes` | Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment. | 2 | + | `managedLedgerMaxEntriesPerLedger` | Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment. | 5000 | + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in Aliyun OSS must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +managedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Endpoint (required) + +The endpoint is the region where a bucket is located. + +:::tip + +For more information about Aliyun OSS regions and endpoints, see [International website](https://www.alibabacloud.com/help/doc-detail/31837.htm) or [Chinese website](https://help.aliyun.com/document_detail/31837.html). + +::: + + +##### Example + +This example sets the endpoint as _oss-us-west-1-internal_. + +``` + +managedLedgerOffloadServiceEndpoint=http://oss-us-west-1-internal.aliyuncs.com + +``` + +#### Authentication (required) + +To be able to access Aliyun OSS, you need to authenticate with Aliyun OSS. + +Set the environment variables `ALIYUN_OSS_ACCESS_KEY_ID` and `ALIYUN_OSS_ACCESS_KEY_SECRET` in `conf/pulsar_env.sh`. + +"export" is important so that the variables are made available in the environment of spawned processes. + +```bash + +export ALIYUN_OSS_ACCESS_KEY_ID=ABC123456789 +export ALIYUN_OSS_ACCESS_KEY_SECRET=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from Aliyun OSS in the configuration file `broker.conf` or `standalone.conf`. + +| Configuration | Description | Default value | +| --- | --- | --- | +| `managedLedgerOffloadReadBufferSizeInBytes` | Block size for each individual read when reading back data from Aliyun OSS. 
| 1 MB | +| `managedLedgerOffloadMaxBlockSizeInBytes` | Maximum size of a "part" sent during a multipart upload to Aliyun OSS. It **cannot** be smaller than 5 MB. | 64 MB | + +### Run Aliyun OSS offloader automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. + +| Threshold value | Action | +| --- | --- | +| > 0 | It triggers the offloading operation if the topic storage reaches its threshold. | +| = 0 | It causes a broker to offload data as soon as possible. | +| < 0 | It disables automatic offloading operation. | + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace-level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](/tools/pulsar-admin/) command. + +#### Example + +This example sets the Aliyun OSS offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). 
+ +::: + +### Run Aliyun OSS offloader manually + +For individual topics, you can trigger the Aliyun OSS offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to Aliyun OSS until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the Aliyun OSS offloader to run manually using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). + + ::: + +- This example checks the Aliyun OSS offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the Aliyun OSS offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. 
+ + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). + + ::: + diff --git a/site2/website/versioned_docs/version-2.10.x/tiered-storage-aws.md b/site2/website/versioned_docs/version-2.10.x/tiered-storage-aws.md new file mode 100644 index 0000000000000..11905bbb09ea4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/tiered-storage-aws.md @@ -0,0 +1,329 @@ +--- +id: tiered-storage-aws +title: Use AWS S3 offloader with Pulsar +sidebar_label: "AWS S3 offloader" +original_id: tiered-storage-aws +--- + +This chapter guides you through every step of installing and configuring the AWS S3 offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the AWS S3 offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or later versions + +### Step + +This example uses Pulsar 2.5.1. + +1. 
Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download from the Pulsar [downloads page](/download) + + * Use [wget](https://www.gnu.org/software/wget): + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/) and [GCS](https://cloud.google.com/storage/) for long term storage. + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to AWS S3, you need to configure some properties of the AWS S3 offload driver. + +::: + +Besides, you can also configure the AWS S3 offloader to run it automatically or trigger it manually. + +### Configure AWS S3 offloader driver + +You can configure the AWS S3 offloader driver in the configuration file `broker.conf` or `standalone.conf`. 
+ +- **Required** configurations are as below. + + Required configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive.

    **Note**: there is a third driver type, S3, which is identical to AWS S3, though S3 requires that you specify an endpoint URL using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if using an S3 compatible data store other than AWS S3. | aws-s3 + `offloadersDirectory` | Offloader directory | offloaders + `s3ManagedLedgerOffloadBucket` | Bucket | pulsar-topic-offload + +- **Optional** configurations are as below. + + Optional | Description | Example value + |---|---|--- + `s3ManagedLedgerOffloadRegion` | Bucket region

    **Note**: before specifying a value for this parameter, you need to set the following configurations. Otherwise, you might get an error.

    - Set [`s3ManagedLedgerOffloadServiceEndpoint`](https://docs.aws.amazon.com/general/latest/gr/s3.html).

    Example
    `s3ManagedLedgerOffloadServiceEndpoint=https://s3.YOUR_REGION.amazonaws.com`

    - Grant `GetBucketLocation` permission to a user.

    For how to grant `GetBucketLocation` permission to a user, see [here](https://docs.aws.amazon.com/AmazonS3/latest/dev/using-with-s3-actions.html#using-with-s3-actions-related-to-buckets).| eu-west-3 + `s3ManagedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `s3ManagedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in AWS S3 must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +s3ManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Bucket region + +A bucket region is a region where a bucket is located. If a bucket region is not specified, the **default** region (`US East (N. Virginia)`) is used. + +:::tip + +For more information about AWS regions and endpoints, see [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). + +::: + + +##### Example + +This example sets the bucket region as _eu-west-3_. + +``` + +s3ManagedLedgerOffloadRegion=eu-west-3 + +``` + +#### Authentication (required) + +To be able to access AWS S3, you need to authenticate with AWS S3. + +Pulsar does not provide any direct methods of configuring authentication for AWS S3, +but relies on the mechanisms supported by the [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). + +Once you have created a set of credentials in the AWS IAM console, you can configure credentials using one of the following methods. + +* Use EC2 instance metadata credentials. + + If you are on an AWS instance with an instance profile that provides credentials, Pulsar uses these credentials if no other mechanism is provided. + +* Set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` in `conf/pulsar_env.sh`. + + "export" is important so that the variables are made available in the environment of spawned processes. 
+ + ```bash + + export AWS_ACCESS_KEY_ID=ABC123456789 + export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +* Add the Java system properties `aws.accessKeyId` and `aws.secretKey` to `PULSAR_EXTRA_OPTS` in `conf/pulsar_env.sh`. + + ```bash + + PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacity.default=1000 -Dio.netty.recycler.linkCapacity=1024" + + ``` + +* Set the access credentials in `~/.aws/credentials`. + + ```conf + + [default] + aws_access_key_id=ABC123456789 + aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +* Assume an IAM role. + + This example uses the `DefaultAWSCredentialsProviderChain` for assuming this role. + + The broker must be rebooted for credentials specified in `pulsar_env` to take effect. + + ```conf + + s3ManagedLedgerOffloadRole= + s3ManagedLedgerOffloadRoleSessionName=pulsar-s3-offload + + ``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from AWS S3 in the configuration file `broker.conf` or `standalone.conf`. + +Configuration|Description|Default value +|---|---|--- +`s3ManagedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from AWS S3.|1 MB +`s3ManagedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to AWS S3. It **cannot** be smaller than 5 MB. |64 MB + +### Configure AWS S3 offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. 
+ +Threshold value|Action +|---|--- +| > 0 | It triggers the offloading operation if the topic storage reaches its threshold. +| = 0|It causes a broker to offload data as soon as possible. +| < 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace-level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](/tools/pulsar-admin/) command. + +#### Example + +This example sets the AWS S3 offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). + +::: + +### Configure AWS S3 offloader to run manually + +For individual topics, you can trigger the AWS S3 offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to AWS S3 until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the AWS S3 offloader to run manually using pulsar-admin. 
+ + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). + + ::: + +- This example checks the AWS S3 offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the AWS S3 offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). 
+ + ::: + +## Tutorial + +For the complete and step-by-step instructions on how to use the AWS S3 offloader with Pulsar, see [here](https://hub.streamnative.io/offloaders/aws-s3/2.5.1#usage). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/tiered-storage-azure.md b/site2/website/versioned_docs/version-2.10.x/tiered-storage-azure.md new file mode 100644 index 0000000000000..e65356355ccc2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/tiered-storage-azure.md @@ -0,0 +1,264 @@ +--- +id: tiered-storage-azure +title: Use Azure BlobStore offloader with Pulsar +sidebar_label: "Azure BlobStore offloader" +original_id: tiered-storage-azure +--- + +This chapter guides you through every step of installing and configuring the Azure BlobStore offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the Azure BlobStore offloader. + +### Prerequisite + +- Pulsar: 2.6.2 or later versions + +### Step + +This example uses Pulsar 2.6.2. + +1. Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.6.2/apache-pulsar-2.6.2-bin.tar.gz) + + * Download from the Pulsar [downloads page](/download) + + * Use [wget](https://www.gnu.org/software/wget): + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.6.2/apache-pulsar-2.6.2-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.6.2/apache-pulsar-offloaders-2.6.2-bin.tar.gz + tar xvfz apache-pulsar-offloaders-2.6.2-bin.tar.gz + + ``` + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. 
+ + ``` + + mv apache-pulsar-offloaders-2.6.2/offloaders apache-pulsar-2.6.2/offloaders + + ls offloaders + + ``` + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/), [GCS](https://cloud.google.com/storage/) and [Azure](https://portal.azure.com/#home) for long term storage. + + ``` + + tiered-storage-file-system-2.6.2.nar + tiered-storage-jcloud-2.6.2.nar + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to Azure BlobStore, you need to configure some properties of the Azure BlobStore offload driver. + +::: + +Besides, you can also configure the Azure BlobStore offloader to run it automatically or trigger it manually. + +### Configure Azure BlobStore offloader driver + +You can configure the Azure BlobStore offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + Required configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name | azureblob + `offloadersDirectory` | Offloader directory | offloaders + `managedLedgerOffloadBucket` | Bucket | pulsar-topic-offload + +- **Optional** configurations are as below. 
+ + Optional | Description | Example value + |---|---|--- + `managedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `managedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in Azure BlobStore must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +managedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Authentication (required) + +To be able to access Azure BlobStore, you need to authenticate with Azure BlobStore. + +* Set the environment variables `AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` in `conf/pulsar_env.sh`. + + "export" is important so that the variables are made available in the environment of spawned processes. + + ```bash + + export AZURE_STORAGE_ACCOUNT=ABC123456789 + export AZURE_STORAGE_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from Azure BlobStore in the configuration file `broker.conf` or `standalone.conf`. + +Configuration|Description|Default value +|---|---|--- +`managedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from Azure BlobStore store.|1 MB +`managedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to Azure BlobStore store. It **cannot** be smaller than 5 MB. |64 MB + +### Configure Azure BlobStore offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. 
+ +Threshold value|Action +|---|--- +| > 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace level offload policies. Each namespace can have its own offload policy. If you want to set an offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](/tools/pulsar-admin/) command. + +#### Example + +This example sets the Azure BlobStore offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). + +::: + +### Configure Azure BlobStore offloader to run manually + +For individual topics, you can trigger Azure BlobStore offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to Azure BlobStore until the threshold is no longer exceeded. Older segments are moved first. 
+ +#### Example + +- This example triggers the Azure BlobStore offloader to run manually using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). + + ::: + +- This example checks the Azure BlobStore offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the Azure BlobStore offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](/tools/pulsar-admin/). 
+ + ::: + diff --git a/site2/website/versioned_docs/version-2.10.x/tiered-storage-filesystem.md b/site2/website/versioned_docs/version-2.10.x/tiered-storage-filesystem.md new file mode 100644 index 0000000000000..fb39290ef8fd7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/tiered-storage-filesystem.md @@ -0,0 +1,631 @@ +--- +id: tiered-storage-filesystem +title: Use filesystem offloader with Pulsar +sidebar_label: "Filesystem offloader" +original_id: tiered-storage-filesystem +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This chapter guides you through every step of installing and configuring the filesystem offloader and using it with Pulsar. + +## Installation + +This section describes how to install the filesystem offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or higher versions + +### Step + +This example uses Pulsar 2.5.1. + +1. Download the Pulsar tarball using one of the following ways: + + * Download the Pulsar tarball from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download the Pulsar tarball from the Pulsar [download page](/download/) + + * Use the [wget](https://www.gnu.org/software/wget) command to download the Pulsar tarball. + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + + :::note + + * If you run Pulsar in a bare metal cluster, ensure that the `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you run Pulsar in Docker or deploy Pulsar using a Docker image (such as K8S and DCOS), you can use the `apachepulsar/pulsar-all` image. 
The `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + + :::note + + * If you run Pulsar in a bare metal cluster, ensure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you run Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image. The `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to filesystem, you need to configure some properties of the filesystem offloader driver. + +::: + +Besides, you can also configure the filesystem offloader to run it automatically or trigger it manually. + +### Configure filesystem offloader driver + +You can configure the filesystem offloader driver in the `broker.conf` or `standalone.conf` configuration file. + +````mdx-code-block + + + +- **Required** configurations are as below. + + Parameter | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | filesystem + `fileSystemURI` | Connection address, which is the URI to access the default Hadoop distributed file system. | hdfs://127.0.0.1:9000 + `offloadersDirectory` | Offloader directory | offloaders + `fileSystemProfilePath` | Hadoop profile path. The configuration file is stored in the Hadoop profile path. It contains various settings for Hadoop performance tuning. | ../conf/filesystem_offload_core_site.xml + + +- **Optional** configurations are as below. 
+ + Parameter| Description | Example value + |---|---|--- + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic.

    **Note**: it is not recommended to set this parameter in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended to set this parameter in the production environment.|5000 + +
    + + +- **Required** configurations are as below. + + Parameter | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | filesystem + `offloadersDirectory` | Offloader directory | offloaders + `fileSystemProfilePath` | NFS profile path. The configuration file is stored in the NFS profile path. It contains various settings for performance tuning. | ../conf/filesystem_offload_core_site.xml + +- **Optional** configurations are as below. + + Parameter| Description | Example value + |---|---|--- + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic.

    **Note**: it is not recommended to set this parameter in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended to set this parameter in the production environment.|5000 + +
    + +
    +```` + +### Run filesystem offloader automatically + +You can configure the namespace policy to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic storage reaches the threshold, an offload operation is triggered automatically. + +Threshold value|Action +|---|--- +| > 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the filesystem offloader does not work until the current segment is full. + +You can configure the threshold using CLI tools, such as pulsar-admin. + +#### Example + +This example sets the filesystem offloader threshold to 10 MB using pulsar-admin. + +```bash + +pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#set-offload-threshold). + +::: + +### Run filesystem offloader manually + +For individual topics, you can trigger the filesystem offloader manually using one of the following methods: + +- Use the REST endpoint. + +- Use CLI tools (such as pulsar-admin). + +To manually trigger the filesystem offloader via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are offloaded to the filesystem until the threshold is no longer exceeded. Older segments are offloaded first. 
+ +#### Example + +- This example manually runs the filesystem offloader using pulsar-admin. + + ```bash + + pulsar-admin topics offload --size-threshold 10M persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload). + + ::: + +- This example checks the filesystem offloader status using pulsar-admin. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the filesystem offloader to complete the job, add the `-w` flag. + + ```bash + + pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in the offloading operation, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload-status). + + ::: + +## Tutorial + +This section provides step-by-step instructions on how to use the filesystem offloader to move data from Pulsar to Hadoop Distributed File System (HDFS) or Network File system (NFS). + +````mdx-code-block + + + +To move data from Pulsar to HDFS, follow these steps. + +### Step 1: Prepare the HDFS environment + +This tutorial sets up a Hadoop single node cluster and uses Hadoop 3.2.1. + +:::tip + +For details about how to set up a Hadoop single node cluster, see [here](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html). + +::: + +1. Download and uncompress Hadoop 3.2.1. + + ``` + + wget https://mirrors.bfsu.edu.cn/apache/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz + + tar -zxvf hadoop-3.2.1.tar.gz -C $HADOOP_HOME + + ``` + +2. Configure Hadoop. + + ``` + + # $HADOOP_HOME/etc/hadoop/core-site.xml + + + fs.defaultFS + hdfs://localhost:9000 + + + + # $HADOOP_HOME/etc/hadoop/hdfs-site.xml + + + dfs.replication + 1 + + + + ``` + +3. Set passphraseless ssh. + + ``` + + # Now check that you can ssh to the localhost without a passphrase: + $ ssh localhost + # If you cannot ssh to localhost without a passphrase, execute the following commands + $ ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa + $ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + $ chmod 0600 ~/.ssh/authorized_keys + + ``` + +4. Start HDFS. 
+ + ``` + + # Don't execute this command repeatedly; doing so causes the clusterId of the datanode to become inconsistent with that of the namenode + $HADOOP_HOME/bin/hadoop namenode -format + $HADOOP_HOME/sbin/start-dfs.sh + + ``` + +5. Navigate to the [HDFS website](http://localhost:9870/). + + You can see the **Overview** page. + + ![](/assets/FileSystem-1.png) + + 1. At the top navigation bar, click **Datanodes** to check DataNode information. + + ![](/assets/FileSystem-2.png) + + 2. Click **HTTP Address** to get more detailed information about localhost:9866. + + As can be seen below, the size of **Capacity Used** is 4 KB, which is the initial value. + + ![](/assets/FileSystem-3.png) + +### Step 2: Install the filesystem offloader + +For details, see [installation](#installation). + +### Step 3: Configure the filesystem offloader + +As indicated in the [configuration](#configuration) section, you need to configure some properties for the filesystem offloader driver before using it. This tutorial assumes that you have configured the filesystem offloader driver as below and run Pulsar in **standalone** mode. + +Set the following configurations in the `conf/standalone.conf` file. + +```conf + +managedLedgerOffloadDriver=filesystem +fileSystemURI=hdfs://127.0.0.1:9000 +fileSystemProfilePath=../conf/filesystem_offload_core_site.xml + +``` + +:::note + +For testing purposes, you can set the following two configurations to speed up ledger rollover, but it is not recommended that you set them in the production environment. + +::: + +``` + +managedLedgerMinLedgerRolloverTimeMinutes=1 +managedLedgerMaxEntriesPerLedger=100 + +``` + + + + +:::note + +In this section, it is assumed that you have enabled NFS service and set the shared path of your NFS service. In this section, `/Users/test` is used as the shared path of NFS service. + +::: + +To offload data to NFS, follow these steps. 
+ +### Step 1: Install the filesystem offloader + +For details, see [installation](#installation). + +### Step 2: Mount your NFS to your local filesystem + +This example mounts */Users/pulsar_nfs* to */Users/test*. + +``` + +mount -e 192.168.0.103:/Users/test/Users/pulsar_nfs + +``` + +### Step 3: Configure the filesystem offloader driver + +As indicated in the [configuration](#configuration) section, you need to configure some properties for the filesystem offloader driver before using it. This tutorial assumes that you have configured the filesystem offloader driver as below and run Pulsar in **standalone** mode. + +1. Set the following configurations in the `conf/standalone.conf` file. + + ```conf + + managedLedgerOffloadDriver=filesystem + fileSystemProfilePath=../conf/filesystem_offload_core_site.xml + + ``` + +2. Modify the *filesystem_offload_core_site.xml* as follows. + + ``` + + + fs.defaultFS + file:/// + + + + hadoop.tmp.dir + file:///Users/pulsar_nfs + + + + io.file.buffer.size + 4096 + + + + io.seqfile.compress.blocksize + 1000000 + + + + io.seqfile.compression.type + BLOCK + + + + io.map.index.interval + 128 + + + ``` + + + + +```` + +### Step 4: Offload data from BookKeeper to filesystem + +Execute the following commands in the directory where you downloaded the Pulsar tarball. For example, `~/path/to/apache-pulsar-2.5.1`. + +1. Start Pulsar standalone. + + ``` + + bin/pulsar standalone -a 127.0.0.1 + + ``` + +2. To ensure the data generated is not deleted immediately, it is recommended to set the [retention policy](cookbooks-retention-expiry.md#retention-policies), which can be either a **size** limit or a **time** limit. The larger the value you set for the retention policy, the longer the data can be retained. 
+ + ``` + + bin/pulsar-admin namespaces set-retention public/default --size 100M --time 2d + + ``` + + :::tip + + For more information about the `pulsarctl namespaces set-retention options` command, including flags, descriptions, default values, and shorthands, see [here](https://docs.streamnative.io/pulsarctl/v2.7.0.6/#-em-set-retention-em-). + + ::: + +3. Produce data using pulsar-client. + + ``` + + bin/pulsar-client produce -m "Hello FileSystem Offloader" -n 1000 public/default/fs-test + + ``` + +4. The offloading operation starts after a ledger rollover is triggered. To ensure that data is offloaded successfully, it is recommended that you wait until several ledger rollovers are triggered. In this case, you might need to wait for a second. You can check the ledger status using pulsar-admin. + + ``` + + bin/pulsar-admin topics stats-internal public/default/fs-test + + ``` + + **Output** + + The data of the ledger 696 is not offloaded. + + ``` + + { + "version": 1, + "creationDate": "2020-06-16T21:46:25.807+08:00", + "modificationDate": "2020-06-16T21:46:25.821+08:00", + "ledgers": [ + { + "ledgerId": 696, + "isOffloaded": false + } + ], + "cursors": {} + } + + ``` + +5. Wait a second and send more messages to the topic. + + ``` + + bin/pulsar-client produce -m "Hello FileSystem Offloader" -n 1000 public/default/fs-test + + ``` + +6. Check the ledger status using pulsar-admin. + + ``` + + bin/pulsar-admin topics stats-internal public/default/fs-test + + ``` + + **Output** + + The ledger 696 is rolled over. + + ``` + + { + "version": 2, + "creationDate": "2020-06-16T21:46:25.807+08:00", + "modificationDate": "2020-06-16T21:48:52.288+08:00", + "ledgers": [ + { + "ledgerId": 696, + "entries": 1001, + "size": 81695, + "isOffloaded": false + }, + { + "ledgerId": 697, + "isOffloaded": false + } + ], + "cursors": {} + } + + ``` + +7. Trigger the offloading operation manually using pulsar-admin. 
+ + ``` + + bin/pulsar-admin topics offload -s 0 public/default/fs-test + + ``` + + **Output** + + Data in ledgers before the ledger 697 is offloaded. + + ``` + + # offload info, the ledgers before 697 will be offloaded + Offload triggered for persistent://public/default/fs-test3 for messages before 697:0:-1 + + ``` + +8. Check the ledger status using pulsar-admin. + + ``` + + bin/pulsar-admin topics stats-internal public/default/fs-test + + ``` + + **Output** + + The data of the ledger 696 is offloaded. + + ``` + + { + "version": 4, + "creationDate": "2020-06-16T21:46:25.807+08:00", + "modificationDate": "2020-06-16T21:52:13.25+08:00", + "ledgers": [ + { + "ledgerId": 696, + "entries": 1001, + "size": 81695, + "isOffloaded": true + }, + { + "ledgerId": 697, + "isOffloaded": false + } + ], + "cursors": {} + } + + ``` + + And the **Capacity Used** is changed from 4 KB to 116.46 KB. + + ![](/assets/FileSystem-8.png) \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/tiered-storage-gcs.md b/site2/website/versioned_docs/version-2.10.x/tiered-storage-gcs.md new file mode 100644 index 0000000000000..df1b4f6fb7edb --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/tiered-storage-gcs.md @@ -0,0 +1,319 @@ +--- +id: tiered-storage-gcs +title: Use GCS offloader with Pulsar +sidebar_label: "GCS offloader" +original_id: tiered-storage-gcs +--- + +This chapter guides you through every step of installing and configuring the GCS offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the GCS offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or later versions + +### Step + +This example uses Pulsar 2.5.1. + +1. 
Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download from the Pulsar [download page](/download) + + * Use [wget](https://www.gnu.org/software/wget) + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8S and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + As shown in the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support GCS and AWS S3 for long term storage. + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + +## Configuration + +:::note + +Before offloading data from BookKeeper to GCS, you need to configure some properties of the GCS offloader driver. + +::: + +Besides, you can also configure the GCS offloader to run it automatically or trigger it manually. + +### Configure GCS offloader driver + +You can configure GCS offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. 
+ + **Required** configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver`|Offloader driver name, which is case-insensitive.|google-cloud-storage + `offloadersDirectory`|Offloader directory|offloaders + `gcsManagedLedgerOffloadBucket`|Bucket|pulsar-topic-offload + `gcsManagedLedgerOffloadRegion`|Bucket region|europe-west3 + `gcsManagedLedgerOffloadServiceAccountKeyFile`|Authentication |/Users/user-name/Downloads/project-804d5e6a6f33.json + +- **Optional** configurations are as below. + + Optional configuration|Description|Example value + |---|---|--- + `gcsManagedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `gcsManagedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic.|2 + `managedLedgerMaxEntriesPerLedger`|The max number of entries to append to a ledger before triggering a rollover.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in GCS **must** be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you can not nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +gcsManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Bucket region (required) + +Bucket region is the region where a bucket is located. If a bucket region is not specified, the **default** region (`us multi-regional location`) is used. + +:::tip + +For more information about bucket location, see [here](https://cloud.google.com/storage/docs/bucket-locations). + +::: + +##### Example + +This example sets the bucket region as _europe-west3_. 
+ +``` + +gcsManagedLedgerOffloadRegion=europe-west3 + +``` + +#### Authentication (required) + +To enable a broker access GCS, you need to configure `gcsManagedLedgerOffloadServiceAccountKeyFile` in the configuration file `broker.conf`. + +`gcsManagedLedgerOffloadServiceAccountKeyFile` is +a JSON file, containing GCS credentials of a service account. + +##### Example + +To generate service account credentials or view the public credentials that you've already generated, follow the following steps. + +1. Navigate to the [Service accounts page](https://console.developers.google.com/iam-admin/serviceaccounts). + +2. Select a project or create a new one. + +3. Click **Create service account**. + +4. In the **Create service account** window, type a name for the service account and select **Furnish a new private key**. + + If you want to [grant G Suite domain-wide authority](https://developers.google.com/identity/protocols/OAuth2ServiceAccount#delegatingauthority) to the service account, select **Enable G Suite Domain-wide Delegation**. + +5. Click **Create**. + + :::note + + Make sure the service account you create has permission to operate GCS, you need to assign **Storage Admin** permission to your service account [here](https://cloud.google.com/storage/docs/access-control/iam). + + ::: + +6. You can get the following information and set this in `broker.conf`. + + ```conf + + gcsManagedLedgerOffloadServiceAccountKeyFile="/Users/user-name/Downloads/project-804d5e6a6f33.json" + + ``` + + :::tip + + - For more information about how to create `gcsManagedLedgerOffloadServiceAccountKeyFile`, see [here](https://support.google.com/googleapi/answer/6158849). + - For more information about Google Cloud IAM, see [here](https://cloud.google.com/storage/docs/access-control/iam). + + ::: + +#### Size of block read/write + +You can configure the size of a request sent to or read from GCS in the configuration file `broker.conf`. 
+ +Configuration|Description +|---|--- +`gcsManagedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from GCS.

    The **default** value is 1 MB. +`gcsManagedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to GCS.

    It **cannot** be smaller than 5 MB.

    The **default** value is 64 MB. + +### Configure GCS offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offload operation is triggered automatically. + +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the command [`pulsar-admin namespaces set-offload-policies options`](/tools/pulsar-admin/) command. + +#### Example + +This example sets the GCS offloader threshold size to 10 MB using pulsar-admin. + +```bash + +pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#set-offload-threshold). + +::: + +### Configure GCS offloader to run manually + +For individual topics, you can trigger GCS offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). 
+ + To trigger the GCS via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to GCS until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the GCS offloader to run manually using pulsar-admin with the command `pulsar-admin topics offload (topic-name) (threshold)`. + + ```bash + + pulsar-admin topics offload persistent://my-tenant/my-namespace/topic1 10M + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload). + + ::: + +- This example checks the GCS offloader status using pulsar-admin with the command `pulsar-admin topics offload-status options`. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for GCS to complete the job, add the `-w` flag. + + ```bash + + pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload-status). + + ::: + +## Tutorial + +For the complete and step-by-step instructions on how to use the GCS offloader with Pulsar, see [here](https://hub.streamnative.io/offloaders/gcs/2.5.1#usage). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/tiered-storage-overview.md b/site2/website/versioned_docs/version-2.10.x/tiered-storage-overview.md new file mode 100644 index 0000000000000..c635034f463b4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/tiered-storage-overview.md @@ -0,0 +1,52 @@ +--- +id: tiered-storage-overview +title: Overview of tiered storage +sidebar_label: "Overview" +original_id: tiered-storage-overview +--- + +Pulsar's **Tiered Storage** feature allows older backlog data to be moved from BookKeeper to long term and cheaper storage, while still allowing clients to access the backlog as if nothing has changed. + +* Tiered storage uses [Apache jclouds](https://jclouds.apache.org) to support [Amazon S3](https://aws.amazon.com/s3/) and [GCS (Google Cloud Storage)](https://cloud.google.com/storage/) for long term storage. + + With jclouds, it is easy to add support for more [cloud storage providers](https://jclouds.apache.org/reference/providers/#blobstore-providers) in the future. + + :::tip + + - For more information about how to use the AWS S3 offloader with Pulsar, see [here](tiered-storage-aws.md). 
+ + - For more information about how to use the GCS offloader with Pulsar, see [here](tiered-storage-gcs.md). + + ::: + +* Tiered storage uses [Apache Hadoop](http://hadoop.apache.org/) to support filesystems for long term storage. + + With Hadoop, it is easy to add support for more filesystems in the future. + + :::tip + + For more information about how to use the filesystem offloader with Pulsar, see [here](tiered-storage-filesystem.md). + + ::: + +## When to use tiered storage? + +Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. + +For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm, you can rerun it against your full user history. + +## How does tiered storage work? + +A topic in Pulsar is backed by a **log**, known as a **managed ledger**. This log is composed of an ordered list of segments. Pulsar only writes to the final segment of the log. All previous segments are sealed. The data within the segment is immutable. This is known as a **segment oriented architecture**. + +![Tiered storage](/assets/pulsar-tiered-storage.png "Tiered Storage") + +The tiered storage offloading mechanism takes advantage of segment oriented architecture. When offloading is requested, the segments of the log are copied one-by-one to tiered storage. All segments of the log (apart from the current segment) written to tiered storage can be offloaded. + +Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper, it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. 
+ +Before offloading ledgers to long term storage, you need to configure buckets, credentials, and other properties for the cloud storage service. Additionally, Pulsar uses multi-part objects to upload the segment data and brokers may crash while uploading the data. It is recommended that you add a life cycle rule for your bucket to expire incomplete multi-part upload after a day or two days to avoid getting charged for incomplete uploads. Moreover, you can trigger the offloading operation manually (via REST API or CLI) or automatically (via CLI). + +After offloading ledgers to long term storage, you can still query data in the offloaded ledgers with Pulsar SQL. + +For more information about tiered storage for Pulsar topics, see [here](https://github.com/apache/pulsar/wiki/PIP-17:-Tiered-storage-for-Pulsar-topics). diff --git a/site2/website/versioned_docs/version-2.10.x/transaction-api.md b/site2/website/versioned_docs/version-2.10.x/transaction-api.md new file mode 100644 index 0000000000000..25a99479639bd --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/transaction-api.md @@ -0,0 +1,172 @@ +--- +id: transactions-api +title: Transactions API +sidebar_label: "Transactions API" +original_id: transactions-api +--- + +All messages in a transaction are available only to consumers after the transaction has been committed. If a transaction has been aborted, all the writes and acknowledgments in this transaction roll back. + +## Prerequisites +1. To enable transactions in Pulsar, you need to configure the parameter in `broker.conf` file or `standalone.conf` file. + +``` + +transactionCoordinatorEnabled=true + +``` + +2. Initialize transaction coordinator metadata, so the transaction coordinators can leverage advantages of the partitioned topic, such as load balance. 
+ +``` + +bin/pulsar initialize-transaction-coordinator-metadata -cs 127.0.0.1:2181 -c standalone + +``` + +After initializing transaction coordinator metadata, you can use the transactions API. The following APIs are available. + +## Initialize Pulsar client + +You can enable transaction for transaction client and initialize transaction coordinator client. + +``` + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .enableTransaction(true) + .build(); + +``` + +## Start transactions +You can start transaction in the following way. + +``` + +Transaction txn = pulsarClient + .newTransaction() + .withTransactionTimeout(5, TimeUnit.MINUTES) + .build() + .get(); + +``` + +## Produce transaction messages + +A transaction parameter is required when producing new transaction messages. The semantic of the transaction messages in Pulsar is `read-committed`, so the consumer cannot receive the ongoing transaction messages before the transaction is committed. + +``` + +producer.newMessage(txn).value("Hello Pulsar Transaction".getBytes()).sendAsync(); + +``` + +## Acknowledge the messages with the transaction + +The transaction acknowledgement requires a transaction parameter. The transaction acknowledgement marks the messages state to pending-ack state. When the transaction is committed, the pending-ack state becomes ack state. If the transaction is aborted, the pending-ack state becomes unack state. + +``` + +Message message = consumer.receive(); +consumer.acknowledgeAsync(message.getMessageId(), txn); + +``` + +## Commit transactions + +When the transaction is committed, consumers receive the transaction messages and the pending-ack state becomes ack state. + +``` + +txn.commit().get(); + +``` + +## Abort transaction + +When the transaction is aborted, the transaction acknowledgement is canceled and the pending-ack messages are redelivered. 
+ +``` + +txn.abort().get(); + +``` + +### Example +The following example shows how messages are processed in transaction. + +``` + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl(getPulsarServiceList().get(0).getBrokerServiceUrl()) + .statsInterval(0, TimeUnit.SECONDS) + .enableTransaction(true) + .build(); + +String sourceTopic = "public/default/source-topic"; +String sinkTopic = "public/default/sink-topic"; + +Producer sourceProducer = pulsarClient + .newProducer(Schema.STRING) + .topic(sourceTopic) + .create(); +sourceProducer.newMessage().value("hello pulsar transaction").sendAsync(); + +Consumer sourceConsumer = pulsarClient + .newConsumer(Schema.STRING) + .topic(sourceTopic) + .subscriptionName("test") + .subscriptionType(SubscriptionType.Shared) + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + +Producer sinkProducer = pulsarClient + .newProducer(Schema.STRING) + .topic(sinkTopic) + .sendTimeout(0, TimeUnit.MILLISECONDS) + .create(); + +Transaction txn = pulsarClient + .newTransaction() + .withTransactionTimeout(5, TimeUnit.MINUTES) + .build() + .get(); + +// source message acknowledgement and sink message produce belong to one transaction, +// they are combined into an atomic operation. +Message message = sourceConsumer.receive(); +sourceConsumer.acknowledgeAsync(message.getMessageId(), txn); +sinkProducer.newMessage(txn).value("sink data").sendAsync(); + +txn.commit().get(); + +``` + +## Enable batch messages in transactions + +To enable batch messages in transactions, you need to enable the batch index acknowledgement feature. The transaction acks check whether the batch index acknowledgement conflicts. + +To enable batch index acknowledgement, you need to set `acknowledgmentAtBatchIndexLevelEnabled` to `true` in the `broker.conf` or `standalone.conf` file. 
+ +``` + +acknowledgmentAtBatchIndexLevelEnabled=true + +``` + +And then you need to call the `enableBatchIndexAcknowledgment(true)` method in the consumer builder. + +``` + +Consumer sinkConsumer = pulsarClient + .newConsumer() + .topic(transferTopic) + .subscriptionName("sink-topic") + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscriptionType(SubscriptionType.Shared) + .enableBatchIndexAcknowledgment(true) // enable batch index acknowledgement + .subscribe(); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/transaction-guarantee.md b/site2/website/versioned_docs/version-2.10.x/transaction-guarantee.md new file mode 100644 index 0000000000000..9db2d254e159f --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/transaction-guarantee.md @@ -0,0 +1,17 @@ +--- +id: transactions-guarantee +title: Transactions Guarantee +sidebar_label: "Transactions Guarantee" +original_id: transactions-guarantee +--- + +Pulsar transactions support the following guarantees. + +## Atomic multi-partition writes and multi-subscription acknowledges +Transactions enable atomic writes to multiple topics and partitions. A batch of messages in a transaction can be received from, produced to, and acknowledged by many partitions. All the operations involved in a transaction succeed or fail as a single unit. + +## Read transactional message +All the messages in a transaction are available to consumers only after the transaction is committed. + +## Acknowledge transactional message +A message is acknowledged successfully only once by a consumer under the subscription when acknowledging the message with the transaction ID. 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/txn-how.md b/site2/website/versioned_docs/version-2.10.x/txn-how.md new file mode 100644 index 0000000000000..add072448aeb3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/txn-how.md @@ -0,0 +1,151 @@ +--- +id: txn-how +title: How transactions work? +sidebar_label: "How transactions work?" +original_id: txn-how +--- + +This section describes transaction components and how the components work together. For the complete design details, see [PIP-31: Transactional Streaming](https://docs.google.com/document/d/145VYp09JKTw9jAT-7yNyFU255FptB2_B2Fye100ZXDI/edit#heading=h.bm5ainqxosrx). + +## Key concept + +It is important to know the following key concepts, which is a prerequisite for understanding how transactions work. + +### Transaction coordinator + +The transaction coordinator (TC) is a module running inside a Pulsar broker. + +* It maintains the entire life cycle of transactions and prevents a transaction from getting into an incorrect status. + +* It handles transaction timeout, and ensures that the transaction is aborted after a transaction timeout. + +### Transaction log + +All the transaction metadata persists in the transaction log. The transaction log is backed by a Pulsar topic. If the transaction coordinator crashes, it can restore the transaction metadata from the transaction log. + +The transaction log stores the transaction status rather than actual messages in the transaction (the actual messages are stored in the actual topic partitions). + +### Transaction buffer + +Messages produced to a topic partition within a transaction are stored in the transaction buffer (TB) of that topic partition. The messages in the transaction buffer are not visible to consumers until the transactions are committed. The messages in the transaction buffer are discarded when the transactions are aborted. 
+ +Transaction buffer stores all ongoing and aborted transactions in memory. All messages are sent to the actual partitioned Pulsar topics. After transactions are committed, the messages in the transaction buffer are materialized (visible) to consumers. When the transactions are aborted, the messages in the transaction buffer are discarded. + +### Transaction ID + +Transaction ID (TxnID) identifies a unique transaction in Pulsar. The transaction ID is 128-bit. The highest 16 bits are reserved for the ID of the transaction coordinator, and the remaining bits are used for monotonically increasing numbers in each transaction coordinator. It is easy to locate the transaction crash with the TxnID. + +### Pending acknowledge state + +Pending acknowledge state maintains message acknowledgments within a transaction before a transaction completes. If a message is in the pending acknowledge state, the message cannot be acknowledged by other transactions until the message is removed from the pending acknowledge state. + +The pending acknowledge state is persisted to the pending acknowledge log (cursor ledger). A new broker can restore the state from the pending acknowledge log to ensure the acknowledgement is not lost. + +## Data flow + +At a high level, the data flow can be split into several steps: + +1. Begin a transaction. + +2. Publish messages with a transaction. + +3. Acknowledge messages with a transaction. + +4. End a transaction. + +To help you debug or tune the transaction for better performance, review the following diagrams and descriptions. + +### 1. Begin a transaction + +Before introducing the transaction in Pulsar, a producer is created and then messages are sent to brokers and stored in data logs. + +![](/assets/txn-3.png) + +Let’s walk through the steps for _beginning a transaction_. + +| Step | Description | +| --- | --- | +| 1.1 | The first step is that the Pulsar client finds the transaction coordinator. 
| +| 1.2 | The transaction coordinator allocates a transaction ID for the transaction. In the transaction log, the transaction is logged with its transaction ID and status (OPEN), which ensures the transaction status is persisted regardless of transaction coordinator crashes. | +| 1.3 | The transaction log sends the result of persisting the transaction ID to the transaction coordinator. | +| 1.4 | After the transaction status entry is logged, the transaction coordinator brings the transaction ID back to the Pulsar client. | + +### 2. Publish messages with a transaction + +In this stage, the Pulsar client enters a transaction loop, repeating the `consume-process-produce` operation for all the messages that comprise the transaction. This is a long phase and is potentially composed of multiple produce and acknowledgement requests. + +![](/assets/txn-4.png) + +Let’s walk through the steps for _publishing messages with a transaction_. + +| Step | Description | +| --- | --- | +| 2.1.1 | Before the Pulsar client produces messages to a new topic partition, it sends a request to the transaction coordinator to add the partition to the transaction. | +| 2.1.2 | The transaction coordinator logs the partition changes of the transaction into the transaction log for durability, which ensures the transaction coordinator knows all the partitions that a transaction is handling. The transaction coordinator can commit or abort changes on each partition at the end-partition phase. | +| 2.1.3 | The transaction log sends the result of logging the new partition (used for producing messages) to the transaction coordinator. | +| 2.1.4 | The transaction coordinator sends the result of adding a new produced partition to the transaction. | +| 2.2.1 | The Pulsar client starts producing messages to partitions. The flow of this part is the same as the normal flow of producing messages except that the batch of messages produced by a transaction contains transaction IDs. 
| +| 2.2.2 | The broker writes messages to a partition. | + +### 3. Acknowledge messages with a transaction + +In this phase, the Pulsar client sends a request to the transaction coordinator and a new subscription is acknowledged as a part of a transaction. + +![](/assets/txn-5.png) + +Let’s walk through the steps for _acknowledging messages with a transaction_. + +| Step | Description | +| --- | --- | +| 3.1.1 | The Pulsar client sends a request to add an acknowledged subscription to the transaction coordinator. | +| 3.1.2 | The transaction coordinator logs the addition of subscription, which ensures that it knows all subscriptions handled by a transaction and can commit or abort changes on each subscription at the end phase. | +| 3.1.3 | The transaction log sends the result of logging the new partition (used for acknowledging messages) to the transaction coordinator. | +| 3.1.4 | The transaction coordinator sends the result of adding the new acknowledged partition to the transaction. | +| 3.2 | The Pulsar client acknowledges messages on the subscription. The flow of this part is the same as the normal flow of acknowledging messages except that the acknowledged request carries a transaction ID. | +| 3.3 | The broker receiving the acknowledgement request checks if the acknowledgment belongs to a transaction or not. | + +### 4. End a transaction + +At the end of a transaction, the Pulsar client decides to commit or abort the transaction. The transaction can be aborted when a conflict is detected on acknowledging messages. + +#### 4.1 End transaction request + +When the Pulsar client finishes a transaction, it issues an end transaction request. + +![](/assets/txn-6.png) + +Let’s walk through the steps for _ending the transaction_. + +| Step | Description | +| --- | --- | +| 4.1.1 | The Pulsar client issues an end transaction request (with a field indicating whether the transaction is to be committed or aborted) to the transaction coordinator. 
| +| 4.1.2 | The transaction coordinator writes a COMMITTING or ABORTING message to its transaction log. | +| 4.1.3 | The transaction log sends the result of logging the committing or aborting status. | + +#### 4.2 Finalize a transaction + +The transaction coordinator starts the process of committing or aborting messages to all the partitions involved in this transaction. + +![](/assets/txn-7.png) + +Let’s walk through the steps for _finalizing a transaction_. + +| Step | Description | +| --- | --- | +| 4.2.1 | The transaction coordinator commits transactions on subscriptions and commits transactions on partitions at the same time. | +| 4.2.2 | The broker (produce) writes produced committed markers to the actual partitions. At the same time, the broker (ack) writes acked committed marks to the subscription pending ack partitions. | +| 4.2.3 | The data log sends the result of writing produced committed marks to the broker. At the same time, pending ack data log sends the result of writing acked committed marks to the broker. The cursor moves to the next position. | + +#### 4.3 Mark a transaction as COMMITTED or ABORTED + +The transaction coordinator writes the final transaction status to the transaction log to complete the transaction. + +![](/assets/txn-8.png) + +Let’s walk through the steps for _marking a transaction as COMMITTED or ABORTED_. + +| Step | Description | +| --- | --- | +| 4.3.1 | After all produced messages and acknowledgements to all partitions involved in this transaction have been successfully committed or aborted, the transaction coordinator writes the final COMMITTED or ABORTED transaction status messages to its transaction log, indicating that the transaction is complete. All the messages associated with the transaction in its transaction log can be safely removed. | +| 4.3.2 | The transaction log sends the result of the committed transaction to the transaction coordinator. 
| +| 4.3.3 | The transaction coordinator sends the result of the committed transaction to the Pulsar client. | diff --git a/site2/website/versioned_docs/version-2.10.x/txn-monitor.md b/site2/website/versioned_docs/version-2.10.x/txn-monitor.md new file mode 100644 index 0000000000000..08e4a1be32036 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/txn-monitor.md @@ -0,0 +1,10 @@ +--- +id: txn-monitor +title: How to monitor transactions? +sidebar_label: "How to monitor transactions?" +original_id: txn-monitor +--- + +You can monitor the status of the transactions in Prometheus and Grafana using the [transaction metrics](reference-metrics.md#pulsar-transaction). + +For how to configure Prometheus and Grafana, see [here](deploy-monitoring.md). diff --git a/site2/website/versioned_docs/version-2.10.x/txn-use.md b/site2/website/versioned_docs/version-2.10.x/txn-use.md new file mode 100644 index 0000000000000..b36721a6c60d6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/txn-use.md @@ -0,0 +1,105 @@ +--- +id: txn-use +title: How to use transactions? +sidebar_label: "How to use transactions?" +original_id: txn-use +--- + +## Transaction API + +The transaction feature is primarily a server-side and protocol-level feature. You can use the transaction feature via the [transaction API](/api/admin/), which is available in **Pulsar 2.8.0 or later**. + +To use the transaction API, you do not need any additional settings in the Pulsar client. **By default**, transactions are **disabled**. + +Currently, the transaction API is only available for **Java** clients. Support for other language clients will be added in future releases. + +## Quick start + +This section provides an example of how to use the transaction API to send and receive messages in a Java client. + +1. Start Pulsar 2.8.0 or later. + +2. Enable transaction. + + Change the configuration in the `broker.conf` file. 
+ + ``` + + transactionCoordinatorEnabled=true + + ``` + + If you want to enable batch messages in transactions, follow the steps below. + + Set `acknowledgmentAtBatchIndexLevelEnabled` to `true` in the `broker.conf` or `standalone.conf` file. + + ``` + + acknowledgmentAtBatchIndexLevelEnabled=true + + ``` + +3. Initialize transaction coordinator metadata. + + The transaction coordinator can leverage the advantages of partitioned topics (such as load balance). + + **Input** + + ``` + + bin/pulsar initialize-transaction-coordinator-metadata -cs 127.0.0.1:2181 -c standalone + + ``` + + **Output** + + ``` + + Transaction coordinator metadata setup success + + ``` + +4. Initialize a Pulsar client. + + ``` + + PulsarClient client = PulsarClient.builder() + + .serviceUrl("pulsar://localhost:6650") + + .enableTransaction(true) + + .build(); + + ``` + +Now you can start using the transaction API to send and receive messages. Below is an example of a `consume-process-produce` application written in Java. + +![](/assets/txn-9.png) + +Let’s walk through this example step by step. + +| Step | Description | +| --- | --- | +| 1. Start a transaction. | The application opens a new transaction by calling PulsarClient.newTransaction. It specifies the transaction timeout as 1 minute. If the transaction is not committed within 1 minute, the transaction is automatically aborted. | +| 2. Receive messages from topics. | The application creates two normal consumers to receive messages from topic input-topic-1 and input-topic-2 respectively. | +| 3. Publish messages to topics with the transaction. | The application creates two producers to produce the resulting messages to the output topic _output-topic-1_ and _output-topic-2_ respectively. The application applies the processing logic and generates two output messages. The application sends those two output messages as part of the transaction opened in the first step via Producer.newMessage(Transaction). | +| 4. 
Acknowledge the messages with the transaction. | In the same transaction, the application acknowledges the two input messages. | +| 5. Commit the transaction. | The application commits the transaction by calling Transaction.commit() on the open transaction. The commit operation ensures the two input messages are marked as acknowledged and the two output messages are written successfully to the output topics. | + +[1] Example of enabling batch messages ack in transactions in the consumer builder. + +``` + +Consumer sinkConsumer = pulsarClient + .newConsumer() + .topic(transferTopic) + .subscriptionName("sink-topic") + +.subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscriptionType(SubscriptionType.Shared) + .enableBatchIndexAcknowledgment(true) // enable batch index acknowledgement + .subscribe(); + +``` + diff --git a/site2/website/versioned_docs/version-2.10.x/txn-what.md b/site2/website/versioned_docs/version-2.10.x/txn-what.md new file mode 100644 index 0000000000000..f8bf3eb7e56b8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/txn-what.md @@ -0,0 +1,60 @@ +--- +id: txn-what +title: What are transactions? +sidebar_label: "What are transactions?" +original_id: txn-what +--- + +Transactions strengthen the message delivery semantics of Apache Pulsar and [processing guarantees of Pulsar Functions](functions-overview.md#processing-guarantees). The Pulsar Transaction API supports atomic writes and acknowledgments across multiple topics. + +Transactions allow: + +- A producer to send a batch of messages to multiple topics where all messages in the batch are eventually visible to any consumer, or none are ever visible to consumers. + +- End-to-end exactly-once semantics (execute a `consume-process-produce` operation exactly once). + +## Transaction semantics + +Pulsar transactions have the following semantics: + +* All operations within a transaction are committed as a single unit. 
+ + * Either all messages are committed, or none of them are. + + * Each message is written or processed exactly once, without data loss or duplicates (even in the event of failures). + + * If a transaction is aborted, all the writes and acknowledgments in this transaction rollback. + +* A group of messages in a transaction can be received from, produced to, and acknowledged by multiple partitions. + + * Consumers are only allowed to read committed (acked) messages. In other words, the broker does not deliver transactional messages which are part of an open transaction or messages which are part of an aborted transaction. + + * Message writes across multiple partitions are atomic. + + * Message acks across multiple subscriptions are atomic. A message is acked successfully only once by a consumer under the subscription when acknowledging the message with the transaction ID. + +## Transactions and stream processing + +Stream processing on Pulsar is a `consume-process-produce` operation on Pulsar topics: + +* `Consume`: a source operator that runs a Pulsar consumer reads messages from one or multiple Pulsar topics. + +* `Process`: a processing operator transforms the messages. + +* `Produce`: a sink operator that runs a Pulsar producer writes the resulting messages to one or multiple Pulsar topics. + +![](/assets/txn-2.png) + +Pulsar transactions support end-to-end exactly-once stream processing, which means messages are not lost from a source operator and messages are not duplicated to a sink operator. + +## Use case + +Prior to Pulsar 2.8.0, there was no easy way to build stream processing applications with Pulsar to achieve exactly-once processing guarantees. 
With the transaction introduced in Pulsar 2.8.0, the following services support exactly-once semantics: + +* [Pulsar Flink connector](https://flink.apache.org/2021/01/07/pulsar-flink-connector-270.html) + + Prior to Pulsar 2.8.0, if you wanted to build stream applications using Pulsar and Flink, the Pulsar Flink connector only supported an exactly-once source connector and an at-least-once sink connector, which meant that the highest end-to-end processing guarantee was at-least-once. As a result, there was a possibility that streaming applications produced duplicated messages to the resulting topics in Pulsar. + + With the transaction introduced in Pulsar 2.8.0, the Pulsar Flink sink connector can support exactly-once semantics by implementing the designated `TwoPhaseCommitSinkFunction` and hooking up the Flink sink message lifecycle with Pulsar transaction API. + +* Support for Pulsar Functions and other connectors will be added in future releases. diff --git a/site2/website/versioned_docs/version-2.10.x/txn-why.md b/site2/website/versioned_docs/version-2.10.x/txn-why.md new file mode 100644 index 0000000000000..e7273379f7949 --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/txn-why.md @@ -0,0 +1,45 @@ +--- +id: txn-why +title: Why transactions? +sidebar_label: "Why transactions?" +original_id: txn-why +--- + +Pulsar transactions (txn) enable event streaming applications to consume, process, and produce messages in one atomic operation. The reasons for developing this feature are summarized below. + +## Demand of stream processing + +The demand for stream processing applications with stronger processing guarantees has grown along with the rise of stream processing. For example, in the financial industry, financial institutions use stream processing engines to process debits and credits for users. This type of use case requires that every message is processed exactly once, without exception. 
+ +In other words, if a stream processing application consumes message A and +produces the result as a message B (B = f(A)), then exactly-once processing +guarantee means that A can only be marked as consumed if and only if B is +successfully produced, and vice versa. + +![](/assets/txn-1.png) + +The Pulsar transactions API strengthens the message delivery semantics and the processing guarantees for stream processing. It enables stream processing applications to consume, process, and produce messages in one atomic operation. That means, a batch of messages in a transaction can be received from, produced to and acknowledged by many topic partitions. All the operations involved in a transaction succeed or fail as one single unit. + +## Limitation of idempotent producer + +Avoiding data loss or duplication can be achieved by using the Pulsar idempotent producer, but it does not provide guarantees for writes across multiple partitions. + +In Pulsar, the highest level of message delivery guarantee is using an [idempotent producer](concepts-messaging.md#producer-idempotency) with the exactly once semantic at one single partition, that is, each message is persisted exactly once without data loss and duplication. However, there are some limitations in this solution: + +- Due to the monotonic increasing sequence ID, this solution only works on a single partition and within a single producer session (that is, for producing one message), so there is no atomicity when producing multiple messages to one or multiple partitions. 
+ + In this case, if there are some failures (for example, client / broker / bookie crashes, network failure, and more) in the process of producing and receiving messages, messages are re-processed and re-delivered, which may cause data loss or data duplication: + + - For the producer: if the producer retries sending messages, some messages are persisted multiple times; if the producer does not retry sending messages, some messages are persisted once and other messages are lost. + + - For the consumer: since the consumer does not know whether the broker has received messages or not, the consumer may not retry sending acks, which causes it to receive duplicate messages. + +- Similarly, Pulsar Functions only guarantee exactly-once semantics for an idempotent function on a single event; they do not guarantee that processing multiple events or producing multiple results happens exactly once. + + For example, if a function accepts multiple events and produces one result (for example, window function), the function may fail between producing the result and acknowledging the incoming messages, or even between acknowledging individual events, which causes all (or some) incoming messages to be re-delivered and reprocessed, and a new result is generated. + + However, many scenarios need atomic guarantees across multiple partitions and sessions. + +- Consumers need to rely on more mechanisms to acknowledge (ack) messages once. + + For example, consumers are required to store the MessageID along with its acked state. After the topic is unloaded, the subscription can recover the acked state of this MessageID in memory when the topic is loaded again. 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.10.x/window-functions-context.md b/site2/website/versioned_docs/version-2.10.x/window-functions-context.md new file mode 100644 index 0000000000000..f80fea57989ef --- /dev/null +++ b/site2/website/versioned_docs/version-2.10.x/window-functions-context.md @@ -0,0 +1,581 @@ +--- +id: window-functions-context +title: Window Functions Context +sidebar_label: "Window Functions: Context" +original_id: window-functions-context +--- + +Java SDK provides access to a **window context object** that can be used by a window function. This context object provides a wide variety of information and functionality for Pulsar window functions as below. + +- [Spec](#spec) + + * Names of all input topics and the output topic associated with the function. + * Tenant and namespace associated with the function. + * Pulsar window function name, ID, and version. + * ID of the Pulsar function instance running the window function. + * Number of instances that invoke the window function. + * Built-in type or custom class name of the output schema. + +- [Logger](#logger) + + * Logger object used by the window function, which can be used to create window function log messages. + +- [User config](#user-config) + + * Access to arbitrary user configuration values. + +- [Routing](#routing) + + * Routing is supported in Pulsar window functions. Pulsar window functions send messages to arbitrary topics as per the `publish` interface. + +- [Metrics](#metrics) + + * Interface for recording metrics. + +- [State storage](#state-storage) + + * Interface for storing and retrieving state in [state storage](#state-storage). + +## Spec + +Spec contains the basic information of a function. + +### Get input topics + +The `getInputTopics` method gets the **name list** of all input topics. + +This example demonstrates how to get the name list of all input topics in a Java window function. 
+ +```java + +public class GetInputTopicsWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + Collection inputTopics = context.getInputTopics(); + System.out.println(inputTopics); + + return null; + } + +} + +``` + +### Get output topic + +The `getOutputTopic` method gets the **name of a topic** to which the message is sent. + +This example demonstrates how to get the name of an output topic in a Java window function. + +```java + +public class GetOutputTopicWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String outputTopic = context.getOutputTopic(); + System.out.println(outputTopic); + + return null; + } +} + +``` + +### Get tenant + +The `getTenant` method gets the tenant name associated with the window function. + +This example demonstrates how to get the tenant name in a Java window function. + +```java + +public class GetTenantWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String tenant = context.getTenant(); + System.out.println(tenant); + + return null; + } + +} + +``` + +### Get namespace + +The `getNamespace` method gets the namespace associated with the window function. + +This example demonstrates how to get the namespace in a Java window function. + +```java + +public class GetNamespaceWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String ns = context.getNamespace(); + System.out.println(ns); + + return null; + } + +} + +``` + +### Get function name + +The `getFunctionName` method gets the window function name. + +This example demonstrates how to get the function name in a Java window function. 
+ +```java + +public class GetNameOfWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionName = context.getFunctionName(); + System.out.println(functionName); + + return null; + } + +} + +``` + +### Get function ID + +The `getFunctionId` method gets the window function ID. + +This example demonstrates how to get the function ID in a Java window function. + +```java + +public class GetFunctionIDWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionID = context.getFunctionId(); + System.out.println(functionID); + + return null; + } + +} + +``` + +### Get function version + +The `getFunctionVersion` method gets the window function version. + +This example demonstrates how to get the function version of a Java window function. + +```java + +public class GetVersionOfWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionVersion = context.getFunctionVersion(); + System.out.println(functionVersion); + + return null; + } + +} + +``` + +### Get instance ID + +The `getInstanceId` method gets the instance ID of a window function. + +This example demonstrates how to get the instance ID in a Java window function. + +```java + +public class GetInstanceIDWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + int instanceId = context.getInstanceId(); + System.out.println(instanceId); + + return null; + } + +} + +``` + +### Get num instances + +The `getNumInstances` method gets the number of instances that invoke the window function. + +This example demonstrates how to get the number of instances in a Java window function. 
+ +```java + +public class GetNumInstancesWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + int numInstances = context.getNumInstances(); + System.out.println(numInstances); + + return null; + } + +} + +``` + +### Get output schema type + +The `getOutputSchemaType` method gets the built-in type or custom class name of the output schema. + +This example demonstrates how to get the output schema type of a Java window function. + +```java + +public class GetOutputSchemaTypeWindowFunction implements WindowFunction { + + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String schemaType = context.getOutputSchemaType(); + System.out.println(schemaType); + + return null; + } +} + +``` + +## Logger + +Pulsar window functions that use the Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/apache/log4j/Logger.html) object that can be used to produce logs at the chosen log level. + +This example logs each incoming record at the `INFO` level, with the suffix `-window-log` appended, in a Java window function. + +```java + +import java.util.Collection; +import org.apache.pulsar.functions.api.Record; +import org.apache.pulsar.functions.api.WindowContext; +import org.apache.pulsar.functions.api.WindowFunction; +import org.slf4j.Logger; + +public class LoggingWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + Logger log = context.getLogger(); + for (Record record : inputs) { + log.info(record + "-window-log"); + } + return null; + } + +} + +``` + +If you need your function to produce logs, specify a log topic when creating or running the function. 
+ +```bash + +bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +You can access all logs produced by `LoggingFunction` via the `persistent://public/default/logging-function-logs` topic. + +## Metrics + +Pulsar window functions can publish arbitrary metrics to the metrics interface which can be queried. + +:::note + +If a Pulsar window function uses the language-native interface for Java, that function is not able to publish metrics and stats to Pulsar. + +::: + +You can record metrics using the context object on a per-key basis. + +This example sets a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message in a Java function. + +```java + +import java.util.Collection; +import org.apache.pulsar.functions.api.Record; +import org.apache.pulsar.functions.api.WindowContext; +import org.apache.pulsar.functions.api.WindowFunction; + + +/** + * Example function that wants to keep track of + * the event time of each message sent. + */ +public class UserMetricWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + + for (Record record : inputs) { + if (record.getEventTime().isPresent()) { + context.recordMetric("MessageEventTime", record.getEventTime().get().doubleValue()); + } + } + + return null; + } +} + +``` + +## User config + +When you run or update Pulsar Functions that are created using SDK, you can pass arbitrary key/value pairs to them with the `--user-config` flag. Key/value pairs **must** be specified as JSON. + +This example passes a user configured key/value to a function. 
+ +```bash + +bin/pulsar-admin functions create \ + --name word-filter \ + --user-config '{"forbidden-word":"rosebud"}' \ + # Other function configs + +``` + +### API +You can use the following APIs to get user-defined information for window functions. +#### getUserConfigMap + +`getUserConfigMap` API gets a map of all user-defined key/value configurations for the window function. + +```java + +/** + * Get a map of all user-defined key/value configs for the function. + * + * @return The full map of user-defined config values + */ + Map getUserConfigMap(); + +``` + +#### getUserConfigValue + +The `getUserConfigValue` API gets a user-defined key/value. + +```java + +/** + * Get any user-defined key/value. + * + * @param key The key + * @return The Optional value specified by the user for that key. + */ + Optional getUserConfigValue(String key); + +``` + +#### getUserConfigValueOrDefault + +The `getUserConfigValueOrDefault` API gets a user-defined key/value or a default value if none is present. + +```java + +/** + * Get any user-defined key/value or a default value if none is present. + * + * @param key + * @param defaultValue + * @return Either the user config value associated with a given key or a supplied default value + */ + Object getUserConfigValueOrDefault(String key, Object defaultValue); + +``` + +This example demonstrates how to access key/value pairs provided to Pulsar window functions. + +Java SDK context object enables you to access key/value pairs provided to Pulsar window functions via the command line (as JSON). + +:::tip + +For all key/value pairs passed to Java window functions, both the `key` and the `value` are `String`. To set the value to be a different type, you need to deserialize it from the `String` type. + +::: + +This example passes a key/value pair in a Java window function. 
+ +```bash + +bin/pulsar-admin functions create \ + --user-config '{"word-of-the-day":"verdure"}' \ + # Other function configs + +``` + +This example accesses values in a Java window function. + +The `UserConfigFunction` function logs the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The user config of `word-of-the-day` is changed **only** when the function is updated with a new config value via +multiple ways, such as the command line tool or REST API. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigWindowFunction implements WindowFunction { + @Override + public String process(Collection> input, WindowContext context) throws Exception { + Optional whatToWrite = context.getUserConfigValue("WhatToWrite"); + if (whatToWrite.get() != null) { + return (String)whatToWrite.get(); + } else { + return "Not a nice way"; + } + } + +} + +``` + +If no value is provided, you can access the entire user config map or set a default value. + +```java + +// Get the whole config map +Map allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); + +``` + +## Routing + +You can use the `context.publish()` interface to publish as many results as you want. + +This example shows that the `PublishFunction` class uses the built-in function in the context to publish messages to the `publishTopic` in a Java function. 
+ +```java + +public class PublishWindowFunction implements WindowFunction { + @Override + public Void process(Collection> input, WindowContext context) throws Exception { + String publishTopic = (String) context.getUserConfigValueOrDefault("publish-topic", "publishtopic"); + String output = String.format("%s!", input); + context.publish(publishTopic, output); + + return null; + } + +} + +``` + +## State storage + +Pulsar window functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Apache Pulsar installation (including the standalone installation) includes the deployment of BookKeeper bookies. + +Apache Pulsar integrates with Apache BookKeeper `table service` to store the `state` for functions. For example, the `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions state APIs. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data—counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function and shared between instances of that function. + +Currently, Pulsar window functions expose Java API to access, update, and manage states. These APIs are available in the context object when you use Java SDK functions. + +| Java API| Description +|---|--- +|`incrCounter`|Increases a built-in distributed counter referred by key. +|`getCounter`|Gets the counter value for the key. +|`putState`|Updates the state value for the key. + +You can use the following APIs to access, update, and manage states in Java window functions. + +#### incrCounter + +The `incrCounter` API increases a built-in distributed counter referred by key. + +Applications use the `incrCounter` API to change the counter of a given `key` by the given `amount`. If the `key` does not exist, a new key is created. 
+ +```java + + /** + * Increment the builtin distributed counter referred by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); + +``` + +#### getCounter + +The `getCounter` API gets the counter value for the key. + +Applications use the `getCounter` API to retrieve the counter of a given `key` changed by the `incrCounter` API. + +```java + + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); + +``` + +Besides the counter APIs, Pulsar also exposes a general key/value API (`putState`) for functions to store general key/value state. + +#### putState + +The `putState` API updates the state value for the key. + +```java + + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); + +``` + +This example demonstrates how applications store states in Pulsar window functions. + +The logic of the `WordCountWindowFunction` is simple and straightforward. + +1. The function first splits the received string into multiple words using regex `\\.`. + +2. For each `word`, the function increments the corresponding `counter` by 1 via `incrCounter(key, amount)`. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + for (Record input : inputs) { + Arrays.asList(input.getValue().split("\\.")).forEach(word -> context.incrCounter(word, 1)); + } + return null; + + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/about.md b/site2/website/versioned_docs/version-2.8.x/about.md new file mode 100644 index 0000000000000..cb607269b460d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/about.md @@ -0,0 +1,56 @@ +--- +slug: / +id: about +title: Welcome to the doc portal! +sidebar_label: "About" +--- + +import BlockLinks from "@site/src/components/BlockLinks"; +import BlockLink from "@site/src/components/BlockLink"; +import { docUrl } from "@site/src/utils/index"; + + +# Welcome to the doc portal! +*** + +This portal holds a variety of support documents to help you work with Pulsar . If you’re a beginner, there are tutorials and explainers to help you understand Pulsar and how it works. + +If you’re an experienced coder, review this page to learn the easiest way to access the specific content you’re looking for. + +## Get Started Now + + + + + + + + + +## Navigation +*** + +There are several ways to get around in the doc portal. The index navigation pane is a table of contents for the entire archive. The archive is divided into sections, like chapters in a book. Click the title of the topic to view it. + +In-context links provide an easy way to immediately reference related topics. Click the underlined term to view the topic. + +Links to related topics can be found at the bottom of each topic page. Click the link to view the topic. 
+ +![Page Linking](/assets/page-linking.png) + +## Continuous Improvement +*** +As you probably know, we are working on a new user experience for our documentation portal that will make learning about and building on top of Apache Pulsar a much better experience. Whether you need overview concepts, how-to procedures, curated guides or quick references, we’re building content to support it. This welcome page is just the first step. We will be providing updates every month. + +## Help Improve These Documents +*** + +You’ll notice an Edit button at the bottom and top of each page. Click it to open a landing page with instructions for requesting changes to posted documents. These are your resources. Participation is not only welcomed – it’s essential! + +## Join the Community! +*** + +The Pulsar community on github is active, passionate, and knowledgeable. Join discussions, voice opinions, suggest features, and dive into the code itself. Find your Pulsar family here at [apache/pulsar](https://github.com/apache/pulsar). + +An equally passionate community can be found in the [Pulsar Slack channel](https://apache-pulsar.slack.com/). You’ll need an invitation to join, but many Github Pulsar community members are Slack members too. Join, hang out, learn, and make some new friends. + diff --git a/site2/website/versioned_docs/version-2.8.x/adaptors-kafka.md b/site2/website/versioned_docs/version-2.8.x/adaptors-kafka.md new file mode 100644 index 0000000000000..ad0d886a9e04b --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/adaptors-kafka.md @@ -0,0 +1,274 @@ +--- +id: adaptors-kafka +title: Pulsar adaptor for Apache Kafka +sidebar_label: "Kafka client wrapper" +original_id: adaptors-kafka +--- + + +Pulsar provides an easy option for applications that are currently written using the [Apache Kafka](http://kafka.apache.org) Java client API. 
+ +## Using the Pulsar Kafka compatibility wrapper + +In an existing application, change the regular Kafka client dependency and replace it with the Pulsar Kafka wrapper. Remove the following dependency in `pom.xml`: + +```xml + + + org.apache.kafka + kafka-clients + 0.10.2.1 + + +``` + +Then include this dependency for the Pulsar Kafka wrapper: + +```xml + + + org.apache.pulsar + pulsar-client-kafka + @pulsar:version@ + + +``` + +With the new dependency, the existing code works without any changes. You need to adjust the configuration, and make sure it points the +producers and consumers to Pulsar service rather than Kafka, and uses a particular +Pulsar topic. + +## Using the Pulsar Kafka compatibility wrapper together with existing kafka client + +When migrating from Kafka to Pulsar, the application might use the original kafka client +and the pulsar kafka wrapper together during migration. You should consider using the +unshaded pulsar kafka client wrapper. + +```xml + + + org.apache.pulsar + pulsar-client-kafka-original + @pulsar:version@ + + +``` + +When using this dependency, construct producers using `org.apache.kafka.clients.producer.PulsarKafkaProducer` +instead of `org.apache.kafka.clients.producer.KafkaProducer` and `org.apache.kafka.clients.producer.PulsarKafkaConsumer` for consumers. 
+ +## Producer example + +```java + +// Topic needs to be a regular Pulsar topic +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); + +props.put("key.serializer", IntegerSerializer.class.getName()); +props.put("value.serializer", StringSerializer.class.getName()); + +Producer producer = new KafkaProducer(props); + +for (int i = 0; i < 10; i++) { + producer.send(new ProducerRecord(topic, i, "hello-" + i)); + log.info("Message {} sent successfully", i); +} + +producer.close(); + +``` + +## Consumer example + +```java + +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); +props.put("group.id", "my-subscription-name"); +props.put("enable.auto.commit", "false"); +props.put("key.deserializer", IntegerDeserializer.class.getName()); +props.put("value.deserializer", StringDeserializer.class.getName()); + +Consumer consumer = new KafkaConsumer(props); +consumer.subscribe(Arrays.asList(topic)); + +while (true) { + ConsumerRecords records = consumer.poll(100); + records.forEach(record -> { + log.info("Received record: {}", record); + }); + + // Commit last offset + consumer.commitSync(); +} + +``` + +## Complete Examples + +You can find the complete producer and consumer examples [here](https://github.com/apache/pulsar-adapters/tree/master/pulsar-client-kafka-compat/pulsar-client-kafka-tests/src/test/java/org/apache/pulsar/client/kafka/compat/examples). + +## Compatibility matrix + +Currently the Pulsar Kafka wrapper supports most of the operations offered by the Kafka API. 
+ +### Producer + +APIs: + +| Producer Method | Supported | Notes | +|:------------------------------------------------------------------------------|:----------|:-------------------------------------------------------------------------| +| `Future send(ProducerRecord record)` | Yes | | +| `Future send(ProducerRecord record, Callback callback)` | Yes | | +| `void flush()` | Yes | | +| `List partitionsFor(String topic)` | No | | +| `Map metrics()` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | + +Properties: + +| Config property | Supported | Notes | +|:----------------------------------------|:----------|:------------------------------------------------------------------------------| +| `acks` | Ignored | Durability and quorum writes are configured at the namespace level | +| `auto.offset.reset` | Yes | It uses a default value of `earliest` if you do not give a specific setting. | +| `batch.size` | Ignored | | +| `bootstrap.servers` | Yes | | +| `buffer.memory` | Ignored | | +| `client.id` | Ignored | | +| `compression.type` | Yes | Allows `gzip` and `lz4`. No `snappy`. | +| `connections.max.idle.ms` | Yes | Only support up to 2,147,483,647,000(Integer.MAX_VALUE * 1000) ms of idle time| +| `interceptor.classes` | Yes | | +| `key.serializer` | Yes | | +| `linger.ms` | Yes | Controls the group commit time when batching messages | +| `max.block.ms` | Ignored | | +| `max.in.flight.requests.per.connection` | Ignored | In Pulsar ordering is maintained even with multiple requests in flight | +| `max.request.size` | Ignored | | +| `metric.reporters` | Ignored | | +| `metrics.num.samples` | Ignored | | +| `metrics.sample.window.ms` | Ignored | | +| `partitioner.class` | Yes | | +| `receive.buffer.bytes` | Ignored | | +| `reconnect.backoff.ms` | Ignored | | +| `request.timeout.ms` | Ignored | | +| `retries` | Ignored | Pulsar client retries with exponential backoff until the send timeout expires. 
| +| `send.buffer.bytes` | Ignored | | +| `timeout.ms` | Yes | | +| `value.serializer` | Yes | | + + +### Consumer + +The following table lists consumer APIs. + +| Consumer Method | Supported | Notes | +|:--------------------------------------------------------------------------------------------------------|:----------|:------| +| `Set assignment()` | No | | +| `Set subscription()` | Yes | | +| `void subscribe(Collection topics)` | Yes | | +| `void subscribe(Collection topics, ConsumerRebalanceListener callback)` | No | | +| `void assign(Collection partitions)` | No | | +| `void subscribe(Pattern pattern, ConsumerRebalanceListener callback)` | No | | +| `void unsubscribe()` | Yes | | +| `ConsumerRecords poll(long timeoutMillis)` | Yes | | +| `void commitSync()` | Yes | | +| `void commitSync(Map offsets)` | Yes | | +| `void commitAsync()` | Yes | | +| `void commitAsync(OffsetCommitCallback callback)` | Yes | | +| `void commitAsync(Map offsets, OffsetCommitCallback callback)` | Yes | | +| `void seek(TopicPartition partition, long offset)` | Yes | | +| `void seekToBeginning(Collection partitions)` | Yes | | +| `void seekToEnd(Collection partitions)` | Yes | | +| `long position(TopicPartition partition)` | Yes | | +| `OffsetAndMetadata committed(TopicPartition partition)` | Yes | | +| `Map metrics()` | No | | +| `List partitionsFor(String topic)` | No | | +| `Map> listTopics()` | No | | +| `Set paused()` | No | | +| `void pause(Collection partitions)` | No | | +| `void resume(Collection partitions)` | No | | +| `Map offsetsForTimes(Map timestampsToSearch)` | No | | +| `Map beginningOffsets(Collection partitions)` | No | | +| `Map endOffsets(Collection partitions)` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | +| `void wakeup()` | No | | + +Properties: + +| Config property | Supported | Notes | +|:--------------------------------|:----------|:------------------------------------------------------| +| `group.id` | Yes | Maps 
to a Pulsar subscription name | +| `max.poll.records` | Yes | | +| `max.poll.interval.ms` | Ignored | Messages are "pushed" from broker | +| `session.timeout.ms` | Ignored | | +| `heartbeat.interval.ms` | Ignored | | +| `bootstrap.servers` | Yes | Needs to point to a single Pulsar service URL | +| `enable.auto.commit` | Yes | | +| `auto.commit.interval.ms` | Ignored | With auto-commit, acks are sent immediately to broker | +| `partition.assignment.strategy` | Ignored | | +| `auto.offset.reset` | Yes | Only support earliest and latest. | +| `fetch.min.bytes` | Ignored | | +| `fetch.max.bytes` | Ignored | | +| `fetch.max.wait.ms` | Ignored | | +| `interceptor.classes` | Yes | | +| `metadata.max.age.ms` | Ignored | | +| `max.partition.fetch.bytes` | Ignored | | +| `send.buffer.bytes` | Ignored | | +| `receive.buffer.bytes` | Ignored | | +| `client.id` | Ignored | | + + +## Customize Pulsar configurations + +You can configure Pulsar authentication provider directly from the Kafka properties. + +### Pulsar client properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.authentication.class`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-org.apache.pulsar.client.api.Authentication-) | | Configure to auth provider. For example, `org.apache.pulsar.client.impl.auth.AuthenticationTls`.| +| [`pulsar.authentication.params.map`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.util.Map-) | | Map which represents parameters for the Authentication-Plugin. 
| +| [`pulsar.authentication.params.string`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.lang.String-) | | String which represents parameters for the Authentication-Plugin, for example, `key1:val1,key2:val2`. | +| [`pulsar.use.tls`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTls-boolean-) | `false` | Enable TLS transport encryption. | +| [`pulsar.tls.trust.certs.file.path`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsTrustCertsFilePath-java.lang.String-) | | Path for the TLS trust certificate store. | +| [`pulsar.tls.allow.insecure.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsAllowInsecureConnection-boolean-) | `false` | Accept self-signed certificates from brokers. | +| [`pulsar.operation.timeout.ms`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setOperationTimeout-int-java.util.concurrent.TimeUnit-) | `30000` | General operations timeout. | +| [`pulsar.stats.interval.seconds`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setStatsInterval-long-java.util.concurrent.TimeUnit-) | `60` | Pulsar client lib stats printing interval. | +| [`pulsar.num.io.threads`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setIoThreads-int-) | `1` | The number of Netty IO threads to use. | +| [`pulsar.connections.per.broker`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConnectionsPerBroker-int-) | `1` | The maximum number of connections to each broker. | +| [`pulsar.use.tcp.nodelay`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTcpNoDelay-boolean-) | `true` | TCP no-delay. 
| +| [`pulsar.concurrent.lookup.requests`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConcurrentLookupRequest-int-) | `50000` | The maximum number of concurrent topic lookups. | +| [`pulsar.max.number.rejected.request.per.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setMaxNumberOfRejectedRequestPerConnection-int-) | `50` | The threshold of errors to forcefully close a connection. | +| [`pulsar.keepalive.interval.ms`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientBuilder.html#keepAliveInterval-int-java.util.concurrent.TimeUnit-)| `30000` | Keep alive interval for each client-broker-connection. | + + +### Pulsar producer properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.producer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setProducerName-java.lang.String-) | | Specify the producer name. | +| [`pulsar.producer.initial.sequence.id`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setInitialSequenceId-long-) | | Specify baseline for sequence ID of this producer. | +| [`pulsar.producer.max.pending.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessages-int-) | `1000` | Set the maximum size of the message queue pending to receive an acknowledgment from the broker. | +| [`pulsar.producer.max.pending.messages.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessagesAcrossPartitions-int-) | `50000` | Set the maximum number of pending messages across all the partitions. 
| +| [`pulsar.producer.batching.enabled`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingEnabled-boolean-) | `true` | Control whether automatic batching of messages is enabled for the producer. | +| [`pulsar.producer.batching.max.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingMaxMessages-int-) | `1000` | The maximum number of messages in a batch. | +| [`pulsar.block.if.producer.queue.full`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBlockIfQueueFull-boolean-) | | Specify the block producer if queue is full. | + + +### Pulsar consumer Properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.consumer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setConsumerName-java.lang.String-) | | Specify the consumer name. | +| [`pulsar.consumer.receiver.queue.size`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setReceiverQueueSize-int-) | 1000 | Set the size of the consumer receiver queue. | +| [`pulsar.consumer.acknowledgments.group.time.millis`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#acknowledgmentGroupTime-long-java.util.concurrent.TimeUnit-) | 100 | Set the maximum amount of group time for consumers to send the acknowledgments to the broker. | +| [`pulsar.consumer.total.receiver.queue.size.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setMaxTotalReceiverQueueSizeAcrossPartitions-int-) | 50000 | Set the maximum size of the total receiver queue across partitions. 
| +| [`pulsar.consumer.subscription.topics.mode`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#subscriptionTopicsMode-Mode-) | PersistentOnly | Set the subscription topic mode for consumers. | diff --git a/site2/website/versioned_docs/version-2.8.x/adaptors-spark.md b/site2/website/versioned_docs/version-2.8.x/adaptors-spark.md new file mode 100644 index 0000000000000..e14f13b5d4b07 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/adaptors-spark.md @@ -0,0 +1,91 @@ +--- +id: adaptors-spark +title: Pulsar adaptor for Apache Spark +sidebar_label: "Apache Spark" +original_id: adaptors-spark +--- + +## Spark Streaming receiver +The Spark Streaming receiver for Pulsar is a custom receiver that enables Apache [Spark Streaming](https://spark.apache.org/streaming/) to receive raw data from Pulsar. + +An application can receive data in [Resilient Distributed Dataset](https://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds) (RDD) format via the Spark Streaming receiver and can process it in a variety of ways. + +### Prerequisites + +To use the receiver, include a dependency for the `pulsar-spark` library in your Java configuration. 
+ +#### Maven + +If you're using Maven, add this to your `pom.xml`: + +```xml + + +@pulsar:version@ + + + + org.apache.pulsar + pulsar-spark + ${pulsar.version} + + +``` + +#### Gradle + +If you're using Gradle, add this to your `build.gradle` file: + +```groovy + +def pulsarVersion = "@pulsar:version@" + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-spark', version: pulsarVersion +} + +``` + +### Usage + +Pass an instance of `SparkStreamingPulsarReceiver` to the `receiverStream` method in `JavaStreamingContext`: + +```java + + String serviceUrl = "pulsar://localhost:6650/"; + String topic = "persistent://public/default/test_src"; + String subs = "test_sub"; + + SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("Pulsar Spark Example"); + + JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(60)); + + ConsumerConfigurationData pulsarConf = new ConsumerConfigurationData(); + + Set set = new HashSet(); + set.add(topic); + pulsarConf.setTopicNames(set); + pulsarConf.setSubscriptionName(subs); + + SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver( + serviceUrl, + pulsarConf, + new AuthenticationDisabled()); + + JavaReceiverInputDStream lineDStream = jsc.receiverStream(pulsarReceiver); + +``` + +For a complete example, click [here](https://github.com/apache/pulsar-adapters/blob/master/examples/spark/src/main/java/org/apache/spark/streaming/receiver/example/SparkStreamingPulsarReceiverExample.java). In this example, the number of messages that contain the string "Pulsar" in received messages is counted. + +Note that if needed, other Pulsar authentication classes can be used. 
For example, in order to use a token during authentication the following parameters for the `SparkStreamingPulsarReceiver` constructor can be set: + +```java + +SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver( + serviceUrl, + pulsarConf, + new AuthenticationToken("token:")); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/adaptors-storm.md b/site2/website/versioned_docs/version-2.8.x/adaptors-storm.md new file mode 100644 index 0000000000000..76d507164777d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/adaptors-storm.md @@ -0,0 +1,96 @@ +--- +id: adaptors-storm +title: Pulsar adaptor for Apache Storm +sidebar_label: "Apache Storm" +original_id: adaptors-storm +--- + +Pulsar Storm is an adaptor for integrating with [Apache Storm](http://storm.apache.org/) topologies. It provides core Storm implementations for sending and receiving data. + +An application can inject data into a Storm topology via a generic Pulsar spout, as well as consume data from a Storm topology via a generic Pulsar bolt. + +## Using the Pulsar Storm Adaptor + +Include dependency for Pulsar Storm Adaptor: + +```xml + + + org.apache.pulsar + pulsar-storm + ${pulsar.version} + + +``` + +## Pulsar Spout + +The Pulsar Spout allows for the data published on a topic to be consumed by a Storm topology. It emits a Storm tuple based on the message received and the `MessageToValuesMapper` provided by the client. + +The tuples that fail to be processed by the downstream bolts will be re-injected by the spout with an exponential backoff, within a configurable timeout (the default is 60 seconds) or a configurable number of retries, whichever comes first, after which it is acknowledged by the consumer. 
Here's an example construction of a spout: + +```java + +MessageToValuesMapper messageToValuesMapper = new MessageToValuesMapper() { + + @Override + public Values toValues(Message msg) { + return new Values(new String(msg.getData())); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + // declare the output fields + declarer.declare(new Fields("string")); + } +}; + +// Configure a Pulsar Spout +PulsarSpoutConfiguration spoutConf = new PulsarSpoutConfiguration(); +spoutConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650"); +spoutConf.setTopic("persistent://my-property/usw/my-ns/my-topic1"); +spoutConf.setSubscriptionName("my-subscriber-name1"); +spoutConf.setMessageToValuesMapper(messageToValuesMapper); + +// Create a Pulsar Spout +PulsarSpout spout = new PulsarSpout(spoutConf); + +``` + +For a complete example, click [here](https://github.com/apache/pulsar-adapters/blob/master/pulsar-storm/src/test/java/org/apache/pulsar/storm/PulsarSpoutTest.java). + +## Pulsar Bolt + +The Pulsar bolt allows data in a Storm topology to be published on a topic. It publishes messages based on the Storm tuple received and the `TupleToMessageMapper` provided by the client. + +A partitioned topic can also be used to publish messages on different topics. In the implementation of the `TupleToMessageMapper`, a "key" will need to be provided in the message which will send the messages with the same key to the same topic. 
Here's an example bolt: + +```java + +TupleToMessageMapper tupleToMessageMapper = new TupleToMessageMapper() { + + @Override + public TypedMessageBuilder toMessage(TypedMessageBuilder msgBuilder, Tuple tuple) { + String receivedMessage = tuple.getString(0); + // message processing + String processedMsg = receivedMessage + "-processed"; + return msgBuilder.value(processedMsg.getBytes()); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + // declare the output fields + } +}; + +// Configure a Pulsar Bolt +PulsarBoltConfiguration boltConf = new PulsarBoltConfiguration(); +boltConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650"); +boltConf.setTopic("persistent://my-property/usw/my-ns/my-topic2"); +boltConf.setTupleToMessageMapper(tupleToMessageMapper); + +// Create a Pulsar Bolt +PulsarBolt bolt = new PulsarBolt(boltConf); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-brokers.md b/site2/website/versioned_docs/version-2.8.x/admin-api-brokers.md new file mode 100644 index 0000000000000..4af4363850efd --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-brokers.md @@ -0,0 +1,276 @@ +--- +id: admin-api-brokers +title: Managing Brokers +sidebar_label: "Brokers" +original_id: admin-api-brokers +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar brokers consist of two components: + +1. An HTTP server exposing a {@inject: rest:REST:/} interface for administration and [topic](reference-terminology.md#topic) lookup. +2. A dispatcher that handles all Pulsar [message](reference-terminology.md#message) transfers. 
+ +[Brokers](reference-terminology.md#broker) can be managed via: + +* The [`brokers`](reference-pulsar-admin.md#brokers) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool +* The `/admin/v2/brokers` endpoint of the admin {@inject: rest:REST:/} API +* The `brokers` method of the {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin.html} object in the [Java API](client-libraries-java.md) + +In addition to being configurable when you start them up, brokers can also be [dynamically configured](#dynamic-broker-configuration). + +> See the [Configuration](reference-configuration.md#broker) page for a full listing of broker-specific configuration parameters. + +## Brokers resources + +### List active brokers + +Fetch all available active brokers that are serving traffic with cluster name. + +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers list use + +``` + +``` + +broker1.use.org.com:8080 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/:cluster|operation/getActiveBrokers?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getActiveBrokers(clusterName) + +``` + + + + +```` + +### Get the information of the leader broker + +Fetch the information of the leader broker, for example, the service url. + +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers leader-broker + +``` + +``` + +BrokerInfo(serviceUrl=broker1.use.org.com:8080) + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/leaderBroker|operation/getLeaderBroker?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getLeaderBroker() + +``` + +For the detail of the code above, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/BrokersImpl.java#L80) + + + + +```` + +#### list of namespaces owned by a given broker + +It finds all namespaces which are owned and served by a given broker. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers namespaces use \ + --url broker1.use.org.com:8080 + +``` + +```json + +{ + "my-property/use/my-ns/0x00000000_0xffffffff": { + "broker_assignment": "shared", + "is_controlled": false, + "is_active": true + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/:cluster/:broker/ownedNamespaces|operation/getOwnedNamespaes?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getOwnedNamespaces(cluster,brokerUrl); + +``` + + + + +```` + +### Dynamic broker configuration + +One way to configure a Pulsar [broker](reference-terminology.md#broker) is to supply a [configuration](reference-configuration.md#broker) when the broker is [started up](reference-cli-tools.md#pulsar-broker). + +But since all broker configuration in Pulsar is stored in ZooKeeper, configuration values can also be dynamically updated *while the broker is running*. When you update broker configuration dynamically, ZooKeeper will notify the broker of the change and the broker will then override any existing configuration values. + +* The [`brokers`](reference-pulsar-admin.md#brokers) command for the [`pulsar-admin`](reference-pulsar-admin.md) tool has a variety of subcommands that enable you to manipulate a broker's configuration dynamically, enabling you to [update config values](#update-dynamic-configuration) and more. +* In the Pulsar admin {@inject: rest:REST:/} API, dynamic configuration is managed through the `/admin/v2/brokers/configuration` endpoint. + +### Update dynamic configuration + +````mdx-code-block + + + +The [`update-dynamic-config`](reference-pulsar-admin.md#brokers-update-dynamic-config) subcommand will update existing configuration. It takes two arguments: the name of the parameter and the new value using the `config` and `value` flag respectively. 
Here's an example for the [`brokerShutdownTimeoutMs`](reference-configuration.md#broker-brokerShutdownTimeoutMs) parameter: + +```shell + +$ pulsar-admin brokers update-dynamic-config --config brokerShutdownTimeoutMs --value 100 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/brokers/configuration/:configName/:configValue|operation/updateDynamicConfiguration?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().updateDynamicConfiguration(configName, configValue); + +``` + + + + +```` + +### List updated values + +Fetch a list of all potentially updatable configuration parameters. +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers list-dynamic-config +brokerShutdownTimeoutMs + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/configuration|operation/getDynamicConfigurationName?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getDynamicConfigurationNames(); + +``` + + + + +```` + +### List all + +Fetch a list of all parameters that have been dynamically updated. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers get-all-dynamic-config +brokerShutdownTimeoutMs:100 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/configuration/values|operation/getAllDynamicConfigurations?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getAllDynamicConfigurations(); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-clusters.md b/site2/website/versioned_docs/version-2.8.x/admin-api-clusters.md new file mode 100644 index 0000000000000..e0e9fb5f91f65 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-clusters.md @@ -0,0 +1,308 @@ +--- +id: admin-api-clusters +title: Managing Clusters +sidebar_label: "Clusters" +original_id: admin-api-clusters +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar clusters consist of one or more Pulsar [brokers](reference-terminology.md#broker), one or more [BookKeeper](reference-terminology.md#bookkeeper) +servers (aka [bookies](reference-terminology.md#bookie)), and a [ZooKeeper](https://zookeeper.apache.org) cluster that provides configuration and coordination management. + +Clusters can be managed via: + +* The [`clusters`](reference-pulsar-admin.md#clusters) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool +* The `/admin/v2/clusters` endpoint of the admin {@inject: rest:REST:/} API +* The `clusters` method of the {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object in the [Java API](client-libraries-java.md) + +## Clusters resources + +### Provision + +New clusters can be provisioned using the admin interface. + +> Please note that this operation requires superuser privileges. + +````mdx-code-block + + + +You can provision a new cluster using the [`create`](reference-pulsar-admin.md#clusters-create) subcommand. 
Here's an example: + +```shell + +$ pulsar-admin clusters create cluster-1 \ + --url http://my-cluster.org.com:8080 \ + --broker-url pulsar://my-cluster.org.com:6650 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/clusters/:cluster|operation/createCluster?version=@pulsar:version_number@} + + + + +```java + +ClusterData clusterData = new ClusterData( + serviceUrl, + serviceUrlTls, + brokerServiceUrl, + brokerServiceUrlTls +); +admin.clusters().createCluster(clusterName, clusterData); + +``` + + + + +```` + +### Initialize cluster metadata + +When you provision a new cluster, you need to initialize that cluster's [metadata](concepts-architecture-overview.md#metadata-store). When initializing cluster metadata, you need to specify all of the following: + +* The name of the cluster +* The local ZooKeeper connection string for the cluster +* The configuration store connection string for the entire instance +* The web service URL for the cluster +* A broker service URL enabling interaction with the [brokers](reference-terminology.md#broker) in the cluster + +You must initialize cluster metadata *before* starting up any [brokers](admin-api-brokers.md) that will belong to the cluster. + +> **No cluster metadata initialization through the REST API or the Java admin API** +> +> Unlike most other admin functions in Pulsar, cluster metadata initialization cannot be performed via the admin REST API +> or the admin Java client, as metadata initialization involves communicating with ZooKeeper directly. +> Instead, you can use the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool, in particular +> the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command. 
+ +Here's an example cluster metadata initialization command: + +```shell + +bin/pulsar initialize-cluster-metadata \ + --cluster us-west \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2184 \ + --web-service-url http://pulsar.us-west.example.com:8080/ \ + --web-service-url-tls https://pulsar.us-west.example.com:8443/ \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/ + +``` + +You'll need to use `--*-tls` flags only if you're using [TLS authentication](security-tls-authentication.md) in your instance. + +### Get configuration + +You can fetch the [configuration](reference-configuration.md) for an existing cluster at any time. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#clusters-get) subcommand and specify the name of the cluster. Here's an example: + +```shell + +$ pulsar-admin clusters get cluster-1 +{ + "serviceUrl": "http://my-cluster.org.com:8080/", + "serviceUrlTls": null, + "brokerServiceUrl": "pulsar://my-cluster.org.com:6650/", + "brokerServiceUrlTls": null, + "peerClusterNames": null +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/clusters/:cluster|operation/getCluster?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().getCluster(clusterName); + +``` + + + + +```` + +### Update + +You can update the configuration for an existing cluster at any time. + +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#clusters-update) subcommand and specify new configuration values using flags. 
+ +```shell + +$ pulsar-admin clusters update cluster-1 \ + --url http://my-cluster.org.com:4081 \ + --broker-url pulsar://my-cluster.org.com:3350 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/clusters/:cluster|operation/updateCluster?version=@pulsar:version_number@} + + + + +```java + +ClusterData clusterData = new ClusterData( + serviceUrl, + serviceUrlTls, + brokerServiceUrl, + brokerServiceUrlTls +); +admin.clusters().updateCluster(clusterName, clusterData); + +``` + + + + +```` + +### Delete + +Clusters can be deleted from a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#clusters-delete) subcommand and specify the name of the cluster. + +``` + +$ pulsar-admin clusters delete cluster-1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/clusters/:cluster|operation/deleteCluster?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().deleteCluster(clusterName); + +``` + + + + +```` + +### List + +You can fetch a list of all clusters in a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#clusters-list) subcommand. + +```shell + +$ pulsar-admin clusters list +cluster-1 +cluster-2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/clusters|operation/getClusters?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().getClusters(); + +``` + + + + +```` + +### Update peer-cluster data + +Peer clusters can be configured for a given cluster in a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`update-peer-clusters`](reference-pulsar-admin.md#clusters-update-peer-clusters) subcommand and specify the list of peer-cluster names. 
+ +``` + +$ pulsar-admin clusters update-peer-clusters cluster-1 --peer-clusters cluster-2 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/clusters/:cluster/peers|operation/setPeerClusterNames?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().updatePeerClusterNames(clusterName, peerClusterList); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-functions.md b/site2/website/versioned_docs/version-2.8.x/admin-api-functions.md new file mode 100644 index 0000000000000..93d41ac257301 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-functions.md @@ -0,0 +1,820 @@ +--- +id: admin-api-functions +title: Manage Functions +sidebar_label: "Functions" +original_id: admin-api-functions +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics +* apply a user-supplied processing logic to each message +* publish the results of the computation to another topic + +Functions can be managed via the following methods. + +Method | Description +---|--- +**Admin CLI** | The [`functions`](reference-pulsar-admin.md#functions) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool. +**REST API** |The `/admin/v3/functions` endpoint of the admin {@inject: rest:REST:/} API. +**Java Admin API**| The `functions` method of the {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object in the [Java API](client-libraries-java.md). + +## Function resources + +You can perform the following operations on functions. + +### Create a function + +You can create a Pulsar function in cluster mode (deploy it on a Pulsar cluster) using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#functions-create) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --inputs test-input-topic \ + --output persistent://public/default/test-output-topic \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --jar /examples/api-examples.jar + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName|operation/registerFunction?version=@pulsar:version_number@} + + + + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +functionConfig.setProcessingGuarantees(FunctionConfig.ProcessingGuarantees.ATLEAST_ONCE); +functionConfig.setTopicsPattern(sourceTopicPattern); +functionConfig.setSubName(subscriptionName); +functionConfig.setAutoAck(true); +functionConfig.setOutput(sinkTopic); +admin.functions().createFunction(functionConfig, fileName); + +``` + + + + +```` + +### Update a function + +You can update a Pulsar function that has been deployed to a Pulsar cluster using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#functions-update) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions update \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --output persistent://public/default/update-output-topic \ + # other options + +``` + + + + +{@inject: endpoint|PUT|/admin/v3/functions/:tenant/:namespace/:functionName|operation/updateFunction?version=@pulsar:version_number@} + + + + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +UpdateOptions updateOptions = new UpdateOptions(); +updateOptions.setUpdateAuthData(updateAuthData); +admin.functions().updateFunction(functionConfig, userCodeFile, updateOptions); + +``` + + + + +```` + +### Start an instance of a function + +You can start a stopped function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. + +```shell + +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/start|operation/startFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().startFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Start all instances of a function + +You can start all stopped function instances using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/start|operation/startFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().startFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Stop an instance of a function + +You can stop a function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/stop|operation/stopFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().stopFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Stop all instances of a function + +You can stop all function instances using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/stop|operation/stopFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().stopFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Restart an instance of a function + +Restart a function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/restart|operation/restartFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().restartFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Restart all instances of a function + +You can restart all function instances using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/restart|operation/restartFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().restartFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### List all functions + +You can list all Pulsar functions running under a specific tenant and namespace using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#functions-list) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions list \ + --tenant public \ + --namespace default + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace|operation/listFunctions?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctions(tenant, namespace); + +``` + + + + +```` + +### Delete a function + +You can delete a Pulsar function that is running on a Pulsar cluster using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#functions-delete) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions delete \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|DELETE|/admin/v3/functions/:tenant/:namespace/:functionName|operation/deregisterFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().deleteFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Get info about a function + +You can get information about a Pulsar function currently running in cluster mode using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#functions-get) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName|operation/getFunctionInfo?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Get status of an instance of a function + +You can get the current status of a Pulsar function instance with `instance-id` using Admin CLI, REST API or Java Admin API. +````mdx-code-block + + + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/status|operation/getFunctionInstanceStatus?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStatus(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Get status of all instances of a function + +You can get the current status of a Pulsar function instance using Admin CLI, REST API or Java Admin API. 
+ +````mdx-code-block + + + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/status|operation/getFunctionStatus?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStatus(tenant, namespace, functionName); + +``` + + + + +```` + +### Get stats of an instance of a function + +You can get the current stats of a Pulsar Function instance with `instance-id` using Admin CLI, REST API or Java admin API. +````mdx-code-block + + + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/stats|operation/getFunctionInstanceStats?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStats(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Get stats of all instances of a function + +You can get the current stats of a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/stats|operation/getFunctionStats?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStats(tenant, namespace, functionName); + +``` + + + + +```` + +### Trigger a function + +You can trigger a specified Pulsar function with a supplied value using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`trigger`](reference-pulsar-admin.md#functions-trigger) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --topic (the name of input topic) \ + --trigger-value \"hello pulsar\" + # or --trigger-file (the path of trigger file) + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/trigger|operation/triggerFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().triggerFunction(tenant, namespace, functionName, topic, triggerValue, triggerFile); + +``` + + + + +```` + +### Put state associated with a function + +You can put the state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`putstate`](reference-pulsar-admin.md#functions-putstate) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions putstate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --state "{\"key\":\"pulsar\", \"stringValue\":\"hello pulsar\"}" + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/state/:key|operation/putFunctionState?version=@pulsar:version_number@} + + + + +```java + +TypeReference typeRef = new TypeReference() {}; +FunctionState stateRepr = ObjectMapperFactory.getThreadLocal().readValue(state, typeRef); +admin.functions().putFunctionState(tenant, namespace, functionName, stateRepr); + +``` + + + + +```` + +### Fetch state associated with a function + +You can fetch the current state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`querystate`](reference-pulsar-admin.md#functions-querystate) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions querystate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --key (the key of state) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/state/:key|operation/getFunctionState?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionState(tenant, namespace, functionName, key); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-namespaces.md b/site2/website/versioned_docs/version-2.8.x/admin-api-namespaces.md new file mode 100644 index 0000000000000..9cb387041f11c --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-namespaces.md @@ -0,0 +1,1315 @@ +--- +id: admin-api-namespaces +title: Managing Namespaces +sidebar_label: "Namespaces" +original_id: admin-api-namespaces +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar 
[namespaces](reference-terminology.md#namespace) are logical groupings of [topics](reference-terminology.md#topic). + +Namespaces can be managed via: + +* The [`namespaces`](reference-pulsar-admin.md#namespaces) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool +* The `/admin/v2/namespaces` endpoint of the admin {@inject: rest:REST:/} API +* The `namespaces` method of the {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object in the [Java API](client-libraries-java.md) + +## Namespaces resources + +### Create namespaces + +You can create new namespaces under a given [tenant](reference-terminology.md#tenant). + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#namespaces-create) subcommand and specify the namespace by name: + +```shell + +$ pulsar-admin namespaces create test-tenant/test-namespace + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace|operation/createNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().createNamespace(namespace); + +``` + + + + +```` + +### Get policies + +You can fetch the current policies associated with a namespace at any time. 
+ +````mdx-code-block + + + +Use the [`policies`](reference-pulsar-admin.md#namespaces-policies) subcommand and specify the namespace: + +```shell + +$ pulsar-admin namespaces policies test-tenant/test-namespace +{ + "auth_policies": { + "namespace_auth": {}, + "destination_auth": {} + }, + "replication_clusters": [], + "bundles_activated": true, + "bundles": { + "boundaries": [ + "0x00000000", + "0xffffffff" + ], + "numBundles": 1 + }, + "backlog_quota_map": {}, + "persistence": null, + "latency_stats_sample_rate": {}, + "message_ttl_in_seconds": 0, + "retention_policies": null, + "deleted": false +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace|operation/getPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPolicies(namespace); + +``` + + + + +```` + +### List namespaces + +You can list all namespaces within a given Pulsar [tenant](reference-terminology.md#tenant). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#namespaces-list) subcommand and specify the tenant: + +```shell + +$ pulsar-admin namespaces list test-tenant +test-tenant/ns1 +test-tenant/ns2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant|operation/getTenantNamespaces?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaces(tenant); + +``` + + + + +```` + +### Delete namespaces + +You can delete existing namespaces from a tenant. 
+ +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#namespaces-delete) subcommand and specify the namespace: + +```shell + +$ pulsar-admin namespaces delete test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace|operation/deleteNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().deleteNamespace(namespace); + +``` + + + + +```` + +### Configure replication clusters + +#### Set replication cluster + +It sets replication clusters for a namespace, so Pulsar can internally replicate publish message from one colo to another colo. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-clusters test-tenant/ns1 \ + --clusters cl1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/replication|operation/setNamespaceReplicationClusters?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceReplicationClusters(namespace, clusters); + +``` + + + + +```` + +#### Get replication cluster + +It gives a list of replication clusters for a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-clusters test-tenant/cl1/ns1 + +``` + +``` + +cl2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/replication|operation/getNamespaceReplicationClusters?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceReplicationClusters(namespace) + +``` + + + + +```` + +### Configure backlog quota policies + +#### Set backlog quota policies + +Backlog quota helps the broker to restrict bandwidth/storage of a namespace once it reaches a certain threshold limit. Admin can set the limit and take corresponding action after the limit is reached. + + 1. producer_request_hold: broker will hold and not persist produce request payload + + 2. producer_exception: broker disconnects with the client by giving an exception. + + 3. 
consumer_backlog_eviction: broker will start discarding backlog messages + + Backlog quota restriction is applied by defining a restriction of backlog-quota-type: destination_storage + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-backlog-quota --limit 10G --limitTime 36000 --policy producer_request_hold test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/setBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setBacklogQuota(namespace, new BacklogQuota(limit, limitTime, policy)) + +``` + + + + +```` + +#### Get backlog quota policies + +It shows a configured backlog quota for a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-backlog-quotas test-tenant/ns1 + +``` + +```json + +{ + "destination_storage": { + "limit": 10, + "policy": "producer_request_hold" + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getBacklogQuotaMap(namespace); + +``` + + + + +```` + +#### Remove backlog quota policies + +It removes backlog quota policies for a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-backlog-quota test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/removeBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeBacklogQuota(namespace, backlogQuotaType) + +``` + + + + +```` + +### Configure persistence policies + +#### Set persistence policies + +Persistence policies allow you to configure the persistence level for all topic messages under a given namespace. 
+ + - Bookkeeper-ack-quorum: Number of acks (guaranteed copies) to wait for each entry, default: 0 + + - Bookkeeper-ensemble: Number of bookies to use for a topic, default: 0 + + - Bookkeeper-write-quorum: How many writes to make of each entry, default: 0 + + - Ml-mark-delete-max-rate: Throttling rate of mark-delete operation (0 means no throttle), default: 0.0 + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-persistence --bookkeeper-ack-quorum 2 --bookkeeper-ensemble 3 --bookkeeper-write-quorum 2 --ml-mark-delete-max-rate 0 test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/setPersistence?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setPersistence(namespace,new PersistencePolicies(bookkeeperEnsemble, bookkeeperWriteQuorum,bookkeeperAckQuorum,managedLedgerMaxMarkDeleteRate)) + +``` + + + + +```` + +#### Get persistence policies + +It shows the configured persistence policies of a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-persistence test-tenant/ns1 + +``` + +```json + +{ + "bookkeeperEnsemble": 3, + "bookkeeperWriteQuorum": 2, + "bookkeeperAckQuorum": 2, + "managedLedgerMaxMarkDeleteRate": 0 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/getPersistence?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPersistence(namespace) + +``` + + + + +```` + +### Configure namespace bundles + +#### Unload namespace bundles + +The namespace bundle is a virtual group of topics which belong to the same namespace. If the broker gets overloaded with the number of bundles, this command can help unload a bundle from that broker, so it can be served by some other less-loaded brokers. The namespace bundle ID ranges from 0x00000000 to 0xffffffff. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces unload --bundle 0x00000000_0xffffffff test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/:bundle/unload|operation/unloadNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().unloadNamespaceBundle(namespace, bundle) + +``` + + + + +```` + +#### Split namespace bundles + +Each namespace bundle can contain multiple topics and each bundle can be served by only one broker. +If a single bundle is creating an excessive load on a broker, an admin splits the bundle using this command permitting one or more of the new bundles to be unloaded thus spreading the load across the brokers. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces split-bundle --bundle 0x00000000_0xffffffff test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/:bundle/split|operation/splitNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().splitNamespaceBundle(namespace, bundle) + +``` + + + + +```` + +### Configure message TTL + +#### Set message-ttl + +It configures message’s time to live (in seconds) duration. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-message-ttl --messageTTL 100 test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/setNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceMessageTTL(namespace, messageTTL) + +``` + + + + +```` + +#### Get message-ttl + +It gives a message ttl of configured namespace. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-message-ttl test-tenant/ns1 + +``` + +``` + +100 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/getNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceMessageTTL(namespace) + +``` + + + + +```` + +#### Remove message-ttl + +Remove a message TTL of the configured namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-message-ttl test-tenant/ns1 + +``` + +``` + +100 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/removeNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeNamespaceMessageTTL(namespace) + +``` + + + + +```` + + +### Clear backlog + +#### Clear namespace backlog + +It clears all message backlog for all the topics that belong to a specific namespace. You can also clear backlog for a specific subscription as well. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces clear-backlog --sub my-subscription test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/clearBacklog|operation/clearNamespaceBacklogForSubscription?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().clearNamespaceBacklogForSubscription(namespace, subscription) + +``` + + + + +```` + +#### Clear bundle backlog + +It clears all message backlog for all the topics that belong to a specific NamespaceBundle. You can also clear backlog for a specific subscription as well. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces clear-backlog --bundle 0x00000000_0xffffffff --sub my-subscription test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/:bundle/clearBacklog|operation/clearNamespaceBundleBacklogForSubscription?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().clearNamespaceBundleBacklogForSubscription(namespace, bundle, subscription) + +``` + + + + +```` + +### Configure retention + +#### Set retention + +Each namespace contains multiple topics and the retention size (storage size) of each topic should not exceed a specific threshold or it should be stored for a certain period. This command helps configure the retention size and time of topics in a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-retention --size 100 --time 10 test-tenant/ns1 + +``` + +``` + +N/A + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/retention|operation/setRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setRetention(namespace, new RetentionPolicies(retentionTimeInMin, retentionSizeInMB)) + +``` + + + + +```` + +#### Get retention + +It shows retention information of a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-retention test-tenant/ns1 + +``` + +```json + +{ + "retentionTimeInMinutes": 10, + "retentionSizeInMB": 100 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/retention|operation/getRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getRetention(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for topics + +#### Set dispatch throttling for topics + +It sets message dispatch rate for all the topics under a given namespace. 
+The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X second (`byte-dispatch-rate`). +dispatch rate is in second and it can be configured with `dispatch-rate-period`. Default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1 which +disables the throttling. + +:::note + +- If neither `clusterDispatchRate` nor `topicDispatchRate` is configured, dispatch throttling is disabled. + +- If `topicDispatchRate` is not configured, `clusterDispatchRate` takes effect. + +- If `topicDispatchRate` is configured, `topicDispatchRate` takes effect. + +::: + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/dispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for topics + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/dispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getDispatchRate(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for subscription + +#### Set dispatch throttling for subscription + +It sets message dispatch rate for all the subscription of topics under a given namespace. 
+The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X second (`byte-dispatch-rate`). +dispatch rate is in second and it can be configured with `dispatch-rate-period`. Default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1 which +disables the throttling. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-subscription-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/subscriptionDispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setSubscriptionDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for subscription + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-subscription-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/subscriptionDispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getSubscriptionDispatchRate(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for replicator + +#### Set dispatch throttling for replicator + +It sets message dispatch rate for all the replicator between replication clusters under a given namespace. +The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X second (`byte-dispatch-rate`). 
+dispatch rate is in second and it can be configured with `dispatch-rate-period`. Default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1 which +disables the throttling. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-replicator-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/replicatorDispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setReplicatorDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for replicator + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-replicator-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/replicatorDispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getReplicatorDispatchRate(namespace) + +``` + + + + +```` + +### Configure deduplication snapshot interval + +#### Get deduplication snapshot interval + +It shows configured `deduplicationSnapshotInterval` for a namespace (Each topic under the namespace will take a deduplication snapshot according to this interval) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-deduplication-snapshot-interval test-tenant/ns1 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/getDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + 
+admin.namespaces().getDeduplicationSnapshotInterval(namespace) + +``` + + + + +```` + +#### Set deduplication snapshot interval + +Set configured `deduplicationSnapshotInterval` for a namespace. Each topic under the namespace will take a deduplication snapshot according to this interval. +`brokerDeduplicationEnabled` must be set to `true` for this property to take effect. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-deduplication-snapshot-interval test-tenant/ns1 --interval 1000 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/setDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setDeduplicationSnapshotInterval(namespace, 1000) + +``` + + + + +```` + +#### Remove deduplication snapshot interval + +Remove configured `deduplicationSnapshotInterval` of a namespace (Each topic under the namespace will take a deduplication snapshot according to this interval) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-deduplication-snapshot-interval test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/deleteDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + + +```java + +admin.namespaces().removeDeduplicationSnapshotInterval(namespace) + +``` + + + + +```` + +### Namespace isolation + +You can use the [Pulsar isolation policy](administration-isolation.md) to allocate resources (broker and bookie) for a namespace. + +### Unload namespaces from a broker + +You can unload a namespace, or a [namespace bundle](reference-terminology.md#namespace-bundle), from the Pulsar [broker](reference-terminology.md#broker) that is currently responsible for it. + +#### pulsar-admin + +Use the [`unload`](reference-pulsar-admin.md#unload) subcommand of the [`namespaces`](reference-pulsar-admin.md#namespaces) command. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin namespaces unload my-tenant/my-ns + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/unload|operation/unloadNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().unload(namespace) + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-non-partitioned-topics.md b/site2/website/versioned_docs/version-2.8.x/admin-api-non-partitioned-topics.md new file mode 100644 index 0000000000000..e6347bb8c363a --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-non-partitioned-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-non-partitioned-topics +title: Managing non-partitioned topics +sidebar_label: "Non-partitioned topics" +original_id: admin-api-non-partitioned-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-non-persistent-topics.md b/site2/website/versioned_docs/version-2.8.x/admin-api-non-persistent-topics.md new file mode 100644 index 0000000000000..3126a6494c715 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-non-persistent-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-non-persistent-topics +title: Managing non-persistent topics +sidebar_label: "Non-Persistent topics" +original_id: admin-api-non-persistent-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-overview.md b/site2/website/versioned_docs/version-2.8.x/admin-api-overview.md new file mode 100644 index 0000000000000..81e6587fab350 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-overview.md @@ -0,0 +1,133 @@ +--- +id: admin-api-overview +title: Pulsar admin interface +sidebar_label: "Overview" +original_id: admin-api-overview +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +The Pulsar admin interface enables you to manage all important entities in a Pulsar instance, such as tenants, topics, and namespaces. + +You can interact with the admin interface via: + +- HTTP calls, which are made against the admin {@inject: rest:REST:/} API provided by Pulsar brokers. For some RESTful APIs, they might be redirected to the owner brokers for serving with [`307 Temporary Redirect`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/307), hence the HTTP callers should handle `307 Temporary Redirect`. If you use `curl` commands, you should specify `-L` to handle redirections. +- A Java client interface. +- The `pulsar-admin` CLI tool, which is available in the `bin` folder of your Pulsar installation: + + ```shell + + $ bin/pulsar-admin + + ``` + + For complete commands of `pulsar-admin` tool, see [Pulsar admin snapshot](https://pulsar.apache.org/tools/pulsar-admin/). + + +> **The REST API is the admin interface**. Both the `pulsar-admin` CLI tool and the Java client use the REST API. If you implement your own admin interface client, you should use the REST API. + +## Admin setup + +Each of the three admin interfaces (the `pulsar-admin` CLI tool, the {@inject: rest:REST:/} API, and the [Java admin API](/api/admin)) requires some special setup if you have enabled authentication in your Pulsar instance. 
+ +````mdx-code-block + + + +If you have enabled authentication, you need to provide an auth configuration to use the `pulsar-admin` tool. By default, the configuration for the `pulsar-admin` tool is in the [`conf/client.conf`](reference-configuration.md#client) file. The following are the available parameters: + +|Name|Description|Default| +|----|-----------|-------| +|webServiceUrl|The web URL for the cluster.|http://localhost:8080/| +|brokerServiceUrl|The Pulsar protocol URL for the cluster.|pulsar://localhost:6650/| +|authPlugin|The authentication plugin.| | +|authParams|The authentication parameters for the cluster, as a comma-separated string.| | +|useTls|Whether or not TLS authentication will be enforced in the cluster.|false| +|tlsAllowInsecureConnection|Accept untrusted TLS certificate from client.|false| +|tlsTrustCertsFilePath|Path for the trusted TLS certificate file.| | + + + + +You can find details for the REST API exposed by Pulsar brokers in this {@inject: rest:document:/}. + + + + +To use the Java admin API, instantiate a {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object, and specify a URL for a Pulsar broker and a {@inject: javadoc:PulsarAdminBuilder:/admin/org/apache/pulsar/client/admin/PulsarAdminBuilder}. 
The following is a minimal example using `localhost`: + +```java + +String url = "http://localhost:8080"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); + +``` + +If you use multiple brokers, you can use multi-host like Pulsar service. For example, + +```java + +String url = "http://localhost:8080,localhost:8081,localhost:8082"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); + +``` + + + + +```` + +## How to define Pulsar resource names when running Pulsar in Kubernetes +If you run Pulsar Functions or connectors on Kubernetes, you need to follow Kubernetes naming convention to define the names of your Pulsar resources, whichever admin interface you use. + +Kubernetes requires a name that can be used as a DNS subdomain name as defined in [RFC 1123](https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names). Pulsar supports more legal characters than Kubernetes naming convention. 
If you create a Pulsar resource name with special characters that are not supported by Kubernetes (for example, including colons in a Pulsar namespace name), Kubernetes runtime translates the Pulsar object names into Kubernetes resource labels which are in RFC 1123-compliant forms. Consequently, you can run functions or connectors using Kubernetes runtime. The rules for translating Pulsar object names into Kubernetes resource labels are as below: + +- Truncate to 63 characters + +- Replace the following characters with dashes (-): + + - Non-alphanumeric characters + + - Underscores (_) + + - Dots (.) + +- Replace beginning and ending non-alphanumeric characters with 0 + +:::tip + +- If you get an error in translating Pulsar object names into Kubernetes resource labels (for example, you may have a naming collision if your Pulsar object name is too long) or want to customize the translating rules, see [customize Kubernetes runtime](https://pulsar.apache.org/docs/en/next/functions-runtime/#customize-kubernetes-runtime). +- For how to configure Kubernetes runtime, see [here](https://pulsar.apache.org/docs/en/next/functions-runtime/#configure-kubernetes-runtime). + +::: + diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-packages.md b/site2/website/versioned_docs/version-2.8.x/admin-api-packages.md new file mode 100644 index 0000000000000..70bebfcf35dfe --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-packages.md @@ -0,0 +1,381 @@ +--- +id: admin-api-packages +title: Manage packages +sidebar_label: "Packages" +original_id: admin-api-packages +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Package management enables version management and simplifies the upgrade and rollback processes for Functions, Sinks, and Sources. When you use the same function, sink and source in different namespaces, you can upload them to a common package management system. 
+ +## Package name + +A `package` is identified by five parts: `type`, `tenant`, `namespace`, `package name`, and `version`. + +| Part | Description | +|-------|-------------| +|`type` |The type of the package. The following types are supported: `function`, `sink` and `source`. | +| `name`|The fully qualified name of the package: `<tenant>/<namespace>/<package name>`.| +|`version`|The version of the package.| + +The following is a code sample. + +```java + +class PackageName { + private final PackageType type; + private final String namespace; + private final String tenant; + private final String name; + private final String version; +} + +enum PackageType { + FUNCTION("function"), SINK("sink"), SOURCE("source"); +} + +``` + +## Package URL +A package is located using a URL. The package URL is written in the following format: + +```shell + +<type>://<tenant>/<namespace>/<package name>@<version> + +``` + +The following are package URL examples: + +`sink://public/default/mysql-sink@1.0` +`function://my-tenant/my-ns/my-function@0.1` +`source://my-tenant/my-ns/mysql-cdc-source@2.3` + +The package management system stores the data, versions and metadata of each package. The metadata is shown in the following table. + +| metadata | Description | +|----------|-------------| +|description|The description of the package.| +|contact |The contact information of a package. For example, team email.| +|create_time| The time when the package is created.| +|modification_time| The time when the package is modified.| +|properties |A key/value map that stores your own information.| + +## Permissions + +The packages are organized by the tenant and namespace, so you can apply the tenant and namespace permissions to packages directly. + +## Package resources +You can use the package management with command line tools, REST API and Java client. + +### Upload a package +You can upload a package to the package management service in the following ways.
+ +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages upload function://public/default/example@v0.1 --path package-file --description package-description + +``` + + + + +{@inject: endpoint|POST|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/upload?version=@pulsar:version_number@} + + + + +Upload a package to the package management service synchronously. + +```java + + void upload(PackageMetadata metadata, String packageName, String path) throws PulsarAdminException; + +``` + +Upload a package to the package management service asynchronously. + +```java + + CompletableFuture<Void> uploadAsync(PackageMetadata metadata, String packageName, String path); + +``` + + + + +```` + +### Download a package +You can download a package from the package management service in the following ways. + +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages download function://public/default/example@v0.1 --path package-file + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/download?version=@pulsar:version_number@} + + + + +Download a package from the package management service synchronously. + +```java + + void download(String packageName, String path) throws PulsarAdminException; + +``` + +Download a package from the package management service asynchronously. + +```java + + CompletableFuture<Void> downloadAsync(String packageName, String path); + +``` + + + + +```` + +### List all versions of a package +You can get a list of all versions of a package in the following ways. +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages list-versions function://public/default/example + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName|operation/listPackageVersion?version=@pulsar:version_number@} + + + + +List all versions of a package synchronously.
+ +```java + + List<String> listPackageVersions(String packageName) throws PulsarAdminException; + +``` + +List all versions of a package asynchronously. + +```java + + CompletableFuture<List<String>> listPackageVersionsAsync(String packageName); + +``` + + + + +```` + +### List all packages of a specified type under a namespace +You can get a list of all the packages with the given type in a namespace in the following ways. +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages list --type function public/default + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace|operation/listPackages?version=@pulsar:version_number@} + + + + +List all the packages with the given type in a namespace synchronously. + +```java + + List<String> listPackages(String type, String namespace) throws PulsarAdminException; + +``` + +List all the packages with the given type in a namespace asynchronously. + +```java + + CompletableFuture<List<String>> listPackagesAsync(String type, String namespace); + +``` + + + + +```` + +### Get the metadata of a package +You can get the metadata of a package in the following ways. + +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages get-metadata function://public/default/test@v1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version/metadata|operation/getMeta?version=@pulsar:version_number@} + + + + +Get the metadata of a package synchronously. + +```java + + PackageMetadata getMetadata(String packageName) throws PulsarAdminException; + +``` + +Get the metadata of a package asynchronously. + +```java + + CompletableFuture<PackageMetadata> getMetadataAsync(String packageName); + +``` + + + + +```` + +### Update the metadata of a package +You can update the metadata of a package in the following ways.
+````mdx-code-block + + + +```shell + +bin/pulsar-admin packages update-metadata function://public/default/example@v0.1 --description update-description + +``` + + + + +{@inject: endpoint|PUT|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version/metadata|operation/updateMeta?version=@pulsar:version_number@} + + + + +Update a package metadata information synchronously. + +```java + + void updateMetadata(String packageName, PackageMetadata metadata) throws PulsarAdminException; + +``` + +Update a package metadata information asynchronously. + +```java + + CompletableFuture updateMetadataAsync(String packageName, PackageMetadata metadata); + +``` + + + + +```` + +### Delete a specified package +You can delete a specified package with its package name in the following ways. + +````mdx-code-block + + + +The following command example deletes a package of version 0.1. + +```shell + +bin/pulsar-admin packages delete function://public/default/example@v0.1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/delete?version=@pulsar:version_number@} + + + + +Delete a specified package synchronously. + +```java + + void delete(String packageName) throws PulsarAdminException; + +``` + +Delete a specified package asynchronously. + +```java + + CompletableFuture deleteAsync(String packageName); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-partitioned-topics.md b/site2/website/versioned_docs/version-2.8.x/admin-api-partitioned-topics.md new file mode 100644 index 0000000000000..5ce182282e032 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-partitioned-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-partitioned-topics +title: Managing partitioned topics +sidebar_label: "Partitioned topics" +original_id: admin-api-partitioned-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-permissions.md b/site2/website/versioned_docs/version-2.8.x/admin-api-permissions.md new file mode 100644 index 0000000000000..6897517553f2b --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-permissions.md @@ -0,0 +1,174 @@ +--- +id: admin-api-permissions +title: Managing permissions +sidebar_label: "Permissions" +original_id: admin-api-permissions +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Permissions in Pulsar are managed at the [namespace](reference-terminology.md#namespace) level +(that is, within [tenants](reference-terminology.md#tenant) and [clusters](reference-terminology.md#cluster)). + +## Grant permissions + +You can grant permissions to specific roles for lists of operations such as `produce` and `consume`. + +````mdx-code-block + + + +Use the [`grant-permission`](reference-pulsar-admin.md#grant-permission) subcommand and specify a namespace, actions using the `--actions` flag, and a role using the `--role` flag: + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role admin10 + +``` + +Wildcard authorization can be performed when `authorizationAllowWildcardsMatching` is set to `true` in `broker.conf`. + +e.g. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role 'my.role.*' + +``` + +Then, roles `my.role.1`, `my.role.2`, `my.role.foo`, `my.role.bar`, etc. can produce and consume. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role '*.role.my' + +``` + +Then, roles `1.role.my`, `2.role.my`, `foo.role.my`, `bar.role.my`, etc. can produce and consume. + +**Note**: A wildcard matching works at **the beginning or end of the role name only**. + +e.g. 
+ +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role 'my.*.role' + +``` + +In this case, only the role `my.*.role` has permissions. +Roles `my.1.role`, `my.2.role`, `my.foo.role`, `my.bar.role`, etc. **cannot** produce and consume. + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/permissions/:role|operation/grantPermissionOnNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().grantPermissionOnNamespace(namespace, role, getAuthActions(actions)); + +``` + + + + +```` + +## Get permissions + +You can see which permissions have been granted to which roles in a namespace. + +````mdx-code-block + + + +Use the [`permissions`](reference-pulsar-admin#permissions) subcommand and specify a namespace: + +```shell + +$ pulsar-admin namespaces permissions test-tenant/ns1 +{ + "admin10": [ + "produce", + "consume" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/permissions|operation/getPermissions?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPermissions(namespace); + +``` + + + + +```` + +## Revoke permissions + +You can revoke permissions from specific roles, which means that those roles will no longer have access to the specified namespace. 
+ +````mdx-code-block + + + +Use the [`revoke-permission`](reference-pulsar-admin.md#revoke-permission) subcommand and specify a namespace and a role using the `--role` flag: + +```shell + +$ pulsar-admin namespaces revoke-permission test-tenant/ns1 \ + --role admin10 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/permissions/:role|operation/revokePermissionsOnNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().revokePermissionsOnNamespace(namespace, role); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-persistent-topics.md b/site2/website/versioned_docs/version-2.8.x/admin-api-persistent-topics.md new file mode 100644 index 0000000000000..50d135b72f542 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-persistent-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-persistent-topics +title: Managing persistent topics +sidebar_label: "Persistent topics" +original_id: admin-api-persistent-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-schemas.md b/site2/website/versioned_docs/version-2.8.x/admin-api-schemas.md new file mode 100644 index 0000000000000..9ffe21f5b0f75 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-schemas.md @@ -0,0 +1,7 @@ +--- +id: admin-api-schemas +title: Managing Schemas +sidebar_label: "Schemas" +original_id: admin-api-schemas +--- + diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-tenants.md b/site2/website/versioned_docs/version-2.8.x/admin-api-tenants.md new file mode 100644 index 0000000000000..d78aa2e55f4c3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-tenants.md @@ -0,0 +1,228 @@ +--- +id: admin-api-tenants +title: Managing Tenants +sidebar_label: "Tenants" +original_id: admin-api-tenants +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Tenants, like namespaces, can be managed using the [admin API](admin-api-overview.md). There are currently two configurable aspects of tenants: + +* Admin roles +* Allowed clusters + +## Tenant resources + +### List + +You can list all of the tenants associated with an [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#tenants-list) subcommand. + +```shell + +$ pulsar-admin tenants list +my-tenant-1 +my-tenant-2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/tenants|operation/getTenants?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().getTenants(); + +``` + + + + +```` + +### Create + +You can create a new tenant. + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#tenants-create) subcommand: + +```shell + +$ pulsar-admin tenants create my-tenant + +``` + +When creating a tenant, you can assign admin roles using the `-r`/`--admin-roles` flag. You can specify multiple roles as a comma-separated list. 
Here are some examples: + +```shell + +$ pulsar-admin tenants create my-tenant \ + --admin-roles role1,role2,role3 + +$ pulsar-admin tenants create my-tenant \ + -r role1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/tenants/:tenant|operation/createTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().createTenant(tenantName, tenantInfo); + +``` + + + + +```` + +### Get configuration + +You can fetch the [configuration](reference-configuration.md) for an existing tenant at any time. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#tenants-get) subcommand and specify the name of the tenant. Here's an example: + +```shell + +$ pulsar-admin tenants get my-tenant +{ + "adminRoles": [ + "admin1", + "admin2" + ], + "allowedClusters": [ + "cl1", + "cl2" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/tenants/:tenant|operation/getTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().getTenantInfo(tenantName); + +``` + + + + +```` + +### Delete + +Tenants can be deleted from a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#tenants-delete) subcommand and specify the name of the tenant. + +```shell + +$ pulsar-admin tenants delete my-tenant + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/tenants/:tenant|operation/deleteTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().deleteTenant(tenantName); + +``` + + + + +```` + +### Update + +You can update a tenant's configuration. + +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#tenants-update) subcommand.
+ +```shell + +$ pulsar-admin tenants update my-tenant + +``` + + + + +{@inject: endpoint|POST|/admin/v2/tenants/:tenant|operation/updateTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().updateTenant(tenantName, tenantInfo); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/admin-api-topics.md b/site2/website/versioned_docs/version-2.8.x/admin-api-topics.md new file mode 100644 index 0000000000000..11553618ffaed --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/admin-api-topics.md @@ -0,0 +1,2175 @@ +--- +id: admin-api-topics +title: Manage topics +sidebar_label: "Topics" +original_id: admin-api-topics +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar has persistent and non-persistent topics. A persistent topic is a logical endpoint for publishing and consuming messages. The topic name structure for persistent topics is: + +```shell + +persistent://tenant/namespace/topic + +``` + +Non-persistent topics are used in applications that only consume real-time published messages and do not need a persistence guarantee. Because messages are not persisted, message-publish latency is reduced by removing the overhead of persisting messages. The topic name structure for non-persistent topics is: + +```shell + +non-persistent://tenant/namespace/topic + +``` + +## Manage topic resources +Whether a topic is persistent or non-persistent, you can obtain the topic resources through the `pulsar-admin` tool, the REST API, and Java. + +:::note + +In REST API, `:schema` stands for persistent or non-persistent. `:tenant`, `:namespace`, and `:x` are variables; replace them with the real tenant, namespace, and `x` names when using them. +Take {@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} as an example: to get the list of persistent topics in REST API, use `https://pulsar.apache.org/admin/v2/persistent/my-tenant/my-namespace`.
To get the list of non-persistent topics in REST API, use `https://pulsar.apache.org/admin/v2/non-persistent/my-tenant/my-namespace`. + +::: + +### List of topics + +You can get the list of topics under a given namespace in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list \ + my-tenant/my-namespace + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} + + + + +```java + +String namespace = "my-tenant/my-namespace"; +admin.topics().getList(namespace); + +``` + + + + +```` + +### Grant permission + +You can grant permissions on a client role to perform specific actions on a given topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics grant-permission \ + --actions produce,consume --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/permissions/:role|operation/grantPermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +Set actions = Sets.newHashSet(AuthAction.produce, AuthAction.consume); +admin.topics().grantPermission(topic, role, actions); + +``` + + + + +```` + +### Get permission + +You can fetch permission in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics permissions \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/permissions|operation/getPermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getPermissions(topic); + +``` + + + + +```` + +### Revoke permission + +You can revoke a permission granted on a client role in the following ways. 
+````mdx-code-block + + + +```shell + +$ pulsar-admin topics revoke-permission \ + --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic/permissions/:role|operation/revokePermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +admin.topics().revokePermissions(topic, role); + +``` + + + + +```` + +### Delete topic + +You can delete a topic in the following ways. You cannot delete a topic if any active subscriptions or producers are connected to the topic. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics delete \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic|operation/deleteTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().delete(topic); + +``` + + + + +```` + +### Unload topic + +You can unload a topic in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics unload \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/unload|operation/unloadTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().unload(topic); + +``` + + + + +```` + +### Get stats + +You can check the following statistics of a given non-partitioned topic. + + - **msgRateIn**: The sum of all local and replication publishers' publish rates (msg/s). + + - **msgThroughputIn**: The sum of all local and replication publishers' publish rates (bytes/s). + + - **msgRateOut**: The sum of all local and replication consumers' dispatch rates (msg/s).
+ + - **msgThroughputOut**: The sum of all local and replication consumers' dispatch rates (bytes/s). + + - **averageMsgSize**: The average size (in bytes) of messages published within the last interval. + + - **storageSize**: The sum of the ledgers' storage size for this topic. The space used to store the messages for the topic. + + - **publishers**: The list of all local publishers into the topic. The list ranges from zero to thousands. + + - **msgRateIn**: The total rate of messages (msg/s) published by this publisher. + + - **msgThroughputIn**: The total throughput (bytes/s) of the messages published by this publisher. + + - **averageMsgSize**: The average message size in bytes from this publisher within the last interval. + + - **producerId**: The internal identifier for this producer on this topic. + + - **producerName**: The internal identifier for this producer, generated by the client library. + + - **address**: The IP address and source port for the connection of this producer. + + - **connectedSince**: The timestamp when this producer is created or reconnected last time. + + - **subscriptions**: The list of all local subscriptions to the topic. + + - **my-subscription**: The name of this subscription. It is defined by the client. + + - **msgRateOut**: The total rate of messages (msg/s) delivered on this subscription. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered on this subscription. + + - **msgBacklog**: The number of messages in the subscription backlog. + + - **type**: The subscription type. + + - **msgRateExpired**: The rate at which messages were discarded instead of dispatched from this subscription due to TTL. + + - **lastExpireTimestamp**: The timestamp of the last message expire execution. + + - **lastConsumedFlowTimestamp**: The timestamp of the last flow command received. + + - **lastConsumedTimestamp**: The latest timestamp of all the consumed timestamp of the consumers. 
+ + - **lastAckedTimestamp**: The latest timestamp of all the acked timestamp of the consumers. + + - **consumers**: The list of connected consumers for this subscription. + + - **msgRateOut**: The total rate of messages (msg/s) delivered to the consumer. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered to the consumer. + + - **consumerName**: The internal identifier for this consumer, generated by the client library. + + - **availablePermits**: The number of messages that the consumer has space for in the client library's listen queue. `0` means the client library's queue is full and `receive()` isn't being called. A non-zero value means this consumer is ready for dispatched messages. + + - **unackedMessages**: The number of unacknowledged messages for the consumer, where an unacknowledged message is one that has been sent to the consumer but not yet acknowledged. This field is only meaningful when using a subscription that tracks individual message acknowledgement. + + - **blockedConsumerOnUnackedMsgs**: The flag used to verify if the consumer is blocked due to reaching threshold of the unacknowledged messages. + + - **lastConsumedTimestamp**: The timestamp when the consumer reads a message the last time. + + - **lastAckedTimestamp**: The timestamp when the consumer acknowledges a message the last time. + + - **replication**: This section gives the stats for cross-colo replication of this topic + + - **msgRateIn**: The total rate (msg/s) of messages received from the remote cluster. + + - **msgThroughputIn**: The total throughput (bytes/s) received from the remote cluster. + + - **msgRateOut**: The total rate of messages (msg/s) delivered to the replication-subscriber. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered to the replication-subscriber. + + - **msgRateExpired**: The total rate of messages (msg/s) expired. + + - **replicationBacklog**: The number of messages pending to be replicated to remote cluster. 
+ + - **connected**: Whether the outbound replicator is connected. + + - **replicationDelayInSeconds**: How long the oldest message has been waiting to be sent through the connection, if connected is `true`. + + - **inboundConnection**: The IP and port of the broker in the remote cluster's publisher connection to this broker. + + - **inboundConnectedSince**: The TCP connection being used to publish messages to the remote cluster. If there are no local publishers connected, this connection is automatically closed after a minute. + + - **outboundConnection**: The address of the outbound replication connection. + + - **outboundConnectedSince**: The timestamp of establishing outbound connection. + +The following is an example of a topic status. + +```json + +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers": [] + } + }, + "replication": {} +} + +``` + +To get the status of a topic, you can use the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/stats|operation/getStats?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getStats(topic); + +``` + + + + +```` + +### Get internal stats + +You can get the detailed statistics of a topic. 
+ + - **entriesAddedCounter**: Messages published since this broker loaded this topic. + + - **numberOfEntries**: The total number of messages being tracked. + + - **totalSize**: The total storage size in bytes of all messages. + + - **currentLedgerEntries**: The count of messages written to the ledger that is currently open for writing. + + - **currentLedgerSize**: The size in bytes of messages written to the ledger that is currently open for writing. + + - **lastLedgerCreatedTimestamp**: The time when the last ledger is created. + + - **lastLedgerCreationFailureTimestamp**: The time when the last ledger creation failed. + + - **waitingCursorsCount**: The number of cursors that are "caught up" and waiting for a new message to be published. + + - **pendingAddEntriesCount**: The number of messages whose (asynchronous) write requests are waiting for completion. + + - **lastConfirmedEntry**: The ledgerid:entryid of the last message that is written successfully. If the entryid is `-1`, then the ledger is open, yet no entries are written. + + - **state**: The state of this ledger for writing. The state `LedgerOpened` means that a ledger is open for saving published messages. + + - **ledgers**: The ordered list of all ledgers for this topic holding messages. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. + + - **metadata**: The ledger metadata. + + - **schemaLedgers**: The ordered list of all ledgers for this topic schema. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. + + - **metadata**: The ledger metadata. + + - **compactedLedger**: The ledgers holding un-acked messages after topic compaction.
+ + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. The value is `false` for the compacted topic ledger. + + - **cursors**: The list of all cursors on this topic. Each subscription in the topic stats has a cursor. + + - **markDeletePosition**: All messages before the markDeletePosition are acknowledged by the subscriber. + + - **readPosition**: The latest position from which the subscriber reads messages. + + - **waitingReadOp**: This is true when the subscription has read the latest message published to the topic and is waiting for new messages to be published. + + - **pendingReadOps**: The number of outstanding read requests to BookKeeper that are in progress. + + - **messagesConsumedCounter**: The number of messages this cursor has acked since this broker loaded this topic. + + - **cursorLedger**: The ledger being used to persistently store the current markDeletePosition. + + - **cursorLedgerLastEntry**: The last entryid used to persistently store the current markDeletePosition. + + - **individuallyDeletedMessages**: If acknowledgements are done out of order, this shows the ranges of messages acknowledged between the markDeletePosition and the read-position. + + - **lastLedgerSwitchTimestamp**: The last time the cursor ledger is rolled over. + + - **state**: The state of the cursor ledger: `Open` means you have a cursor ledger for saving updates of the markDeletePosition. + +The following is an example of the detailed statistics of a topic. 
+ +```json + +{ + "entriesAddedCounter":0, + "numberOfEntries":0, + "totalSize":0, + "currentLedgerEntries":0, + "currentLedgerSize":0, + "lastLedgerCreatedTimestamp":"2021-01-22T21:12:14.868+08:00", + "lastLedgerCreationFailureTimestamp":null, + "waitingCursorsCount":0, + "pendingAddEntriesCount":0, + "lastConfirmedEntry":"3:-1", + "state":"LedgerOpened", + "ledgers":[ + { + "ledgerId":3, + "entries":0, + "size":0, + "offloaded":false, + "metadata":null + } + ], + "cursors":{ + "test":{ + "markDeletePosition":"3:-1", + "readPosition":"3:-1", + "waitingReadOp":false, + "pendingReadOps":0, + "messagesConsumedCounter":0, + "cursorLedger":4, + "cursorLedgerLastEntry":1, + "individuallyDeletedMessages":"[]", + "lastLedgerSwitchTimestamp":"2021-01-22T21:12:14.966+08:00", + "state":"Open", + "numberOfEntriesSinceFirstNotAckedMessage":0, + "totalNonContiguousDeletedMessagesRange":0, + "properties":{ + + } + } + }, + "schemaLedgers":[ + { + "ledgerId":1, + "entries":11, + "size":10, + "offloaded":false, + "metadata":null + } + ], + "compactedLedger":{ + "ledgerId":-1, + "entries":-1, + "size":-1, + "offloaded":false, + "metadata":null + } +} + +``` + +To get the internal status of a topic, you can use the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/internalStats|operation/getInternalStats?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getInternalStats(topic); + +``` + + + + +```` + +### Peek messages + +You can peek a number of messages for a specific subscription of a given topic in the following ways. 
+````mdx-code-block + + + +```shell + +$ pulsar-admin topics peek-messages \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +Message ID: 315674752:0 +Properties: { "X-Pulsar-publish-time" : "2015-07-13 17:40:28.451" } +msg-payload + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/position/:messagePosition|operation/peekNthMessage?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.topics().peekMessages(topic, subName, numMessages); + +``` + + + + +```` + +### Get message by ID + +You can fetch the message with the given ledger ID and entry ID in the following ways. + +````mdx-code-block + + + +```shell + +$ ./bin/pulsar-admin topics get-message-by-id \ + persistent://public/default/my-topic \ + -l 10 -e 0 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/ledger/:ledgerId/entry/:entryId|operation/getMessageById?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +long ledgerId = 10; +long entryId = 10; +admin.topics().getMessageById(topic, ledgerId, entryId); + +``` + + + + +```` + +### Examine messages +You can examine a specific message on a topic by position relative to the earliest or the latest message. + +:::note + +This REST API is only available in 2.8.1 and later versions. 
+ +::: + +````mdx-code-block + + + +```shell + +$ ./bin/pulsar-admin topics examine-messages \ + persistent://public/default/my-topic \ + -i latest -m 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic|operation/examineMessage?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().examineMessage(topic, "latest", 1); + +``` + + + + +```` + +### Skip messages + +You can skip a number of messages for a specific subscription of a given topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics skip \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/skip/:numMessages|operation/skipMessages?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.topics().skipMessages(topic, subName, numMessages); + +``` + + + + +```` + +### Skip all messages + +You can skip all the old messages for a specific subscription of a given topic. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics skip-all \ + --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/skip_all|operation/skipAllMessages?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +admin.topics().skipAllMessages(topic, subName); + +``` + + + + +```` + +### Reset cursor + +You can reset a subscription cursor position back to the position which is recorded X minutes before. It essentially calculates time and position of cursor at X minutes before and resets it at that position. 
You can reset the cursor in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics reset-cursor \ + --subscription my-subscription --time 10 \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/resetcursor/:timestamp|operation/resetCursor?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +long timestamp = 2342343L; +admin.topics().resetCursor(topic, subName, timestamp); + +``` + + + + +```` + +### Look up topic's owner broker + +You can locate the owner broker of the given topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics lookup \ + persistent://test-tenant/ns1/tp1 \ + + "pulsar://broker1.org.com:4480" + +``` + + + + +{@inject: endpoint|GET|/lookup/v2/topic/:topic-domain/:tenant/:namespace/:topic|operation/lookupTopicAsync?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().lookupDestination(topic); + +``` + + + + +```` + +### Get bundle + +You can get the range of the bundle that the given topic belongs to in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics bundle-range \ + persistent://test-tenant/ns1/tp1 \ + + "0x00000000_0xffffffff" + +``` + + + + +{@inject: endpoint|GET|/lookup/v2/topic/:topic_domain/:tenant/:namespace/:topic/bundle|operation/getNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().getBundleRange(topic); + +``` + + + + +```` + +### Get subscriptions + +You can check all subscription names for a given topic in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics subscriptions \ + persistent://test-tenant/ns1/tp1 \ + + my-subscription + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscriptions|operation/getSubscriptions?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getSubscriptions(topic); + +``` + + + + +```` + +### Last Message Id + +You can get the last committed message ID for a persistent topic. It is available since 2.3.0 release. + +````mdx-code-block + + + +```shell + +pulsar-admin topics last-message-id topic-name + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/lastMessageId|operation/getLastMessageId?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getLastMessageId(topic); + +``` + + + + +```` + + +### Configure deduplication snapshot interval + +#### Get deduplication snapshot interval + +To get the topic-level deduplication snapshot interval, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics get-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/getDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getDeduplicationSnapshotInterval(topic) + +``` + + + + +```` + +#### Set deduplication snapshot interval + +To set the topic-level deduplication snapshot interval, use one of the following methods. + +> **Prerequisite** `brokerDeduplicationEnabled` must be set to `true`. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics set-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/setDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setDeduplicationSnapshotInterval(topic, 1000) + +``` + + + + +```` + +#### Remove deduplication snapshot interval + +To remove the topic-level deduplication snapshot interval, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics remove-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/deleteDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeDeduplicationSnapshotInterval(topic) + +``` + + + + +```` + + +### Configure inactive topic policies + +#### Get inactive topic policies + +To get the topic-level inactive topic policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics get-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/getInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getInactiveTopicPolicies(topic) + +``` + + + + +```` + +#### Set inactive topic policies + +To set the topic-level inactive topic policies, use one of the following methods. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics set-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/setInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setInactiveTopicPolicies(topic, inactiveTopicPolicies) + +``` + + + + +```` + +#### Remove inactive topic policies + +To remove the topic-level inactive topic policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics remove-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/removeInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeInactiveTopicPolicies(topic) + +``` + + + + +```` + + +### Configure offload policies + +#### Get offload policies + +To get the topic-level offload policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics get-offload-policies options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/getOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getOffloadPolicies(topic) + +``` + + + + +```` + +#### Set offload policies + +To set the topic-level offload policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics set-offload-policies options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/setOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setOffloadPolicies(topic, offloadPolicies) + +``` + + + + +```` + +#### Remove offload policies + +To remove the topic-level offload policies, use one of the following methods. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics remove-offload-policies options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/removeOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeOffloadPolicies(topic) + +``` + + + + +```` + + +## Manage non-partitioned topics +You can use Pulsar [admin API](admin-api-overview.md) to create, delete and check status of non-partitioned topics. + +### Create +Non-partitioned topics must be explicitly created. When creating a new non-partitioned topic, you need to provide a name for the topic. + +By default, 60 seconds after creation, topics are considered inactive and deleted automatically to avoid generating trash data. To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to a specific value. + +For more information about the two parameters, see [here](reference-configuration.md#broker). + +You can create non-partitioned topics in the following ways. +````mdx-code-block + + + +When you create non-partitioned topics with the [`create`](reference-pulsar-admin.md#create-3) command, you need to specify the topic name as an argument. + +```shell + +$ bin/pulsar-admin topics create \ + persistent://my-tenant/my-namespace/my-topic + +``` + +:::note + +When you create a non-partitioned topic with the suffix '-partition-' followed by numeric value like 'xyz-topic-partition-x' for the topic name, if a partitioned topic with same suffix 'xyz-topic-partition-y' exists, then the numeric value(x) for the non-partitioned topic must be larger than the number of partitions(y) of the partitioned topic. Otherwise, you cannot create such a non-partitioned topic. 
+ +::: + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic|operation/createNonPartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createNonPartitionedTopic(topicName); + +``` + + + + +```` + +### Delete +You can delete non-partitioned topics in the following ways. +````mdx-code-block + + + +```shell + +$ bin/pulsar-admin topics delete \ + persistent://my-tenant/my-namespace/my-topic + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic|operation/deleteTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().delete(topic); + +``` + + + + +```` + +### List + +You can get the list of topics under a given namespace in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getList(namespace); + +``` + + + + +```` + +### Stats + +You can check the current statistics of a given topic. The following is an example. For description of each stats, refer to [get stats](#get-stats). 
+ +```json + +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers": [] + } + }, + "replication": {} +} + +``` + +You can check the current statistics of a given topic and its connected producers and consumers in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats \ + persistent://test-tenant/namespace/topic \ + --get-precise-backlog + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/stats|operation/getStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getStats(topic, false /* is precise backlog */); + +``` + + + + +```` + +## Manage partitioned topics +You can use Pulsar [admin API](admin-api-overview.md) to create, update, delete and check status of partitioned topics. + +### Create + +Partitioned topics must be explicitly created. When creating a new partitioned topic, you need to provide a name and the number of partitions for the topic. + +By default, 60 seconds after creation, topics are considered inactive and deleted automatically to avoid generating trash data. To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to a specific value. + +For more information about the two parameters, see [here](reference-configuration.md#broker). + +You can create partitioned topics in the following ways. 
+````mdx-code-block + + + +When you create partitioned topics with the [`create-partitioned-topic`](reference-pulsar-admin.md#create-partitioned-topic) +command, you need to specify the topic name as an argument and the number of partitions using the `-p` or `--partitions` flag. + +```shell + +$ bin/pulsar-admin topics create-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 4 + +``` + +:::note + +If a non-partitioned topic with the suffix '-partition-' followed by a numeric value, like 'xyz-topic-partition-10', already exists, you cannot create a partitioned topic with the name 'xyz-topic', because the partitions of the partitioned topic could override the existing non-partitioned topic. To create such a partitioned topic, you have to delete that non-partitioned topic first. + +::: + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/partitions|operation/createPartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +int numPartitions = 4; +admin.topics().createPartitionedTopic(topicName, numPartitions); + +``` + + + + +```` + +### Create missed partitions + +When topic auto-creation is disabled, and you have a partitioned topic without any partitions, you can use the [`create-missed-partitions`](reference-pulsar-admin.md#create-missed-partitions) command to create partitions for the topic. + +````mdx-code-block + + + +You can create missed partitions with the [`create-missed-partitions`](reference-pulsar-admin.md#create-missed-partitions) command and specify the topic name as an argument. 
+ +```shell + +$ bin/pulsar-admin topics create-missed-partitions \ + persistent://my-tenant/my-namespace/my-topic \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic|operation/createMissedPartitions?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createMissedPartitions(topicName); + +``` + + + + +```` + +### Get metadata + +Partitioned topics are associated with metadata, you can view it as a JSON object. The following metadata field is available. + +Field | Description +:-----|:------- +`partitions` | The number of partitions into which the topic is divided. + +````mdx-code-block + + + +You can check the number of partitions in a partitioned topic with the [`get-partitioned-topic-metadata`](reference-pulsar-admin.md#get-partitioned-topic-metadata) subcommand. + +```shell + +$ pulsar-admin topics get-partitioned-topic-metadata \ + persistent://my-tenant/my-namespace/my-topic +{ + "partitions": 4 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/partitions|operation/getPartitionedMetadata?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getPartitionedTopicMetadata(topicName); + +``` + + + + +```` + +### Update + +You can update the number of partitions for an existing partitioned topic *if* the topic is non-global. However, you can only add the partition number. Decrementing the number of partitions would delete the topic, which is not supported in Pulsar. + +Producers and consumers can find the newly created partitions automatically. + +````mdx-code-block + + + +You can update partitioned topics with the [`update-partitioned-topic`](reference-pulsar-admin.md#update-partitioned-topic) command. 
+ +```shell + +$ pulsar-admin topics update-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 8 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:cluster/:namespace/:destination/partitions|operation/updatePartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().updatePartitionedTopic(topic, numPartitions); + +``` + + + + +```` + +### Delete +You can delete partitioned topics with the [`delete-partitioned-topic`](reference-pulsar-admin.md#delete-partitioned-topic) command, REST API and Java. + +````mdx-code-block + + + +```shell + +$ bin/pulsar-admin topics delete-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:topic/:namespace/:destination/partitions|operation/deletePartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().delete(topic); + +``` + + + + +```` + +### List +You can get the list of partitioned topics under a given namespace in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list-partitioned-topics tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getPartitionedTopicList?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getPartitionedTopicList(namespace); + +``` + + + + +```` + +### Stats + +You can check the current statistics of a given partitioned topic. The following is an example. For description of each stats, refer to [get stats](#get-stats). + +Note that in the subscription JSON object, `chuckedMessageRate` is deprecated. Please use `chunkedMessageRate`. Both will be sent in the JSON for now. 
+ +```json + +{ + "msgRateIn" : 999.992947159793, + "msgThroughputIn" : 1070918.4635439808, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesInCounter" : 270318763, + "msgInCounter" : 252489, + "bytesOutCounter" : 0, + "msgOutCounter" : 0, + "averageMsgSize" : 1070.926056966454, + "msgChunkPublished" : false, + "storageSize" : 270316646, + "backlogSize" : 200921133, + "publishers" : [ { + "msgRateIn" : 999.992947159793, + "msgThroughputIn" : 1070918.4635439808, + "averageMsgSize" : 1070.3333333333333, + "chunkedMessageRate" : 0.0, + "producerId" : 0 + } ], + "subscriptions" : { + "test" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesOutCounter" : 0, + "msgOutCounter" : 0, + "msgRateRedeliver" : 0.0, + "chuckedMessageRate" : 0, + "chunkedMessageRate" : 0, + "msgBacklog" : 144318, + "msgBacklogNoDelayed" : 144318, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "msgRateExpired" : 0.0, + "lastExpireTimestamp" : 0, + "lastConsumedFlowTimestamp" : 0, + "lastConsumedTimestamp" : 0, + "lastAckedTimestamp" : 0, + "consumers" : [ ], + "isDurable" : true, + "isReplicated" : false + } + }, + "replication" : { }, + "metadata" : { + "partitions" : 3 + }, + "partitions" : { } +} + +``` + +You can check the current statistics of a given partitioned topic and its connected producers and consumers in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics partitioned-stats \ + persistent://test-tenant/namespace/topic \ + --per-partition + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/partitioned-stats|operation/getPartitionedStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getPartitionedStats(topic, true /* per partition */, false /* is precise backlog */); + +``` + + + + +```` + +### Internal stats + +You can check the detailed statistics of a topic. The following is an example. 
For description of each stats, refer to [get internal stats](#get-internal-stats). + +```json + +{ + "entriesAddedCounter": 20449518, + "numberOfEntries": 3233, + "totalSize": 331482, + "currentLedgerEntries": 3233, + "currentLedgerSize": 331482, + "lastLedgerCreatedTimestamp": "2016-06-29 03:00:23.825", + "lastLedgerCreationFailureTimestamp": null, + "waitingCursorsCount": 1, + "pendingAddEntriesCount": 0, + "lastConfirmedEntry": "324711539:3232", + "state": "LedgerOpened", + "ledgers": [ + { + "ledgerId": 324711539, + "entries": 0, + "size": 0 + } + ], + "cursors": { + "my-subscription": { + "markDeletePosition": "324711539:3133", + "readPosition": "324711539:3233", + "waitingReadOp": true, + "pendingReadOps": 0, + "messagesConsumedCounter": 20449501, + "cursorLedger": 324702104, + "cursorLedgerLastEntry": 21, + "individuallyDeletedMessages": "[(324711539:3134‥324711539:3136], (324711539:3137‥324711539:3140], ]", + "lastLedgerSwitchTimestamp": "2016-06-29 01:30:19.313", + "state": "Open" + } + } +} + +``` + +You can get the internal stats for the partitioned topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/namespace/topic + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/internalStats|operation/getInternalStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getInternalStats(topic); + +``` + + + + +```` + +## Publish to partitioned topics + +By default, Pulsar topics are served by a single broker, which limits the maximum throughput of a topic. *Partitioned topics* can span multiple brokers and thus allow for higher throughput. + +You can publish to partitioned topics using Pulsar client libraries. When publishing to partitioned topics, you must specify a routing mode. If you do not specify any routing mode when you create a new producer, the round robin routing mode is used. 
+ +### Routing mode + +You can specify the routing mode in the ProducerConfiguration object that you use to configure your producer. The routing mode determines which partition (internal topic) each message is published to. + +The following {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} options are available. + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer publishes messages across all partitions in round-robin policy to achieve the maximum throughput. Round-robin is not done per individual message; it is set to the same boundary as the batching delay to ensure that batching is effective. If a key is specified on the message, the partitioned producer hashes the key and assigns the message to a particular partition. This is the default mode. +`SinglePartition` | If no key is provided, the producer picks a single partition randomly and publishes all messages into that partition. If a key is specified on the message, the partitioned producer hashes the key and assigns the message to a particular partition. +`CustomPartition` | Use a custom message router implementation that is called to determine the partition for a particular message. You can create a custom routing mode by using the Java client and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface.
+ +The following is an example: + +```java + +String pulsarBrokerRootUrl = "pulsar://localhost:6650"; +String topic = "persistent://my-tenant/my-namespace/my-topic"; + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl(pulsarBrokerRootUrl).build(); +Producer producer = pulsarClient.newProducer() + .topic(topic) + .messageRoutingMode(MessageRoutingMode.SinglePartition) + .create(); +producer.send("Partitioned topic message".getBytes()); + +``` + +### Custom message router + +To use a custom message router, you need to provide an implementation of the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface, which has just one `choosePartition` method: + +```java + +public interface MessageRouter extends Serializable { + int choosePartition(Message msg); +} + +``` + +The following router routes every message to partition 10: + +```java + +public class AlwaysTenRouter implements MessageRouter { + public int choosePartition(Message msg) { + return 10; + } +} + +``` + +With that implementation, you can send + +```java + +String pulsarBrokerRootUrl = "pulsar://localhost:6650"; +String topic = "persistent://my-tenant/my-cluster-my-namespace/my-topic"; + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl(pulsarBrokerRootUrl).build(); +Producer producer = pulsarClient.newProducer() + .topic(topic) + .messageRouter(new AlwaysTenRouter()) + .create(); +producer.send("Partitioned topic message".getBytes()); + +``` + +### How to choose partitions when using a key +If a message has a key, it supersedes the round robin routing policy. The following example illustrates how to choose the partition when using a key. + +```java + +// If the message has a key, it supersedes the round robin routing policy + if (msg.hasKey()) { + return signSafeMod(hash.makeHash(msg.getKey()), topicMetadata.numPartitions()); + } + + if (isBatchingEnabled) { // if batching is enabled, choose partition on `partitionSwitchMs` boundary. 
+ long currentMs = clock.millis(); + return signSafeMod(currentMs / partitionSwitchMs + startPtnIdx, topicMetadata.numPartitions()); + } else { + return signSafeMod(PARTITION_INDEX_UPDATER.getAndIncrement(this), topicMetadata.numPartitions()); + } + +``` + +## Manage subscriptions +You can use [Pulsar admin API](admin-api-overview.md) to create, check, and delete subscriptions. +### Create subscription +You can create a subscription for a topic using one of the following methods. +````mdx-code-block + + + +```shell + +pulsar-admin topics create-subscription \ +--subscription my-subscription \ +persistent://test-tenant/ns1/tp1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/persistent/:tenant/:namespace/:topic/subscription/:subscription|operation/createSubscriptions?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subscriptionName = "my-subscription"; +admin.topics().createSubscription(topic, subscriptionName, MessageId.latest); + +``` + + + + +```` +### Get subscription +You can check all subscription names for a given topic using one of the following methods. +````mdx-code-block + + + +```shell + +pulsar-admin topics subscriptions \ +persistent://test-tenant/ns1/tp1 \ +my-subscription + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscriptions|operation/getSubscriptions?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getSubscriptions(topic); + +``` + + + + +```` +### Unsubscribe subscription +When a subscription does not process messages any more, you can unsubscribe it using one of the following methods. 
+````mdx-code-block + + + +```shell + +pulsar-admin topics unsubscribe \ +--subscription my-subscription \ +persistent://test-tenant/ns1/tp1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/:topic/subscription/:subscription|operation/deleteSubscription?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subscriptionName = "my-subscription"; +admin.topics().deleteSubscription(topic, subscriptionName); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/administration-dashboard.md b/site2/website/versioned_docs/version-2.8.x/administration-dashboard.md new file mode 100644 index 0000000000000..25f976609b40b --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-dashboard.md @@ -0,0 +1,76 @@ +--- +id: administration-dashboard +title: Pulsar dashboard +sidebar_label: "Dashboard" +original_id: administration-dashboard +--- + +:::note + +Pulsar dashboard is deprecated. If you want to manage and monitor the stats of your topics, use [Pulsar Manager](administration-pulsar-manager.md). + +::: + +Pulsar dashboard is a web application that enables users to monitor current stats for all [topics](reference-terminology.md#topic) in tabular form. + +The dashboard is a data collector that polls stats from all the brokers in a Pulsar instance (across multiple clusters) and stores all the information in a [PostgreSQL](https://www.postgresql.org/) database. + +You can use the [Django](https://www.djangoproject.com) web app to render the collected data. + +## Install + +The easiest way to use the dashboard is to run it inside a [Docker](https://www.docker.com/products/docker) container. 
+ +```shell + +$ SERVICE_URL=http://broker.example.com:8080/ +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + apachepulsar/pulsar-dashboard:@pulsar:version@ + +``` + +You can find the {@inject: github:Dockerfile:/dashboard/Dockerfile} in the `dashboard` directory and build an image from scratch as well: + +```shell + +$ docker build -t apachepulsar/pulsar-dashboard dashboard + +``` + +If token authentication is enabled: +> Provided token should have super-user access. + +```shell + +$ SERVICE_URL=http://broker.example.com:8080/ +$ JWT_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + -e JWT_TOKEN=$JWT_TOKEN \ + apachepulsar/pulsar-dashboard + +``` + + +You need to specify only one service URL for a Pulsar cluster. Internally, the collector figures out all the existing clusters and the brokers from where it needs to pull the metrics. If you connect the dashboard to Pulsar running in standalone mode, the URL is `http://<broker-ip>:8080` by default. `<broker-ip>` is the IP address or hostname of the machine running Pulsar standalone. The IP address or hostname should be accessible from the docker instance running dashboard. + +Once the Docker container runs, the web dashboard is accessible via `localhost` or whichever host that Docker uses. + +> The `SERVICE_URL` that the dashboard uses needs to be reachable from inside the Docker container + +If the Pulsar service runs in standalone mode in `localhost`, the `SERVICE_URL` has to +be the IP of the machine. + +Similarly, given the Pulsar standalone advertises itself with localhost by default, you need to +explicitly set the advertise address to the host IP.
For example: + +```shell + +$ bin/pulsar standalone --advertised-address 1.2.3.4 + +``` + +### Known issues + +Currently, only Pulsar Token [authentication](security-overview.md#authentication-providers) is supported. diff --git a/site2/website/versioned_docs/version-2.8.x/administration-geo.md b/site2/website/versioned_docs/version-2.8.x/administration-geo.md new file mode 100644 index 0000000000000..29edeac4811ac --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-geo.md @@ -0,0 +1,215 @@ +--- +id: administration-geo +title: Pulsar geo-replication +sidebar_label: "Geo-replication" +original_id: administration-geo +--- + +*Geo-replication* is the replication of persistently stored message data across multiple clusters of a Pulsar instance. + +## How geo-replication works + +The diagram below illustrates the process of geo-replication across Pulsar clusters: + +![Replication Diagram](/assets/geo-replication.png) + +In this diagram, whenever **P1**, **P2**, and **P3** producers publish messages to the **T1** topic on **Cluster-A**, **Cluster-B**, and **Cluster-C** clusters respectively, those messages are instantly replicated across clusters. Once the messages are replicated, **C1** and **C2** consumers can consume those messages from their respective clusters. + +Without geo-replication, **C1** and **C2** consumers are not able to consume messages that **P3** producer publishes. + +## Geo-replication and Pulsar properties + +You must enable geo-replication on a per-tenant basis in Pulsar. You can enable geo-replication between clusters only when a tenant is created that allows access to both clusters. + +Although geo-replication must be enabled between two clusters, actually geo-replication is managed at the namespace level. 
You must complete the following tasks to enable geo-replication for a namespace: + +* [Enable geo-replication namespaces](#enable-geo-replication-namespaces) +* Configure that namespace to replicate across two or more provisioned clusters + +Any message published on *any* topic in that namespace is replicated to all clusters in the specified set. + +## Local persistence and forwarding + +When messages are produced on a Pulsar topic, messages are first persisted in the local cluster, and then forwarded asynchronously to the remote clusters. + +In normal cases, when there are no connectivity issues, messages are replicated immediately, at the same time as they are dispatched to local consumers. Typically, the network [round-trip time](https://en.wikipedia.org/wiki/Round-trip_delay_time) (RTT) between the remote regions defines end-to-end delivery latency. + +Applications can create producers and consumers in any of the clusters, even when the remote clusters are not reachable (like during a network partition). + +Producers and consumers can publish messages to and consume messages from any cluster in a Pulsar instance. However, subscriptions can not only be local to the cluster where the subscriptions are created but can also be transferred between clusters after replicated subscription is enabled. Once replicated subscription is enabled, you can keep subscription state in sync. Therefore, a topic can be asynchronously replicated across multiple geographical regions. In case of failover, a consumer can restart consuming messages from the failure point in a different cluster. + +In the aforementioned example, the **T1** topic is replicated among three clusters, **Cluster-A**, **Cluster-B**, and **Cluster-C**. + +All messages produced in any of the three clusters are delivered to all subscriptions in other clusters. In this case, **C1** and **C2** consumers receive all messages that **P1**, **P2**, and **P3** producers publish.
Ordering is still guaranteed on a per-producer basis. + +## Configure replication + +As stated in [Geo-replication and Pulsar properties](#geo-replication-and-pulsar-properties) section, geo-replication in Pulsar is managed at the [tenant](reference-terminology.md#tenant) level. + +The following example connects three clusters: **us-east**, **us-west**, and **us-cent**. + +### Connect replication clusters + +To replicate data among clusters, you need to configure each cluster to connect to the other. You can use the [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) tool to create a connection. + +**Example** + +Suppose that you have 3 replication clusters: `us-west`, `us-cent`, and `us-east`. + +1. Configure the connection from `us-west` to `us-east`. + + Run the following command on `us-west`. + + ```shell + + $ bin/pulsar-admin clusters create \ + --broker-url pulsar://: \ + --url http://: \ + us-east + + ``` + + :::tip + + - If you want to use a secure connection for a cluster, you can use the flags `--broker-url-secure` and `--url-secure`. For more information, see [pulsar-admin clusters create](https://pulsar.apache.org/tools/pulsar-admin/). + - Different clusters may have different authentications. You can use the authentication flag `--auth-plugin` and `--auth-parameters` together to set cluster authentication, which overrides `brokerClientAuthenticationPlugin` and `brokerClientAuthenticationParameters` if `authenticationEnabled` sets to `true` in `broker.conf` and `standalone.conf`. For more information, see [authentication and authorization](concepts-authentication.md). + + ::: + +2. Configure the connection from `us-west` to `us-cent`. + + Run the following command on `us-west`. + + ```shell + + $ bin/pulsar-admin clusters create \ + --broker-url pulsar://: \ + --url http://: \ + us-cent + + ``` + +3. Run similar commands on `us-east` and `us-cent` to create connections among clusters. 
+ +### Grant permissions to properties + +To replicate to a cluster, the tenant needs permission to use that cluster. You can grant permission to the tenant when you create the tenant or grant it later. + +Specify all the intended clusters when you create a tenant: + +```shell + +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east,us-cent + +``` + +To update permissions of an existing tenant, use `update` instead of `create`. + +### Enable geo-replication namespaces + +You can create a namespace with the following command sample. + +```shell + +$ bin/pulsar-admin namespaces create my-tenant/my-namespace + +``` + +Initially, the namespace is not assigned to any cluster. You can assign the namespace to clusters using the `set-clusters` subcommand: + +```shell + +$ bin/pulsar-admin namespaces set-clusters my-tenant/my-namespace \ + --clusters us-west,us-east,us-cent + +``` + +You can change the replication clusters for a namespace at any time, without disruption to ongoing traffic. Replication channels are immediately set up or stopped in all clusters as soon as the configuration changes. + +### Use topics with geo-replication + +Once you create a geo-replication namespace, any topics that producers or consumers create within that namespace are replicated across clusters. Typically, each application uses the `serviceUrl` for the local cluster. + +#### Selective replication + +By default, messages are replicated to all clusters configured for the namespace. You can restrict replication selectively by specifying a replication list for a message, and then that message is replicated only to the subset in the replication list. + +The following is an example for the [Java API](client-libraries-java.md).
Note the use of the `setReplicationClusters` method when you construct the {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} object: + +```java + +List restrictReplicationTo = Arrays.asList( + "us-west", + "us-east" +); + +Producer producer = client.newProducer() + .topic("some-topic") + .create(); + +producer.newMessage() + .value("my-payload".getBytes()) + .setReplicationClusters(restrictReplicationTo) + .send(); + +``` + +#### Topic stats + +Topic-specific statistics for geo-replication topics are available via the [`pulsar-admin`](reference-pulsar-admin.md) tool and {@inject: rest:REST:/} API: + +```shell + +$ bin/pulsar-admin persistent stats persistent://my-tenant/my-namespace/my-topic + +``` + +Each cluster reports its own local stats, including the incoming and outgoing replication rates and backlogs. + +#### Delete a geo-replication topic + +Given that geo-replication topics exist in multiple regions, directly deleting a geo-replication topic is not possible. Instead, you should rely on automatic topic garbage collection. + +In Pulsar, a topic is automatically deleted when the topic meets the following three conditions: +- no producers or consumers are connected to it; +- no subscriptions to it; +- no more messages are kept for retention. +For geo-replication topics, each region uses a fault-tolerant mechanism to decide when deleting the topic locally is safe. + +You can explicitly disable topic garbage collection by setting `brokerDeleteInactiveTopicsEnabled` to `false` in your [broker configuration](reference-configuration.md#broker). + +To delete a geo-replication topic, close all producers and consumers on the topic, and delete all of its local subscriptions in every replication cluster. When Pulsar determines that no valid subscription for the topic remains across the system, it will garbage collect the topic. 
+ +## Replicated subscriptions + +Pulsar supports replicated subscriptions, so you can keep subscription state in sync, within a sub-second timeframe, in the context of a topic that is being asynchronously replicated across multiple geographical regions. + +In case of failover, a consumer can restart consuming from the failure point in a different cluster. + +### Enable replicated subscription + +Replicated subscription is disabled by default. You can enable replicated subscription when creating a consumer. + +```java + +Consumer consumer = client.newConsumer(Schema.STRING) + .topic("my-topic") + .subscriptionName("my-subscription") + .replicateSubscriptionState(true) + .subscribe(); + +``` + +### Advantages + + * It is easy to implement the logic. + * You can choose to enable or disable replicated subscription. + * When you enable it, the overhead is low, and it is easy to configure. + * When you disable it, the overhead is zero. + +### Limitations + +* When you enable replicated subscription, you're creating a consistent distributed snapshot to establish an association between message ids from different clusters. The snapshots are taken periodically. The default value is `1 second`. It means that a consumer failing over to a different cluster can potentially receive 1 second of duplicates. You can also configure the frequency of the snapshot in the `broker.conf` file. +* Only the base line cursor position is synced in replicated subscriptions while the individual acknowledgments are not synced. This means the messages acknowledged out-of-order could end up getting delivered again, in the case of a cluster failover. 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/administration-isolation.md b/site2/website/versioned_docs/version-2.8.x/administration-isolation.md new file mode 100644 index 0000000000000..d2de042a2e741 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-isolation.md @@ -0,0 +1,115 @@ +--- +id: administration-isolation +title: Pulsar isolation +sidebar_label: "Pulsar isolation" +original_id: administration-isolation +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +In an organization, a Pulsar instance provides services to multiple teams. When organizing the resources across multiple teams, you want to make a suitable isolation plan to avoid the resource competition between different teams and applications and provide high-quality messaging service. In this case, you need to take resource isolation into consideration and weigh your intended actions against expected and unexpected consequences. + +To enforce resource isolation, you can use the Pulsar isolation policy, which allows you to allocate resources (**broker** and **bookie**) for the namespace. + +## Broker isolation + +In Pulsar, when namespaces (more specifically, namespace bundles) are assigned dynamically to brokers, the namespace isolation policy limits the set of brokers that can be used for assignment. Before topics are assigned to brokers, you can set the namespace isolation policy with a primary or a secondary regex to select desired brokers. + +You can set a namespace isolation policy for a cluster using one of the following methods. + +````mdx-code-block + + + + +``` + +pulsar-admin ns-isolation-policy set options + +``` + +For more information about the command `pulsar-admin ns-isolation-policy set options`, see [here](https://pulsar.apache.org/tools/pulsar-admin/). 
+ +**Example** + +```shell + +bin/pulsar-admin ns-isolation-policy set \ +--auto-failover-policy-type min_available \ +--auto-failover-policy-params min_limit=1,usage_threshold=80 \ +--namespaces my-tenant/my-namespace \ +--primary 10.193.216.* my-cluster policy-name + +``` + + + + +[PUT /admin/v2/namespaces/{tenant}/{namespace}](https://pulsar.apache.org/admin-rest-api/?version=master&apiversion=v2#operation/createNamespace) + + + + +For how to set namespace isolation policy using Java admin API, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/NamespacesImpl.java#L251). + + + + +```` + +## Bookie isolation + +A namespace can be isolated into user-defined groups of bookies, which guarantees all the data that belongs to the namespace is stored in desired bookies. The bookie affinity group uses the BookKeeper [rack-aware placement policy](https://bookkeeper.apache.org/docs/latest/api/javadoc/org/apache/bookkeeper/client/EnsemblePlacementPolicy.html) and it is a way to feed rack information which is stored as JSON format in znode. + +You can set a bookie affinity group using one of the following methods. + +````mdx-code-block + + + + +``` + +pulsar-admin namespaces set-bookie-affinity-group options + +``` + +For more information about the command `pulsar-admin namespaces set-bookie-affinity-group options`, see [here](https://pulsar.apache.org/tools/pulsar-admin/). 
+ +**Example** + +```shell + +bin/pulsar-admin bookies set-bookie-rack \ +--bookie 127.0.0.1:3181 \ +--hostname 127.0.0.1:3181 \ +--group group-bookie1 \ +--rack rack1 + +bin/pulsar-admin namespaces set-bookie-affinity-group public/default \ +--primary-group group-bookie1 + +``` + + + + +[POST /admin/v2/namespaces/{tenant}/{namespace}/persistence/bookieAffinity](https://pulsar.apache.org/admin-rest-api/?version=master&apiversion=v2#operation/setBookieAffinityGroup) + + + + +For how to set bookie affinity group for a namespace using Java admin API, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/NamespacesImpl.java#L1164). + + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/administration-load-balance.md b/site2/website/versioned_docs/version-2.8.x/administration-load-balance.md new file mode 100644 index 0000000000000..890ebf661624c --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-load-balance.md @@ -0,0 +1,256 @@ +--- +id: administration-load-balance +title: Pulsar load balance +sidebar_label: "Load balance" +original_id: administration-load-balance +--- + +## Load balance across Pulsar brokers + +Pulsar is a horizontally scalable messaging system, so the traffic +in a logical cluster must be spread across all the available Pulsar brokers as evenly as possible, which is a core requirement. + +You can use multiple settings and tools to control the traffic distribution which require a bit of context to understand how the traffic is managed in Pulsar. Though, in most cases, the core requirement mentioned above is true out of the box and you should not worry about it. + +## Pulsar load manager architecture + +The following part introduces the basic architecture of the Pulsar load manager. + +### Assign topics to brokers dynamically + +Topics are dynamically assigned to brokers based on the load conditions of all brokers in the cluster.
+ +When a client starts using new topics that are not assigned to any broker, a process is triggered to choose the best suited broker to acquire ownership of these topics according to the load conditions. + +In case of partitioned topics, different partitions are assigned to different brokers. Here "topic" means either a non-partitioned topic or one partition of a topic. + +The assignment is "dynamic" because the assignment changes quickly. For example, if the broker owning the topic crashes, the topic is reassigned immediately to another broker. Another scenario is that the broker owning the topic becomes overloaded. In this case, the topic is reassigned to a less loaded broker. + +The stateless nature of brokers makes the dynamic assignment possible, so you can quickly expand or shrink the cluster based on usage. + +#### Assignment granularity + +The assignment of topics or partitions to brokers is not done at the topic or partition level, but at the bundle level (a higher level). The reason is to amortize the amount of information that you need to keep track of. Based on CPU, memory, traffic load and other indexes, topics are assigned to a particular broker dynamically. + +Instead of individual topic or partition assignment, each broker takes ownership of a subset of the topics for a namespace. This subset is called a "*bundle*" and effectively this subset is a sharding mechanism. + +The namespace is the "administrative" unit: many config knobs or operations are done at the namespace level. + +For assignment, a namespace is sharded into a list of "bundles", with each bundle comprising +a portion of the overall hash range of the namespace. + +Topics are assigned to a particular bundle by taking the hash of the topic name and checking which +bundle the hash falls into. + +Each bundle is independent of the others and thus is independently assigned to different brokers.
+ +### Create namespaces and bundles + +When you create a new namespace, the new namespace is set to use the default number of bundles. You can set this in `conf/broker.conf`: + +```properties + +# When a namespace is created without specifying the number of bundle, this +# value will be used as the default +defaultNumberOfNamespaceBundles=4 + +``` + +You can either change the system default, or override it when you create a new namespace: + +```shell + +$ bin/pulsar-admin namespaces create my-tenant/my-namespace --clusters us-west --bundles 16 + +``` + +With this command, you create a namespace with 16 initial bundles. Therefore, the topics for this namespace can immediately be spread across up to 16 brokers. + +In general, if you know the expected traffic and number of topics in advance, it is better to start with a reasonable number of bundles instead of waiting for the system to auto-correct the distribution. + +On the same note, it is beneficial to start with more bundles than the number of brokers, because of the hashing nature of the distribution of topics into bundles. For example, for a namespace with 1000 topics, using something like 64 bundles achieves a good distribution of traffic across 16 brokers. + +### Unload topics and bundles + +You can "unload" a topic in Pulsar with an admin operation. Unloading means to close the topics, +release ownership and reassign the topics to a new broker, based on current load. + +When unloading happens, the client experiences a small latency blip, typically on the order of tens of milliseconds, while the topic is reassigned. + +Unloading is the mechanism that the load-manager uses to perform the load shedding, but you can also trigger the unloading manually, for example to correct the assignments and redistribute traffic even before having any broker overloaded.
+ +Unloading a topic has no effect on the assignment, but just closes and reopens the particular topic: + +```shell + +pulsar-admin topics unload persistent://tenant/namespace/topic + +``` + +To unload all topics for a namespace and trigger reassignments: + +```shell + +pulsar-admin namespaces unload tenant/namespace + +``` + +### Split namespace bundles + +Since the load for the topics in a bundle might change over time, or predicting it upfront might be hard, brokers can split bundles into two. The new smaller bundles can be reassigned to different brokers. + +The splitting happens based on some tunable thresholds. Any existing bundle that exceeds any of the thresholds is a candidate to be split. By default, the newly split bundles are also immediately offloaded to other brokers, to facilitate the traffic distribution. + +```properties + +# enable/disable namespace bundle auto split +loadBalancerAutoBundleSplitEnabled=true + +# enable/disable automatic unloading of split bundles +loadBalancerAutoUnloadSplitBundlesEnabled=true + +# maximum topics in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxTopics=1000 + +# maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxSessions=1000 + +# maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxMsgRate=30000 + +# maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxBandwidthMbytes=100 + +# maximum number of bundles in a namespace (for auto-split) +loadBalancerNamespaceMaximumBundles=128 + +``` + +### Shed load automatically + +The support for automatic load shedding is available in the load manager of Pulsar. This means that whenever the system recognizes a particular broker is overloaded, the system forces some traffic to be reassigned to less loaded brokers.
+ +When a broker is identified as overloaded, the broker forces to "unload" a subset of the bundles, the +ones with higher traffic, that make up for the overload percentage. + +For example, the default threshold is 85% and if a broker is over quota at 95% CPU usage, then the broker unloads the percent difference plus a 5% margin: `(95% - 85%) + 5% = 15%`. + +Given the selection of bundles to offload is based on traffic (as a proxy measure for cpu, network +and memory), broker unloads bundles for at least 15% of traffic. + +The automatic load shedding is enabled by default and you can disable the automatic load shedding with this setting: + +```properties + +# Enable/disable automatic bundle unloading for load-shedding +loadBalancerSheddingEnabled=true + +``` + +Additional settings that apply to shedding: + +```properties + +# Load shedding interval. Broker periodically checks whether some traffic should be offload from +# some over-loaded broker to other under-loaded brokers +loadBalancerSheddingIntervalMinutes=1 + +# Prevent the same topics to be shed and moved to other brokers more that once within this timeframe +loadBalancerSheddingGracePeriodMinutes=30 + +``` + +#### Broker overload thresholds + +The determinations of when a broker is overloaded is based on threshold of CPU, network and memory usage. Whenever either of those metrics reaches the threshold, the system triggers the shedding (if enabled). + +By default, overload threshold is set at 85%: + +```properties + +# Usage threshold to determine a broker as over-loaded +loadBalancerBrokerOverloadedThresholdPercentage=85 + +``` + +Pulsar gathers the usage stats from the system metrics. + +In case of network utilization, in some cases the network interface speed that Linux reports is +not correct and needs to be manually overridden. This is the case in AWS EC2 instances with 1Gbps +NIC speed for which the OS reports 10Gbps speed. 
+ +Because of the incorrect max speed, the Pulsar load manager might think the broker has not reached the NIC capacity, while in fact the broker already uses all the bandwidth and the traffic is slowed down. + +You can use the following setting to correct the max NIC speed: + +```properties + +# Override the auto-detection of the network interfaces max speed. +# This option is useful in some environments (eg: EC2 VMs) where the max speed +# reported by Linux is not reflecting the real bandwidth available to the broker. +# Since the network usage is employed by the load manager to decide when a broker +# is overloaded, it is important to make sure the info is correct or override it +# with the right value here. The configured value can be a double (eg: 0.8) and that +# can be used to trigger load-shedding even before hitting on NIC limits. +loadBalancerOverrideBrokerNicSpeedGbps= + +``` + +When the value is empty, Pulsar uses the value that the OS reports. + +### Distribute anti-affinity namespaces across failure domains + +When your application has multiple namespaces and you want one of them available all the time to avoid any downtime, you can group these namespaces and distribute them across different [failure domains](reference-terminology.md#failure-domain) and different brokers. Thus, if one of the failure domains is down (due to release rollout or brokers restart), it only disrupts namespaces owned by that specific failure domain and the rest of the namespaces owned by other domains remain available without any impact. + +Such a group of namespaces has anti-affinity to each other, that is, all the namespaces in this group are [anti-affinity namespaces](reference-terminology.md#anti-affinity-namespaces) and are distributed to different failure domains in a load-balanced manner. + +As illustrated in the following figure, Pulsar has 2 failure domains (Domain1 and Domain2) and each domain has 2 brokers in it. 
You can create an anti-affinity namespace group that has 4 namespaces in it, and all the 4 namespaces have anti-affinity to each other. The load manager tries to distribute namespaces evenly across all the brokers in the same domain. Since each domain has 2 brokers, every broker owns one namespace from this anti-affinity namespace group, and you can see each domain owns 2 namespaces, and each broker owns 1 namespace. + +![Distribute anti-affinity namespaces across failure domains](/assets/anti-affinity-namespaces-across-failure-domains.svg) + +The load manager follows an even distribution policy across failure domains to assign anti-affinity namespaces. The following table outlines the even-distributed assignment sequence illustrated in the above figure. + +| Assignment sequence | Namespace | Failure domain candidates | Broker candidates | Selected broker | +|:---|:------------|:------------------|:------------------------------------|:-----------------| +| 1 | Namespace1 | Domain1, Domain2 | Broker1, Broker2, Broker3, Broker4 | Domain1:Broker1 | +| 2 | Namespace2 | Domain2 | Broker3, Broker4 | Domain2:Broker3 | +| 3 | Namespace3 | Domain1, Domain2 | Broker2, Broker4 | Domain1:Broker2 | +| 4 | Namespace4 | Domain2 | Broker4 | Domain2:Broker4 | + +:::tip + +* Each namespace belongs to only one anti-affinity group. If a namespace with an existing anti-affinity assignment is assigned to another anti-affinity group, the original assignment is dropped. + +* If there are more anti-affinity namespaces than failure domains, the load manager distributes namespaces evenly across all the domains, and also every domain distributes namespaces evenly across all the brokers under that domain. + +::: + +#### Create a failure domain and register brokers + +:::note + +One broker can only be registered to a single failure domain. 
+ +::: + +To create a domain under a specific cluster and register brokers, run the following command: + +```bash + +pulsar-admin clusters create-failure-domain --domain-name --broker-list + +``` + +You can also view, update, and delete domains under a specific cluster. For more information, refer to [Pulsar admin doc](/tools/pulsar-admin/). + +#### Create an anti-affinity namespace group + +An anti-affinity group is created automatically when the first namespace is assigned to the group. To assign a namespace to an anti-affinity group, run the following command. It sets an anti-affinity group name for a namespace. + +```bash + +pulsar-admin namespaces set-anti-affinity-group --group + +``` + +For more information about `anti-affinity-group` related commands, refer to [Pulsar admin doc](/tools/pulsar-admin/). diff --git a/site2/website/versioned_docs/version-2.8.x/administration-proxy.md b/site2/website/versioned_docs/version-2.8.x/administration-proxy.md new file mode 100644 index 0000000000000..c046ed34d46b2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-proxy.md @@ -0,0 +1,86 @@ +--- +id: administration-proxy +title: Pulsar proxy +sidebar_label: "Pulsar proxy" +original_id: administration-proxy +--- + +Pulsar proxy is an optional gateway. Pulsar proxy is used when direct connections between clients and Pulsar brokers are either infeasible or undesirable. For example, when you run Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform, you can run Pulsar proxy. + +## Configure the proxy + +Before using the proxy, you need to configure it with the broker addresses in the cluster. You can configure the proxy to connect directly to service discovery, or specify a broker URL in the configuration. + +### Use service discovery + +Pulsar uses [ZooKeeper](https://zookeeper.apache.org) for service discovery. To connect the proxy to ZooKeeper, specify the following in `conf/proxy.conf`.
+ +```properties + +zookeeperServers=zk-0,zk-1,zk-2 +configurationStoreServers=zk-0:2184,zk-remote:2184 + +``` + +> To use service discovery, you need to open the network ACLs, so the proxy can connect to the ZooKeeper nodes through the ZooKeeper client port (port `2181`) and the configuration store client port (port `2184`). + +> However, it is not secure to use service discovery, because if the network ACL is open and someone compromises a proxy, they have full access to ZooKeeper. + +### Use broker URLs + +It is more secure to specify a URL to connect to the brokers. + +Proxy authorization requires access to ZooKeeper, so if you use these broker URLs to connect to the brokers, you need to disable authorization at the Proxy level. Brokers still authorize requests after the proxy forwards them. + +You can configure the broker URLs in `conf/proxy.conf` as follows. + +```properties + +brokerServiceURL=pulsar://brokers.example.com:6650 +brokerWebServiceURL=http://brokers.example.com:8080 +functionWorkerWebServiceURL=http://function-workers.example.com:8080 + +``` + +If you use TLS, configure the broker URLs in the following way: + +```properties + +brokerServiceURLTLS=pulsar+ssl://brokers.example.com:6651 +brokerWebServiceURLTLS=https://brokers.example.com:8443 +functionWorkerWebServiceURL=https://function-workers.example.com:8443 + +``` + +The hostname in the URLs provided should be a DNS entry which points to multiple brokers or a virtual IP address, which is backed by multiple broker IP addresses, so that the proxy does not lose connectivity to the Pulsar cluster if a single broker becomes unavailable. + +The ports to connect to the brokers (6650 and 8080, or in the case of TLS, 6651 and 8443) should be open in the network ACLs. + +Note that if you do not use functions, you do not need to configure `functionWorkerWebServiceURL`.
+ +## Start the proxy + +To start the proxy: + +```bash + +$ cd /path/to/pulsar/directory +$ bin/pulsar proxy + +``` + +> You can run multiple instances of the Pulsar proxy in a cluster. + +## Stop the proxy + +Pulsar proxy runs in the foreground by default. To stop the proxy, simply stop the process in which the proxy is running. + +## Proxy frontends + +You can run Pulsar proxy behind some kind of load-distributing frontend, such as an [HAProxy](https://www.digitalocean.com/community/tutorials/an-introduction-to-haproxy-and-load-balancing-concepts) load balancer. + +## Use Pulsar clients with the proxy + +Once your Pulsar proxy is up and running, preferably behind a load-distributing [frontend](#proxy-frontends), clients can connect to the proxy via whichever address that the frontend uses. If the address is the DNS address `pulsar.cluster.default`, for example, the connection URL for clients is `pulsar://pulsar.cluster.default:6650`. + +For more information on Proxy configuration, refer to [Pulsar proxy](reference-configuration.md#pulsar-proxy). diff --git a/site2/website/versioned_docs/version-2.8.x/administration-pulsar-manager.md b/site2/website/versioned_docs/version-2.8.x/administration-pulsar-manager.md new file mode 100644 index 0000000000000..0e3800d847f0c --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-pulsar-manager.md @@ -0,0 +1,205 @@ +--- +id: administration-pulsar-manager +title: Pulsar Manager +sidebar_label: "Pulsar Manager" +original_id: administration-pulsar-manager +--- + +Pulsar Manager is a web-based GUI management and monitoring tool that helps administrators and users manage and monitor tenants, namespaces, topics, subscriptions, brokers, clusters, and so on, and supports dynamic configuration of multiple environments. + +:::note + +If you monitor your current stats with Pulsar dashboard, you can try to use Pulsar Manager instead. Pulsar dashboard is deprecated. 
+ +::: + +## Install + +The easiest way to use the Pulsar Manager is to run it inside a [Docker](https://www.docker.com/products/docker) container. + +```shell + +docker pull apachepulsar/pulsar-manager:v0.2.0 +docker run -it \ + -p 9527:9527 -p 7750:7750 \ + -e SPRING_CONFIGURATION_FILE=/pulsar-manager/pulsar-manager/application.properties \ + apachepulsar/pulsar-manager:v0.2.0 + +``` + +* `SPRING_CONFIGURATION_FILE`: Default configuration file for spring. + +### Set administrator account and password + + ```shell + + CSRF_TOKEN=$(curl http://localhost:7750/pulsar-manager/csrf-token) + curl \ + -H 'X-XSRF-TOKEN: $CSRF_TOKEN' \ + -H 'Cookie: XSRF-TOKEN=$CSRF_TOKEN;' \ + -H "Content-Type: application/json" \ + -X PUT http://localhost:7750/pulsar-manager/users/superuser \ + -d '{"name": "admin", "password": "apachepulsar", "description": "test", "email": "username@test.org"}' + + ``` + +You can find the docker image in the [Docker Hub](https://github.com/apache/pulsar-manager/tree/master/docker) directory and build an image from the source code as well: + +``` + +git clone https://github.com/apache/pulsar-manager +cd pulsar-manager/front-end +npm install --save +npm run build:prod +cd .. +./gradlew build -x test +cd .. +docker build -f docker/Dockerfile --build-arg BUILD_DATE=`date -u +"%Y-%m-%dT%H:%M:%SZ"` --build-arg VCS_REF=`latest` --build-arg VERSION=`latest` -t apachepulsar/pulsar-manager . + +``` + +### Use custom databases + +If you have a large amount of data, you can use a custom database. The following is an example of PostgreSQL. + +1. Initialize database and table structures using the [file](https://github.com/apache/pulsar-manager/tree/master/src/main/resources/META-INF/sql/postgresql-schema.sql). + +2. Modify the [configuration file](https://github.com/apache/pulsar-manager/blob/master/src/main/resources/application.properties) and add PostgreSQL configuration. 
+ +``` + +spring.datasource.driver-class-name=org.postgresql.Driver +spring.datasource.url=jdbc:postgresql://127.0.0.1:5432/pulsar_manager +spring.datasource.username=postgres +spring.datasource.password=postgres + +``` + +3. Compile to generate a new executable jar package. + +``` + +./gradlew build -x test + +``` + +### Enable JWT authentication + +If you want to turn on JWT authentication, configure the following parameters: + +* `backend.jwt.token`: token for the superuser. You need to configure this parameter during cluster initialization. +* `jwt.broker.token.mode`: multiple modes of generating token, including PUBLIC, PRIVATE, and SECRET. +* `jwt.broker.public.key`: configure this option if you use the PUBLIC mode. +* `jwt.broker.private.key`: configure this option if you use the PRIVATE mode. +* `jwt.broker.secret.key`: configure this option if you use the SECRET mode. + +For more information, see [Token Authentication Admin of Pulsar](http://pulsar.apache.org/docs/en/security-token-admin/). + + +If you want to enable JWT authentication, use one of the following methods. + + +* Method 1: use command-line tool + +``` + +wget https://dist.apache.org/repos/dist/release/pulsar/pulsar-manager/pulsar-manager-0.2.0/apache-pulsar-manager-0.2.0-bin.tar.gz +tar -zxvf apache-pulsar-manager-0.2.0-bin.tar.gz +cd pulsar-manager +tar -zxvf pulsar-manager.tar +cd pulsar-manager +cp -r ../dist ui +./bin/pulsar-manager --redirect.host=http://localhost --redirect.port=9527 insert.stats.interval=600000 --backend.jwt.token=token --jwt.broker.token.mode=PRIVATE --jwt.broker.private.key=file:///path/broker-private.key --jwt.broker.public.key=file:///path/broker-public.key + +``` + +Firstly, [set the administrator account and password](#set-administrator-account-and-password) + +Secondly, log in to Pulsar manager through http://localhost:7750/ui/index.html. 
+ +* Method 2: configure the application.properties file + +``` + +backend.jwt.token=token + +jwt.broker.token.mode=PRIVATE +jwt.broker.public.key=file:///path/broker-public.key +jwt.broker.private.key=file:///path/broker-private.key + +or +jwt.broker.token.mode=SECRET +jwt.broker.secret.key=file:///path/broker-secret.key + +``` + +* Method 3: use Docker and enable token authentication. + +``` + +export JWT_TOKEN="your-token" +docker run -it -p 9527:9527 -p 7750:7750 -e REDIRECT_HOST=http://localhost -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -v $PWD:/data apachepulsar/pulsar-manager:v0.2.0 /bin/sh + +``` + +* `JWT_TOKEN`: the token of the superuser configured for the broker. It is generated by the `bin/pulsar tokens create --secret-key` or `bin/pulsar tokens create --private-key` command. +* `REDIRECT_HOST`: the IP address of the front-end server. +* `REDIRECT_PORT`: the port of the front-end server. +* `DRIVER_CLASS_NAME`: the driver class name of the PostgreSQL database. +* `URL`: the JDBC URL of your PostgreSQL database, such as jdbc:postgresql://127.0.0.1:5432/pulsar_manager. The docker image automatically starts a local instance of the PostgreSQL database. +* `USERNAME`: the username of PostgreSQL. +* `PASSWORD`: the password of PostgreSQL. +* `LOG_LEVEL`: the log level. + +* Method 4: use Docker and turn on **token authentication** and **token management** by private key and public key.
+ +``` + +export JWT_TOKEN="your-token" +export PRIVATE_KEY="file:///pulsar-manager/secret/my-private.key" +export PUBLIC_KEY="file:///pulsar-manager/secret/my-public.key" +docker run -it -p 9527:9527 -p 7750:7750 -e REDIRECT_HOST=http://localhost -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -e PRIVATE_KEY=$PRIVATE_KEY -e PUBLIC_KEY=$PUBLIC_KEY -v $PWD:/data -v $PWD/secret:/pulsar-manager/secret apachepulsar/pulsar-manager:v0.2.0 /bin/sh + +``` + +* `JWT_TOKEN`: the token of superuser configured for the broker. It is generated by the `bin/pulsar tokens create --private-key` command. +* `PRIVATE_KEY`: private key path mounted in container, generated by `bin/pulsar tokens create-key-pair` command. +* `PUBLIC_KEY`: public key path mounted in container, generated by `bin/pulsar tokens create-key-pair` command. +* `$PWD/secret`: the folder where the private key and public key generated by the `bin/pulsar tokens create-key-pair` command are placed locally +* `REDIRECT_HOST`: the IP address of the front-end server. +* `REDIRECT_PORT`: the port of the front-end server. +* `DRIVER_CLASS_NAME`: the driver class name of the PostgreSQL database. +* `URL`: the JDBC URL of your PostgreSQL database, such as jdbc:postgresql://127.0.0.1:5432/pulsar_manager. The docker image automatically start a local instance of the PostgreSQL database. +* `USERNAME`: the username of PostgreSQL. +* `PASSWORD`: the password of PostgreSQL. +* `LOG_LEVEL`: the level of log. + +* Method 5: use Docker and turn on **token authentication** and **token management** by secret key. 
+ +``` + +export JWT_TOKEN="your-token" +export SECRET_KEY="file:///pulsar-manager/secret/my-secret.key" +docker run -it -p 9527:9527 -p 7750:7750 -e REDIRECT_HOST=http://localhost -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -e SECRET_KEY=$SECRET_KEY -v $PWD:/data -v $PWD/secret:/pulsar-manager/secret apachepulsar/pulsar-manager:v0.2.0 /bin/sh + +``` + +* `JWT_TOKEN`: the token of the superuser configured for the broker. It is generated by the `bin/pulsar tokens create --secret-key` command. +* `SECRET_KEY`: the secret key path mounted in the container, generated by the `bin/pulsar tokens create-secret-key` command. +* `$PWD/secret`: the folder where the secret key generated by the `bin/pulsar tokens create-secret-key` command is placed locally. +* `REDIRECT_HOST`: the IP address of the front-end server. +* `REDIRECT_PORT`: the port of the front-end server. +* `DRIVER_CLASS_NAME`: the driver class name of the PostgreSQL database. +* `URL`: the JDBC URL of your PostgreSQL database, such as jdbc:postgresql://127.0.0.1:5432/pulsar_manager. The docker image automatically starts a local instance of the PostgreSQL database. +* `USERNAME`: the username of PostgreSQL. +* `PASSWORD`: the password of PostgreSQL. +* `LOG_LEVEL`: the log level. + +* For more information about backend configurations, see [here](https://github.com/apache/pulsar-manager/blob/master/src/README). +* For more information about frontend configurations, see [here](https://github.com/apache/pulsar-manager/tree/master/front-end). + +## Log in + +[Set the administrator account and password](#set-administrator-account-and-password). + +Visit http://localhost:9527 to log in.
diff --git a/site2/website/versioned_docs/version-2.8.x/administration-stats.md b/site2/website/versioned_docs/version-2.8.x/administration-stats.md new file mode 100644 index 0000000000000..ac0c03602f36d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-stats.md @@ -0,0 +1,64 @@ +--- +id: administration-stats +title: Pulsar stats +sidebar_label: "Pulsar statistics" +original_id: administration-stats +--- + +## Partitioned topics + +|Stat|Description| +|---|---| +|msgRateIn| The sum of publish rates of all local and replication publishers in messages per second.| +|msgThroughputIn| Same as msgRateIn but in bytes per second instead of messages per second.| +|msgRateOut| The sum of dispatch rates of all local and replication consumers in messages per second.| +|msgThroughputOut| Same as msgRateOut but in bytes per second instead of messages per second.| +|averageMsgSize| Average message size, in bytes, from this publisher within the last interval.| +|storageSize| The sum of storage size of the ledgers for this topic.| +|publishers| The list of all local publishers into the topic. 
Publishers can be anywhere from zero to thousands.| +|producerId| Internal identifier for this producer on this topic.| +|producerName| Internal identifier for this producer, generated by the client library.| +|address| IP address and source port for the connection of this producer.| +|connectedSince| Timestamp this producer is created or last reconnected.| +|subscriptions| The list of all local subscriptions to the topic.| +|my-subscription| The name of this subscription (client defined).| +|msgBacklog| The count of messages in backlog for this subscription.| +|type| This subscription type.| +|msgRateExpired| The rate at which messages are discarded instead of dispatched from this subscription due to TTL.| +|consumers| The list of connected consumers for this subscription.| +|consumerName| Internal identifier for this consumer, generated by the client library.| +|availablePermits| The number of messages this consumer has space for in the listen queue of client library. A value of 0 means the queue of client library is full and receive() is not being called. A nonzero value means this consumer is ready to be dispatched messages.| +|replication| This section gives the stats for cross-colo replication of this topic.| +|replicationBacklog| The outbound replication backlog in messages.| +|connected| Whether the outbound replicator is connected.| +|replicationDelayInSeconds| How long the oldest message has been waiting to be sent through the connection, if connected is true.| +|inboundConnection| The IP and port of the broker in the publisher connection of remote cluster to this broker. | +|inboundConnectedSince| The TCP connection being used to publish messages to the remote cluster. 
If no local publishers are connected, this connection is automatically closed after a minute.| + + +## Topics + +|Stat|Description| +|---|---| +|entriesAddedCounter| Messages published since this broker loaded this topic.| +|numberOfEntries| Total number of messages being tracked.| +|totalSize| Total storage size in bytes of all messages.| +|currentLedgerEntries| Count of messages written to the ledger currently open for writing.| +|currentLedgerSize| Size in bytes of messages written to the ledger currently open for writing.| +|lastLedgerCreatedTimestamp| Time when the last ledger was created.| +|lastLedgerCreationFailureTimestamp| Time when the last ledger creation failed.| +|waitingCursorsCount| How many cursors are caught up and waiting for a new message to be published.| +|pendingAddEntriesCount| How many messages have (asynchronous) write requests that are waiting on completion.| +|lastConfirmedEntry| The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger is opened or is being currently opened but has no entries written yet.| +|state| The state of the cursor ledger. Open means you have a cursor ledger for saving updates of the markDeletePosition.| +|ledgers| The ordered list of all ledgers for this topic holding its messages.| +|cursors| The list of all cursors on this topic.
Every subscription you saw in the topic stats has one.| +|markDeletePosition| The ack position: the last message the subscriber acknowledges receiving.| +|readPosition| The latest position of subscriber for reading message.| +|waitingReadOp| This is true when the subscription reads the latest message that is published to the topic and waits on new messages to be published.| +|pendingReadOps| The counter for how many outstanding read requests to the BookKeepers you have in progress.| +|messagesConsumedCounter| Number of messages this cursor acks since this broker loads this topic.| +|cursorLedger| The ledger used to persistently store the current markDeletePosition.| +|cursorLedgerLastEntry| The last entryid used to persistently store the current markDeletePosition.| +|individuallyDeletedMessages| If Acks are done out of order, shows the ranges of messages Acked between the markDeletePosition and the read-position.| +|lastLedgerSwitchTimestamp| The last time the cursor ledger is rolled over.| diff --git a/site2/website/versioned_docs/version-2.8.x/administration-upgrade.md b/site2/website/versioned_docs/version-2.8.x/administration-upgrade.md new file mode 100644 index 0000000000000..72d136b6460f6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-upgrade.md @@ -0,0 +1,168 @@ +--- +id: administration-upgrade +title: Upgrade Guide +sidebar_label: "Upgrade" +original_id: administration-upgrade +--- + +## Upgrade guidelines + +Apache Pulsar is comprised of multiple components, ZooKeeper, bookies, and brokers. These components are either stateful or stateless. You do not have to upgrade ZooKeeper nodes unless you have special requirement. While you upgrade, you need to pay attention to bookies (stateful), brokers and proxies (stateless). + +The following are some guidelines on upgrading a Pulsar cluster. Read the guidelines before upgrading. + +- Backup all your configuration files before upgrading. 
+- Read the guide entirely, make a plan, and then execute the plan. When you make an upgrade plan, you need to take your specific requirements and environment into consideration. +- Pay attention to the upgrading order of components. In general, you do not need to upgrade your ZooKeeper or configuration store cluster (the global ZooKeeper cluster). You need to upgrade bookies first, and then upgrade brokers, proxies, and your clients. +- If `autorecovery` is enabled, you need to disable `autorecovery` in the upgrade process, and re-enable it after completing the process. +- Read the release notes carefully for each release. Release notes contain features and configuration changes that might impact your upgrade. +- Upgrade a small subset of nodes of each type to canary test the new version before upgrading all nodes of that type in the cluster. When you have upgraded the canary nodes, run for a while to ensure that they work correctly. +- Upgrade one data center to verify the new version before upgrading all data centers if your cluster runs in multi-cluster replicated mode. + +> Note: Currently, Apache Pulsar is compatible between versions. + +## Upgrade sequence + +To upgrade an Apache Pulsar cluster, follow the upgrade sequence. + +1. Upgrade ZooKeeper (optional) +- Canary test: test an upgraded version in one or a small set of ZooKeeper nodes. +- Rolling upgrade: roll out the upgraded version to all ZooKeeper servers incrementally, one at a time. Monitor your dashboard during the whole rolling upgrade process. +2. Upgrade bookies +- Canary test: test an upgraded version in one or a small set of bookies. +- Rolling upgrade: + - a. Disable `autorecovery` with the following command. + + ```shell + + bin/bookkeeper shell autorecovery -disable + + ``` + + + - b. Roll out the upgraded version to all bookies in the cluster after you determine that a version is safe after canary. + - c. After you upgrade all bookies, re-enable `autorecovery` with the following command.
+ + ```shell + + bin/bookkeeper shell autorecovery -enable + + ``` + +3. Upgrade brokers +- Canary test: test an upgraded version in one or a small set of brokers. +- Rolling upgrade: rollout the upgraded version to all brokers in the cluster after you determine that a version is safe after canary. +4. Upgrade proxies +- Canary test: test an upgraded version in one or a small set of proxies. +- Rolling upgrade: rollout the upgraded version to all proxies in the cluster after you determine that a version is safe after canary. + +## Upgrade ZooKeeper (optional) +While you upgrade ZooKeeper servers, you can do canary test first, and then upgrade all ZooKeeper servers in the cluster. + +### Canary test + +You can test an upgraded version in one of ZooKeeper servers before upgrading all ZooKeeper servers in your cluster. + +To upgrade ZooKeeper server to a new version, complete the following steps: + +1. Stop a ZooKeeper server. +2. Upgrade the binary and configuration files. +3. Start the ZooKeeper server with the new binary files. +4. Use `pulsar zookeeper-shell` to connect to the newly upgraded ZooKeeper server and run a few commands to verify if it works as expected. +5. Run the ZooKeeper server for a few days, observe and make sure the ZooKeeper cluster runs well. + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic ZooKeeper node, revert the binary and configuration, and restart the ZooKeeper with the reverted binary. + +### Upgrade all ZooKeeper servers + +After canary test to upgrade one ZooKeeper in your cluster, you can upgrade all ZooKeeper servers in your cluster. + +You can upgrade all ZooKeeper servers one by one by following steps in canary test. + +## Upgrade bookies + +While you upgrade bookies, you can do canary test first, and then upgrade all bookies in the cluster. +For more details, you can read Apache BookKeeper [Upgrade guide](http://bookkeeper.apache.org/docs/latest/admin/upgrade). 
+ +### Canary test + +You can test an upgraded version in one or a small set of bookies before upgrading all bookies in your cluster. + +To upgrade a bookie to a new version, complete the following steps: + +1. Stop a bookie. +2. Upgrade the binary and configuration files. +3. Start the bookie in `ReadOnly` mode to verify if the bookie of this new version runs well for read workload. + + ```shell + + bin/pulsar bookie --readOnly + + ``` + +4. When the bookie runs successfully in `ReadOnly` mode, stop the bookie and restart it in `Write/Read` mode. + + ```shell + + bin/pulsar bookie + + ``` + +5. Observe and make sure the cluster serves both write and read traffic. + +#### Canary rollback + +If issues occur during the canary test, you can shut down the problematic bookie node. Other bookies in the cluster replace this problematic bookie node with autorecovery. + +### Upgrade all bookies + +After you upgrade some bookies in the canary test, you can upgrade all bookies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, upgrade one bookie at a time. In a downtime upgrade scenario, shut down the entire cluster, upgrade each bookie, and then start the cluster. + +In both scenarios, the procedure is the same for each bookie. + +1. Stop the bookie. +2. Upgrade the software (either new binary or new configuration files). +3. Start the bookie. + +> **Advanced operations** +> When you upgrade a large BookKeeper cluster in a rolling upgrade scenario, upgrading one bookie at a time is slow. If you configure rack-aware or region-aware placement policy, you can upgrade bookies rack by rack or region by region, which speeds up the whole upgrade process. + +## Upgrade brokers and proxies + +The upgrade procedure for brokers and proxies is the same. Brokers and proxies are `stateless`, so upgrading the two services is easy.
+ +### Canary test + +You can test an upgraded version in one or a small set of nodes before upgrading all nodes in your cluster. + +To upgrade to a new version, complete the following steps: + +1. Stop a broker (or proxy). +2. Upgrade the binary and configuration file. +3. Start a broker (or proxy). + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic broker (or proxy) node. Revert to the old version and restart the broker (or proxy). + +### Upgrade all brokers or proxies + +After canary test to upgrade some brokers or proxies in your cluster, you can upgrade all brokers or proxies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, you can upgrade one broker or one proxy at a time if the size of the cluster is small. If your cluster is large, you can upgrade brokers or proxies in batches. When you upgrade a batch of brokers or proxies, make sure the remaining brokers and proxies in the cluster have enough capacity to handle the traffic during upgrade. + +In a downtime upgrade scenario, shut down the entire cluster, upgrade each broker or proxy, and then start the cluster. + +While you upgrade in both scenarios, the procedure is the same for each broker or proxy. + +1. Stop the broker or proxy. +2. Upgrade the software (either new binary or new configuration files). +3. Start the broker or proxy. 
diff --git a/site2/website/versioned_docs/version-2.8.x/administration-zk-bk.md b/site2/website/versioned_docs/version-2.8.x/administration-zk-bk.md new file mode 100644 index 0000000000000..2c080123ca81d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/administration-zk-bk.md @@ -0,0 +1,386 @@ +--- +id: administration-zk-bk +title: ZooKeeper and BookKeeper administration +sidebar_label: "ZooKeeper and BookKeeper" +original_id: administration-zk-bk +--- + +Pulsar relies on two external systems for essential tasks: + +* [ZooKeeper](https://zookeeper.apache.org/) is responsible for a wide variety of configuration-related and coordination-related tasks. +* [BookKeeper](http://bookkeeper.apache.org/) is responsible for [persistent storage](concepts-architecture-overview.md#persistent-storage) of message data. + +ZooKeeper and BookKeeper are both open-source [Apache](https://www.apache.org/) projects. + +> Skip to the [How Pulsar uses ZooKeeper and BookKeeper](#how-pulsar-uses-zookeeper-and-bookkeeper) section below for a more schematic explanation of the role of these two systems in Pulsar. + + +## ZooKeeper + +Each Pulsar instance relies on two separate ZooKeeper quorums. + +* [Local ZooKeeper](#deploy-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster. +* [Configuration Store](#deploy-configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +To deploy a Pulsar instance, you need to stand up one local ZooKeeper cluster *per Pulsar cluster*. 
+ +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +On each host, you need to specify the node ID in `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you can set the `myid` value like this: + +```shell + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com` the command is `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start zookeeper + +``` + +### Deploy configuration store + +The ZooKeeper cluster configured and started up in the section above is a *local* ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. 
+ +If you deploy a [single-cluster](#single-cluster-pulsar-instance) instance, you do not need a separate cluster for the configuration store. If, however, you deploy a [multi-cluster](#multi-cluster-pulsar-instance) instance, you need to stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but run on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorum uses to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method for [local ZooKeeper](#local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper). The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 + +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions and that other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can share the same hosts used for the local ZooKeeper quorum. + +For example, you can assume a Pulsar instance with the following clusters `us-west`, `us-east`, `us-central`, `eu-central`, `ap-south`. 
Also, you can assume that each cluster has its own local ZK servers, named as follows: + +``` + +zk[1-3].${CLUSTER}.example.com + +``` + +In this scenario you want to pick the quorum participants from a few clusters and let all the others be ZK observers. For example, to form a 7-server quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This guarantees that writes to the configuration store are possible even if one of these regions is unreachable. + +The ZK configuration in all the servers looks like: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer + +``` + +Additionally, ZK observers need to have: + +```properties + +peerType=observer + +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon): + +```shell + +$ bin/pulsar-daemon start configuration-store + +``` + +### ZooKeeper configuration + +In Pulsar, ZooKeeper configuration is handled by two separate configuration files in the `conf` directory of your Pulsar installation: `conf/zookeeper.conf` for [local ZooKeeper](#local-zookeeper) and `conf/global-zookeeper.conf` for [configuration store](#configuration-store).
+ +#### Local ZooKeeper + +The [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file handles the configuration for local ZooKeeper. The table below shows the available parameters: + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5| +|dataDir| The location where ZooKeeper stores in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server listens for connections. |2181| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3| +|autopurge.purgeInterval| The time interval, in hours, which triggers the ZooKeeper database purge task. Setting to a non-zero number enables auto purge; setting to 0 disables. Read this guide before enabling auto purge. |1| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + +#### Configuration Store + +The [`conf/global-zookeeper.conf`](reference-configuration.md#configuration-store) file handles the configuration for configuration store. The table below shows the available parameters: + + +## BookKeeper + +BookKeeper stores all durable messages in Pulsar. 
BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that guarantees read consistency of independent message logs called ledgers. Individual BookKeeper servers are also called *bookies*. + +> To manage message persistence, retention, and expiry in Pulsar, refer to [cookbook](cookbooks-retention-expiry.md). + +### Hardware requirements + +Bookie hosts store message data on disk. To provide optimal performance, ensure that the bookies have a suitable hardware configuration. The following are two key dimensions of bookie hardware capacity: + +- Disk I/O capacity (read/write) +- Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker by default. To ensure low write latency, BookKeeper is designed to use multiple devices: + +- A **journal** to ensure durability. For sequential writes, it is critical to have fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms. +- A **ledger storage device** stores data. Writes happen in the background, so write I/O is not a big concern. Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + +### Configure BookKeeper + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file.
When you configure each bookie, ensure that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for local ZooKeeper of the Pulsar cluster. + +The minimum configuration changes required in `conf/bookkeeper.conf` are as follows: + +:::note + +Set `journalDirectory` and `ledgerDirectories` carefully. It is difficult to change them later. + +::: + +```properties + +# Change to point to journal disk mount point +journalDirectory=data/bookkeeper/journal + +# Point to ledger storage disk mount point +ledgerDirectories=data/bookkeeper/ledgers + +# Point to local ZK quorum +zkServers=zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181 + +#It is recommended to set this parameter. Otherwise, BookKeeper can't start normally in certain environments (for example, Huawei Cloud). +advertisedAddress= + +``` + +To change the ZooKeeper root path that BookKeeper uses, use `zkLedgersRootPath=/MY-PREFIX/ledgers` instead of `zkServers=localhost:2181/MY-PREFIX`. + +> For more information about BookKeeper, refer to the official [BookKeeper docs](http://bookkeeper.apache.org). + +### Deploy BookKeeper + +BookKeeper provides [persistent message storage](concepts-architecture-overview.md#persistent-storage) for Pulsar. Each Pulsar broker has its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster. + +### Start bookies manually + +You can start a bookie in the foreground or as a background daemon.
+ +To start a bookie in the foreground, use the [`bookkeeper`](reference-cli-tools.md#bookkeeper) CLI tool: + +```bash + +$ bin/bookkeeper bookie + +``` + +To start a bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +You can verify whether the bookie works properly with the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell): + +```shell + +$ bin/bookkeeper shell bookiesanity + +``` + +When you use this command, you create a new ledger on the local bookie, write a few entries, read them back and finally delete the ledger. + +### Decommission bookies cleanly + +Before you decommission a bookie, you need to check your environment and meet the following requirements. + +1. Ensure the state of your cluster supports decommissioning the target bookie. Check if `EnsembleSize >= Write Quorum >= Ack Quorum` is `true` with one less bookie. + +2. Ensure the target bookie is listed after using the `listbookies` command. + +3. Ensure that no other process is ongoing (upgrade, etc.). + +And then you can decommission bookies safely. To decommission bookies, complete the following steps. + +1. Log in to the bookie node, check if there are underreplicated ledgers. The decommission command forces the replication of the underreplicated ledgers. +`$ bin/bookkeeper shell listunderreplicated` + +2. Stop the bookie by killing the bookie process. Make sure that no liveness/readiness probes are set up for the bookies to spin them back up if you deploy it in a Kubernetes environment. + +3. Run the decommission command. + - If you have logged in to the node to be decommissioned, you do not need to provide `-bookieid`.
+ - If you are running the decommission command for the target bookie node from another bookie node, you should mention the target bookie ID in the arguments for `-bookieid` + `$ bin/bookkeeper shell decommissionbookie` + or + `$ bin/bookkeeper shell decommissionbookie -bookieid ` + +4. Validate that no ledgers are on the decommissioned bookie. +`$ bin/bookkeeper shell listledgers -bookieid ` + +You can run the following command to check if the bookie you have decommissioned is listed in the bookies list: + +```bash + +./bookkeeper shell listbookies -rw -h +./bookkeeper shell listbookies -ro -h + +``` + +## BookKeeper persistence policies + +In Pulsar, you can set *persistence policies* at the namespace level, which determines how BookKeeper handles persistent storage of messages. Policies determine four things: + +* The number of acks (guaranteed copies) to wait for each ledger entry. +* The number of bookies to use for a topic. +* The number of writes to make for each ledger entry. +* The throttling rate for mark-delete operations. + +### Set persistence policies + +You can set persistence policies for BookKeeper at the [namespace](reference-terminology.md#namespace) level. + +#### Pulsar-admin + +Use the [`set-persistence`](reference-pulsar-admin.md#namespaces-set-persistence) subcommand and specify a namespace as well as any policies that you want to apply. 
The available flags are: + +Flag | Description | Default +:----|:------------|:------- +`-a`, `--bookkeeper-ack-quorum` | The number of acks (guaranteed copies) to wait on for each entry | 0 +`-e`, `--bookkeeper-ensemble` | The number of [bookies](reference-terminology.md#bookie) to use for topics in the namespace | 0 +`-w`, `--bookkeeper-write-quorum` | The number of writes to make for each entry | 0 +`-r`, `--ml-mark-delete-max-rate` | Throttling rate for mark-delete operations (0 means no throttle) | 0 + +The following is an example: + +```shell + +$ pulsar-admin namespaces set-persistence my-tenant/my-ns \ + --bookkeeper-ack-quorum 3 \ + --bookkeeper-ensemble 2 + +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/setPersistence?version=@pulsar:version_number@} + +#### Java + +```java + +int bkEnsemble = 2; +int bkQuorum = 3; +int bkAckQuorum = 2; +double markDeleteRate = 0.7; +PersistencePolicies policies = + new PersistencePolicies(bkEnsemble, bkQuorum, bkAckQuorum, markDeleteRate); +admin.namespaces().setPersistence(namespace, policies); + +``` + +### List persistence policies + +You can see which persistence policy currently applies to a namespace. + +#### Pulsar-admin + +Use the [`get-persistence`](reference-pulsar-admin.md#namespaces-get-persistence) subcommand and specify the namespace.
+ +The following is an example: + +```shell + +$ pulsar-admin namespaces get-persistence my-tenant/my-ns +{ + "bookkeeperEnsemble": 1, + "bookkeeperWriteQuorum": 1, + "bookkeeperAckQuorum": 1, + "managedLedgerMaxMarkDeleteRate": 0 +} + +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/getPersistence?version=@pulsar:version_number@} + +#### Java + +```java + +PersistencePolicies policies = admin.namespaces().getPersistence(namespace); + +``` + +## How Pulsar uses ZooKeeper and BookKeeper + +This diagram illustrates the role of ZooKeeper and BookKeeper in a Pulsar cluster: + +![ZooKeeper and BookKeeper](/assets/pulsar-system-architecture.png) + +Each Pulsar cluster consists of one or more message brokers. Each broker relies on an ensemble of bookies. diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-cgo.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-cgo.md new file mode 100644 index 0000000000000..f352f942b7714 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-cgo.md @@ -0,0 +1,579 @@ +--- +id: client-libraries-cgo +title: Pulsar CGo client +sidebar_label: "CGo(deprecated)" +original_id: client-libraries-cgo +--- + +You can use Pulsar Go client to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang). + +All the methods in [producers](#producers), [consumers](#consumers), and [readers](#readers) of a Go client are thread-safe. + +Currently, the following Go clients are maintained in two repositories.
+ +| Language | Project | Maintainer | License | Description | +|----------|---------|------------|---------|-------------| +| CGo | [pulsar-client-go](https://github.com/apache/pulsar/tree/master/pulsar-client-go) | [Apache Pulsar](https://github.com/apache/pulsar) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | CGo client that depends on C++ client library | +| Go | [pulsar-client-go](https://github.com/apache/pulsar-client-go) | [Apache Pulsar](https://github.com/apache/pulsar) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | A native golang client | + +> **API docs available as well** +> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/pulsar/pulsar-client-go/pulsar). + +## Installation + +### Requirements + +Pulsar Go client library is based on the C++ client library. Follow +the instructions for [C++ library](client-libraries-cpp.md) for installing the binaries through [RPM](client-libraries-cpp.md#rpm), [Deb](client-libraries-cpp.md#deb) or [Homebrew packages](client-libraries-cpp.md#macos). + +### Install go package + +> **Compatibility Warning** +> The version number of the Go client **must match** the version number of the Pulsar C++ client library. + +You can install the `pulsar` library locally using `go get`. Note that `go get` doesn't support fetching a specific tag - it will always pull in master's version of the Go client. You'll need a C++ client library that matches master. + +```bash + +$ go get -u github.com/apache/pulsar/pulsar-client-go/pulsar + +``` + +Or you can use [dep](https://github.com/golang/dep) for managing the dependencies. 
+ +```bash + +$ dep ensure -add github.com/apache/pulsar/pulsar-client-go/pulsar@v@pulsar:version@ + +``` + +Once installed locally, you can import it into your project: + +```go + +import "github.com/apache/pulsar/pulsar-client-go/pulsar" + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look like something like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). Here's an example: + +```go + +import ( + "log" + "runtime" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + OperationTimeoutSeconds: 5, + MessageListenerThreads: runtime.NumCPU(), + }) + + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } +} + +``` + +The following configurable parameters are available for Pulsar clients: + +Parameter | Description | Default +:---------|:------------|:------- +`URL` | The connection URL for the Pulsar cluster. 
See [above](#urls) for more info | +`IOThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker) | 1 +`OperationTimeoutSeconds` | The timeout for some Go client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries will occur until this threshold is reached, at which point the operation will fail. | 30 +`MessageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)) | 1 +`ConcurrentLookupRequests` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 5000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 5000 +`Logger` | A custom logger implementation for the client (as a function that takes a log level, file path, line number, and message). All info, warn, and error messages will be routed to this function. | `nil` +`TLSTrustCertsFilePath` | The file path for the trusted TLS certificate | +`TLSAllowInsecureConnection` | Whether the client accepts untrusted TLS certificates from the broker | `false` +`Authentication` | Configure the authentication provider. (default: no authentication). Example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | `nil` +`StatsIntervalInSeconds` | The interval (in seconds) at which client stats are published | 60 + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object. 
Here's an example: + +```go + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", +}) + +if err != nil { + log.Fatalf("Could not instantiate Pulsar producer: %v", err) +} + +defer producer.Close() + +msg := pulsar.ProducerMessage{ + Payload: []byte("Hello, Pulsar"), +} + +if err := producer.Send(context.Background(), msg); err != nil { + log.Fatalf("Producer could not send message: %v", err) +} + +``` + +> **Blocking operation** +> When you create a new Pulsar producer, the operation will block (waiting on a go channel) until either a producer is successfully created or an error is thrown. + + +### Producer operations + +Pulsar Go producers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Fetches the producer's [topic](reference-terminology.md#topic)| `string` +`Name()` | Fetches the producer's name | `string` +`Send(context.Context, ProducerMessage)` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be thrown if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | `error` +`SendAndGetMsgID(context.Context, ProducerMessage)`| Send a message, this call will be blocking until is successfully acknowledged by the Pulsar broker. | (MessageID, error) +`SendAsync(context.Context, ProducerMessage, func(ProducerMessage, error))` | Publishes a [message](#messages) to the producer's topic asynchronously. The third argument is a callback function that specifies what happens either when the message is acknowledged or an error is thrown. | +`SendAndGetMsgIDAsync(context.Context, ProducerMessage, func(MessageID, error))`| Send a message in asynchronous mode. 
The callback will report back the message being published and the eventual error in publishing | +`LastSequenceID()` | Get the last sequence id that was published by this producer. This represents either the automatically assigned or custom sequence id (set on the ProducerMessage) that was published and acknowledged by the broker. | int64 +`Flush()`| Flush all the messages buffered in the client and wait until all messages have been successfully persisted. | error +`Close()` | Closes the producer and releases all resources allocated to it. If `Close()` is called then no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. | `error` +`Schema()` | | Schema + +Here's a more involved example usage of a producer: + +```go + +import ( + "context" + "fmt" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client to instantiate a producer + producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", + }) + + if err != nil { log.Fatal(err) } + + ctx := context.Background() + + // Send 10 messages synchronously and 10 messages asynchronously + for i := 0; i < 10; i++ { + // Create a message + msg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("message-%d", i)), + } + + // Attempt to send the message + if err := producer.Send(ctx, msg); err != nil { + log.Fatal(err) + } + + // Create a different message to send asynchronously + asyncMsg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("async-message-%d", i)), + } + + // Attempt to send the message asynchronously and handle the response + producer.SendAsync(ctx, asyncMsg, func(msg pulsar.ProducerMessage, err error) { + if err != nil {
log.Fatal(err) } + + fmt.Printf("the %s successfully published", string(msg.Payload)) + }) + } +} + +``` + +### Producer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer will publish messages | +`Name` | A name for the producer. If you don't explicitly assign a name, Pulsar will automatically generate a globally unique name that you can access later using the `Name()` method. If you choose to explicitly assign a name, it will need to be unique across *all* Pulsar clusters, otherwise the creation operation will throw an error. | +`Properties`| Attach a set of application defined properties to the producer. This properties will be visible in the topic stats | +`SendTimeout` | When publishing a message to a topic, the producer will wait for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). If a message is not acknowledged within the threshold set by this parameter, an error will be thrown. If you set `SendTimeout` to -1, the timeout will be set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30 seconds +`MaxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `Send` and `SendAsync` methods will fail *unless* `BlockIfQueueFull` is set to `true`. | +`MaxPendingMessagesAcrossPartitions` | Set the number of max pending messages across all the partitions. 
This setting will be used to lower the max pending messages for each partition `MaxPendingMessages(int)`, if the total exceeds the configured value.| +`BlockIfQueueFull` | If set to `true`, the producer's `Send` and `SendAsync` methods will block when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `MaxPendingMessages` parameter); if set to `false` (the default), `Send` and `SendAsync` operations will fail and throw a `ProducerQueueIsFullError` when the queue is full. | `false` +`MessageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics)). This logic is applied only when no key is set on messages. The available options are: round robin (`pulsar.RoundRobinDistribution`, the default), publishing all messages to a single partition (`pulsar.UseSinglePartition`), or a custom partitioning scheme (`pulsar.CustomPartition`). | `pulsar.RoundRobinDistribution` +`HashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `pulsar.JavaStringHash` (the equivalent of `String.hashCode()` in Java), `pulsar.Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `pulsar.BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library) | `pulsar.JavaStringHash` +`CompressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), [`ZLIB`](https://zlib.net/), [`ZSTD`](https://facebook.github.io/zstd/) and [`SNAPPY`](https://google.github.io/snappy/). | No compression +`MessageRouter` | By default, Pulsar uses a round-robin routing scheme for [partitioned topics](cookbooks-partitioned.md). 
The `MessageRouter` parameter enables you to specify custom routing logic via a function that takes the Pulsar message and topic metadata as arguments and returns an integer (the index of the partition to which the message is routed), i.e. a function signature of `func(Message, TopicMetadata) int`. | +`Batching` | Control whether automatic batching of messages is enabled for the producer. | false +`BatchingMaxPublishDelay` | Set the time period within which the messages sent will be batched (default: 1ms) if batch messages are enabled. If set to a non zero value, messages will be queued until this time interval has elapsed or until the `BatchingMaxMessages` threshold is reached | 1ms +`BatchingMaxMessages` | Set the maximum number of messages permitted in a batch. (default: 1000) If set to a value greater than 1, messages will be queued until this threshold is reached or batch interval has elapsed | 1000 + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. Here's a basic example that uses channels: + +```go + +msgChannel := make(chan pulsar.ConsumerMessage) + +consumerOpts := pulsar.ConsumerOptions{ + Topic: "my-topic", + SubscriptionName: "my-subscription-1", + Type: pulsar.Exclusive, + MessageChannel: msgChannel, +} + +consumer, err := client.Subscribe(consumerOpts) + +if err != nil { + log.Fatalf("Could not establish subscription: %v", err) +} + +defer consumer.Close() + +for cm := range msgChannel { + msg := cm.Message + + fmt.Printf("Message ID: %s", msg.ID()) + fmt.Printf("Message value: %s", string(msg.Payload())) + + consumer.Ack(msg) +} + +``` + +> **Blocking operation** +> When you create a new Pulsar consumer, the operation will block (on a go channel) until either a consumer is successfully created or an error is thrown. 
+ + +### Consumer operations + +Pulsar Go consumers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the consumer's [topic](reference-terminology.md#topic) | `string` +`Subscription()` | Returns the consumer's subscription name | `string` +`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Throws an error if the unsubscribe operation is somehow unsuccessful. | `error` +`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)` +`Ack(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) | `error` +`AckID(MessageID)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID | `error` +`AckCumulative(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `AckCumulative` method will block until the ack has been sent to the broker. After that, the messages will *not* be redelivered to the consumer. Cumulative acking cannot be used with a [shared](concepts-messaging.md#shared) subscription type. | `error` +`AckCumulativeID(MessageID)` | Acknowledges the reception of all the messages in the stream up to (and including) the provided message. This method will block until the acknowledgment has been sent to the broker. After that, the messages will not be re-delivered to this consumer. | `error` +`Nack(Message)` | Acknowledge the failure to process a single message. | `error` +`NackID(MessageID)` | Acknowledge the failure to process a single message. | `error` +`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | `error` +`RedeliverUnackedMessages()` | Redelivers *all* unacknowledged messages on the topic. 
In [failover](concepts-messaging.md#failover) mode, this request is ignored if the consumer isn't active on the specified topic; in [shared](concepts-messaging.md#shared) mode, redelivered messages are distributed across all consumers connected to the topic. **Note**: this is a *non-blocking* operation that doesn't throw an error. | +`Seek(msgID MessageID)` | Reset the subscription associated with this consumer to a specific message id. The message id can either be a specific message or represent the first or last messages in the topic. | error + +#### Receive example + +Here's an example usage of a Go consumer that uses the `Receive()` method to process incoming messages: + +```go + +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client object to instantiate a consumer + consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "my-golang-topic", + SubscriptionName: "sub-1", + Type: pulsar.Exclusive, + }) + + if err != nil { log.Fatal(err) } + + defer consumer.Close() + + ctx := context.Background() + + // Listen indefinitely on the topic + for { + msg, err := consumer.Receive(ctx) + if err != nil { log.Fatal(err) } + + // Do something with the message + err = processMessage(msg) + + if err == nil { + // Message processed successfully + consumer.Ack(msg) + } else { + // Failed to process messages + consumer.Nack(msg) + } + } +} + +``` + +### Consumer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the consumer will establish a subscription and listen for messages | +`Topics` | Specify a list of topics this consumer will subscribe on. 
Either a topic, a list of topics or a topics pattern are required when subscribing | +`TopicsPattern` | Specify a regular expression to subscribe to multiple topics under the same namespace. Either a topic, a list of topics or a topics pattern are required when subscribing | +`SubscriptionName` | The subscription name for this consumer | +`Properties` | Attach a set of application defined properties to the consumer. This properties will be visible in the topic stats| +`Name` | The name of the consumer | +`AckTimeout` | Set the timeout for unacked messages | 0 +`NackRedeliveryDelay` | The delay after which to redeliver the messages that failed to be processed. Default is 1min. (See `Consumer.Nack()`) | 1 minute +`Type` | Available options are `Exclusive`, `Shared`, and `Failover` | `Exclusive` +`SubscriptionInitPos` | InitialPosition at which the cursor will be set when subscribe | Latest +`MessageChannel` | The Go channel used by the consumer. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `Receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000 +`MaxTotalReceiverQueueSizeAcrossPartitions` |Set the max total receiver queue size across partitions. This setting will be used to reduce the receiver queue size for individual partitions if the total exceeds this value | 50000 +`ReadCompacted` | If enabled, the consumer will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the consumer will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal. 
| + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example: + +```go + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageId: pulsar.LatestMessage, +}) + +``` + +> **Blocking operation** +> When you create a new Pulsar reader, the operation will block (on a go channel) until either a reader is successfully created or an error is thrown. + + +### Reader operations + +Pulsar Go readers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the reader's [topic](reference-terminology.md#topic) | `string` +`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available. 
| `(Message, error)` +`HasNext()` | Check if there is any message available to read from the current position| (bool, error) +`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error` + +#### "Next" example + +Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages: + +```go + +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatalf("Could not create client: %v", err) } + + // Use the client to instantiate a reader + reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.EarliestMessage, + }) + + if err != nil { log.Fatalf("Could not create reader: %v", err) } + + defer reader.Close() + + ctx := context.Background() + + // Listen on the topic for incoming messages + for { + msg, err := reader.Next(ctx) + if err != nil { log.Fatalf("Error reading from topic: %v", err) } + + // Process the message + } +} + +``` + +In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. 
Here's an example: + +```go + +lastSavedId := // Read last saved message id from external store as byte[] + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: DeserializeMessageID(lastSavedId), +}) + +``` + +### Reader configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader will establish a subscription and listen for messages +`Name` | The name of the reader +`StartMessageID` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `pulsar.EarliestMessage` (the earliest available message on the topic), `pulsar.LatestMessage` (the latest available message on the topic), or a `MessageID` object for a position that isn't earliest or latest. | +`MessageChannel` | The Go channel used by the reader. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `Next`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 +`SubscriptionRolePrefix` | The subscription role prefix. | `reader` +`ReadCompacted` | If enabled, the reader will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the reader will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal.| + +## Messages + +The Pulsar Go client provides a `ProducerMessage` interface that you can use to construct messages to producer on Pulsar topics. 
Here's an example message: + +```go + +msg := pulsar.ProducerMessage{ + Payload: []byte("Here is some message data"), + Key: "message-key", + Properties: map[string]string{ + "foo": "bar", + }, + EventTime: time.Now(), + ReplicationClusters: []string{"cluster1", "cluster3"}, +} + +if err := producer.send(msg); err != nil { + log.Fatalf("Could not publish message due to: %v", err) +} + +``` + +The following parameters are available for `ProducerMessage` objects: + +Parameter | Description +:---------|:----------- +`Payload` | The actual data payload of the message +`Value` | `Value` and `Payload` are mutually exclusive; `Value interface{}` is used for schema messages. +`Key` | The optional key associated with the message (particularly useful for things like topic compaction) +`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message +`EventTime` | The timestamp associated with the message +`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. +`SequenceID` | Set the sequence id to assign to the current message + +## TLS encryption and authentication + +In order to use [TLS encryption](security-tls-transport.md), you'll need to configure your client to do so: + + * Use `pulsar+ssl` URL type + * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker + * Configure `Authentication` option + +Here's an example: + +```go + +opts := pulsar.ClientOptions{ + URL: "pulsar+ssl://my-cluster.com:6651", + TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr", + Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"), +} + +``` + +## Schema + +This example shows how to create a producer and consumer with schema. 
+ +```go + +var exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +jsonSchema := NewJsonSchema(exampleSchemaDef, nil) +// create producer +producer, err := client.CreateProducerWithSchema(ProducerOptions{ + Topic: "jsonTopic", +}, jsonSchema) +err = producer.Send(context.Background(), ProducerMessage{ + Value: &testJson{ + ID: 100, + Name: "pulsar", + }, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() +//create consumer +var s testJson +consumerJS := NewJsonSchema(exampleSchemaDef, nil) +consumer, err := client.SubscribeWithSchema(ConsumerOptions{ + Topic: "jsonTopic", + SubscriptionName: "sub-2", +}, consumerJS) +if err != nil { + log.Fatal(err) +} +msg, err := consumer.Receive(context.Background()) +if err != nil { + log.Fatal(err) +} +err = msg.GetValue(&s) +if err != nil { + log.Fatal(err) +} +fmt.Println(s.ID) // output: 100 +fmt.Println(s.Name) // output: pulsar +defer consumer.Close() + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-cpp.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-cpp.md new file mode 100644 index 0000000000000..49cbbf6738aad --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-cpp.md @@ -0,0 +1,405 @@ +--- +id: client-libraries-cpp +title: Pulsar C++ client +sidebar_label: "C++" +original_id: client-libraries-cpp +--- + +You can use Pulsar C++ client to create Pulsar producers and consumers in C++. + +All the methods in producer, consumer, and reader of a C++ client are thread-safe. + +## Supported platforms + +Pulsar C++ client is supported on **Linux** and **MacOS** platforms. + +[Doxygen](http://www.doxygen.nl/)-generated API docs for the C++ client are available [here](/api/cpp). 
+ +## System requirements + +You need to install the following components before using the C++ client: + +* [CMake](https://cmake.org/) +* [Boost](http://www.boost.org/) +* [Protocol Buffers](https://developers.google.com/protocol-buffers/) 2.6 +* [libcurl](https://curl.haxx.se/libcurl/) +* [Google Test](https://github.com/google/googletest) + +## Linux + +### Compilation + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies. + +```shell + +$ apt-get install cmake libssl-dev libcurl4-openssl-dev liblog4cxx-dev \ + libprotobuf-dev protobuf-compiler libboost-all-dev google-mock libgtest-dev libjsoncpp-dev + +``` + +3. Compile and install [Google Test](https://github.com/google/googletest). + +```shell + +# libgtest-dev version is 1.18.0 or above +$ cd /usr/src/googletest +$ sudo cmake . +$ sudo make +$ sudo cp ./googlemock/libgmock.a ./googlemock/gtest/libgtest.a /usr/lib/ + +# less than 1.18.0 +$ cd /usr/src/gtest +$ sudo cmake . +$ sudo make +$ sudo cp libgtest.a /usr/lib + +$ cd /usr/src/gmock +$ sudo cmake . +$ sudo make +$ sudo cp libgmock.a /usr/lib + +``` + +4. Compile the Pulsar client library for C++ inside the Pulsar repository. + +```shell + +$ cd pulsar-client-cpp +$ cmake . +$ make + +``` + +After you install the components successfully, the files `libpulsar.so` and `libpulsar.a` are in the `lib` folder of the repository. The tools `perfProducer` and `perfConsumer` are in the `perf` directory. + +### Install Dependencies + +> Since 2.1.0 release, Pulsar ships pre-built RPM and Debian packages. You can download and install those packages directly. + +After you download and install RPM or DEB, the `libpulsar.so`, `libpulsarnossl.so`, `libpulsar.a`, and `libpulsarwithdeps.a` libraries are in your `/usr/lib` directory. + +By default, they are built in code path `${PULSAR_HOME}/pulsar-client-cpp`. You can build with the command below. + + `cmake . 
-DBUILD_TESTS=OFF -DLINK_STATIC=ON && make pulsarShared pulsarSharedNossl pulsarStatic pulsarStaticWithDeps -j 3`. + +These libraries rely on some other libraries. If you want to get detailed version of dependencies, see [RPM](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/pkg/rpm/Dockerfile) or [DEB](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/pkg/deb/Dockerfile) files. + +1. `libpulsar.so` is a shared library, containing statically linked `boost` and `openssl`. It also dynamically links all other necessary libraries. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsar.so -I/usr/local/ssl/include + +``` + +2. `libpulsarnossl.so` is a shared library, similar to `libpulsar.so` except that the libraries `openssl` and `crypto` are dynamically linked. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsarnossl.so -lssl -lcrypto -I/usr/local/ssl/include -L/usr/local/ssl/lib + +``` + +3. `libpulsar.a` is a static library. You need to load dependencies before using this library. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsar.a -lssl -lcrypto -ldl -lpthread -I/usr/local/ssl/include -L/usr/local/ssl/lib -lboost_system -lboost_regex -lcurl -lprotobuf -lzstd -lz + +``` + +4. `libpulsarwithdeps.a` is a static library, based on `libpulsar.a`. It is archived in the dependencies of `libboost_regex`, `libboost_system`, `libcurl`, `libprotobuf`, `libzstd` and `libz`. You can use this Pulsar library with the command below. 
+ +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsarwithdeps.a -lssl -lcrypto -ldl -lpthread -I/usr/local/ssl/include -L/usr/local/ssl/lib + +``` + +The `libpulsarwithdeps.a` does not include library openssl related libraries `libssl` and `libcrypto`, because these two libraries are related to security. It is more reasonable and easier to use the versions provided by the local system to handle security issues and upgrade libraries. + +### Install RPM + +1. Download a RPM package from the links in the table. + +| Link | Crypto files | +|------|--------------| +| [client](@pulsar:dist_rpm:client@) | [asc](@pulsar:dist_rpm:client@.asc), [sha512](@pulsar:dist_rpm:client@.sha512) | +| [client-debuginfo](@pulsar:dist_rpm:client-debuginfo@) | [asc](@pulsar:dist_rpm:client-debuginfo@.asc), [sha512](@pulsar:dist_rpm:client-debuginfo@.sha512) | +| [client-devel](@pulsar:dist_rpm:client-devel@) | [asc](@pulsar:dist_rpm:client-devel@.asc), [sha512](@pulsar:dist_rpm:client-devel@.sha512) | + +2. Install the package using the following command. + +```bash + +$ rpm -ivh apache-pulsar-client*.rpm + +``` + +After you install RPM successfully, Pulsar libraries are in the `/usr/lib` directory. + +### Install Debian + +1. Download a Debian package from the links in the table. + +| Link | Crypto files | +|------|--------------| +| [client](@pulsar:deb:client@) | [asc](@pulsar:dist_deb:client@.asc), [sha512](@pulsar:dist_deb:client@.sha512) | +| [client-devel](@pulsar:deb:client-devel@) | [asc](@pulsar:dist_deb:client-devel@.asc), [sha512](@pulsar:dist_deb:client-devel@.sha512) | + +2. Install the package using the following command. + +```bash + +$ apt install ./apache-pulsar-client*.deb + +``` + +After you install DEB successfully, Pulsar libraries are in the `/usr/lib` directory. + +### Build + +> If you want to build RPM and Debian packages from the latest master, follow the instructions below. 
You should run all the instructions at the root directory of your cloned Pulsar repository. + +There are recipes that build RPM and Debian packages containing a +statically linked `libpulsar.so` / `libpulsarnossl.so` / `libpulsar.a` / `libpulsarwithdeps.a` with all required dependencies. + +To build the C++ library packages, you need to build the Java packages first. + +```shell + +mvn install -DskipTests + +``` + +#### RPM + +To build the RPM inside a Docker container, use the command below. The RPMs are in the `pulsar-client-cpp/pkg/rpm/RPMS/x86_64/` path. + +```shell + +pulsar-client-cpp/pkg/rpm/docker-build-rpm.sh + +``` + +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` and `libpulsarnossl.so` | +| pulsar-client-devel | Static library `libpulsar.a`, `libpulsarwithdeps.a`and C++ and C headers | +| pulsar-client-debuginfo | Debug symbols for `libpulsar.so` | + +#### Debian + +To build Debian packages, enter the following command. + +```shell + +pulsar-client-cpp/pkg/deb/docker-build-deb.sh + +``` + +Debian packages are created in the `pulsar-client-cpp/pkg/deb/BUILD/DEB/` path. + +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` and `libpulsarnossl.so` | +| pulsar-client-dev | Static library `libpulsar.a`, `libpulsarwithdeps.a` and C++ and C headers | + +## MacOS + +### Compilation + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies. + +```shell + +# OpenSSL installation +$ brew install openssl +$ export OPENSSL_INCLUDE_DIR=/usr/local/opt/openssl/include/ +$ export OPENSSL_ROOT_DIR=/usr/local/opt/openssl/ + +$ brew install protobuf boost boost-python log4cxx +# If you are using python3, you need to install boost-python3 + +# Google Test installation +$ git clone https://github.com/google/googletest.git +$ cd googletest +$ git checkout release-1.12.1 +$ cmake . +$ make install + +``` + +3. 
Compile the Pulsar client library in the repository that you cloned. + +```shell + +$ cd pulsar-client-cpp +$ cmake . +$ make + +``` + +### Install `libpulsar` + +Pulsar releases are available in the [Homebrew](https://brew.sh/) core repository. You can install the C++ client library with the following command. The package is installed with the library and headers. + +```shell + +brew install libpulsar + +``` + +## Connection URLs + +To connect Pulsar using client libraries, you need to specify a Pulsar protocol URL. + +Pulsar protocol URLs are assigned to specific clusters, you can use the Pulsar URI scheme. The default port is `6650`. The following is an example for localhost. + +```http + +pulsar://localhost:6650 + +``` + +In a Pulsar cluster in production, the URL looks as follows. + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you use TLS authentication, you need to add `ssl`, and the default port is `6651`. The following is an example. + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a consumer + +To use Pulsar as a consumer, you need to create a consumer on the C++ client. The following is an example. + +```c++ + +Client client("pulsar://localhost:6650"); + +Consumer consumer; +Result result = client.subscribe("my-topic", "my-subscription-name", consumer); +if (result != ResultOk) { + LOG_ERROR("Failed to subscribe: " << result); + return -1; +} + +Message msg; + +while (true) { + consumer.receive(msg); + LOG_INFO("Received: " << msg + << " with payload '" << msg.getDataAsString() << "'"); + + consumer.acknowledge(msg); +} + +client.close(); + +``` + +## Create a producer + +To use Pulsar as a producer, you need to create a producer on the C++ client. The following is an example. 
+ +```c++ + +Client client("pulsar://localhost:6650"); + +Producer producer; +Result result = client.createProducer("my-topic", producer); +if (result != ResultOk) { + LOG_ERROR("Error creating producer: " << result); + return -1; +} + +// Publish 10 messages to the topic +for (int i = 0; i < 10; i++){ + Message msg = MessageBuilder().setContent("my-message").build(); + Result res = producer.send(msg); + LOG_INFO("Message sent: " << res); +} +client.close(); + +``` + +## Enable authentication in connection URLs +If you use TLS authentication when connecting to Pulsar, you need to add `ssl` in the connection URLs, and the default port is `6651`. The following is an example. + +```cpp + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/cacert.pem"); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create( + "/path/to/client-cert.pem", "/path/to/client-key.pem")); + +Client client("pulsar+ssl://my-broker.com:6651", config); + +``` + +For complete examples, refer to [C++ client examples](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/examples). + +## Schema + +This section describes some examples about schema. For more information about schema, see [Pulsar schema](schema-get-started.md). + +### Create producer with Avro schema + +The following example shows how to create a producer with an Avro schema. + +```cpp + +static const std::string exampleSchema = + "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + "\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"}]}"; +Producer producer; +ProducerConfiguration producerConf; +producerConf.setSchema(SchemaInfo(AVRO, "Avro", exampleSchema)); +client.createProducer("topic-avro", producerConf, producer); + +``` + +### Create consumer with Avro schema + +The following example shows how to create a consumer with an Avro schema. 
+ +```cpp + +static const std::string exampleSchema = + "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + "\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"}]}"; +ConsumerConfiguration consumerConf; +Consumer consumer; +consumerConf.setSchema(SchemaInfo(AVRO, "Avro", exampleSchema)); +client.subscribe("topic-avro", "sub-2", consumerConf, consumer) + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-dotnet.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-dotnet.md new file mode 100644 index 0000000000000..b574fa0b2e5ed --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-dotnet.md @@ -0,0 +1,434 @@ +--- +id: client-libraries-dotnet +title: Pulsar C# client +sidebar_label: "C#" +original_id: client-libraries-dotnet +--- + +You can use the Pulsar C# client (DotPulsar) to create Pulsar producers and consumers in C#. All the methods in the producer, consumer, and reader of a C# client are thread-safe. The official documentation for DotPulsar is available [here](https://github.com/apache/pulsar-dotpulsar/wiki). + +## Installation + +You can install the Pulsar C# client library either through the dotnet CLI or through the Visual Studio. This section describes how to install the Pulsar C# client library through the dotnet CLI. For information about how to install the Pulsar C# client library through the Visual Studio , see [here](https://docs.microsoft.com/en-us/visualstudio/mac/nuget-walkthrough?view=vsmac-2019). + +### Prerequisites + +Install the [.NET Core SDK](https://dotnet.microsoft.com/download/), which provides the dotnet command-line tool. Starting in Visual Studio 2017, the dotnet CLI is automatically installed with any .NET Core related workloads. + +### Procedures + +To install the Pulsar C# client library, following these steps: + +1. Create a project. + + 1. Create a folder for the project. + + 2. 
Open a terminal window and switch to the new folder. + + 3. Create the project using the following command. + + ``` + + dotnet new console + + ``` + + 4. Use `dotnet run` to test that the app has been created properly. + +2. Add the DotPulsar NuGet package. + + 1. Use the following command to install the `DotPulsar` package. + + ``` + + dotnet add package DotPulsar + + ``` + + 2. After the command completes, open the `.csproj` file to see the added reference. + + ```xml + + + + + + ``` + +## Client + +This section describes some configuration examples for the Pulsar C# client. + +### Create client + +This example shows how to create a Pulsar C# client connected to localhost. + +```c# + +var client = PulsarClient.Builder().Build(); + +``` + +To create a Pulsar C# client by using the builder, you can specify the following options. + +| Option | Description | Default | +| ---- | ---- | ---- | +| ServiceUrl | Set the service URL for the Pulsar cluster. | pulsar://localhost:6650 | +| RetryInterval | Set the time to wait before retrying an operation or a reconnection. | 3s | + +### Create producer + +This section describes how to create a producer. + +- Create a producer by using the builder. + + ```c# + + var producer = client.NewProducer() + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a producer without using the builder. + + ```c# + + var options = new ProducerOptions("persistent://public/default/mytopic"); + var producer = client.CreateProducer(options); + + ``` + +### Create consumer + +This section describes how to create a consumer. + +- Create a consumer by using the builder. + + ```c# + + var consumer = client.NewConsumer() + .SubscriptionName("MySubscription") + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a consumer without using the builder. 
+ + ```c# + + var options = new ConsumerOptions("MySubscription", "persistent://public/default/mytopic"); + var consumer = client.CreateConsumer(options); + + ``` + +### Create reader + +This section describes how to create a reader. + +- Create a reader by using the builder. + + ```c# + + var reader = client.NewReader() + .StartMessageId(MessageId.Earliest) + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a reader without using the builder. + + ```c# + + var options = new ReaderOptions(MessageId.Earliest, "persistent://public/default/mytopic"); + var reader = client.CreateReader(options); + + ``` + +### Configure encryption policies + +The Pulsar C# client supports four kinds of encryption policies: + +- `EnforceUnencrypted`: always use unencrypted connections. +- `EnforceEncrypted`: always use encrypted connections. +- `PreferUnencrypted`: use unencrypted connections, if possible. +- `PreferEncrypted`: use encrypted connections, if possible. + +This example shows how to set the `EnforceEncrypted` encryption policy. + +```c# + +var client = PulsarClient.Builder() + .ConnectionSecurity(EncryptionPolicy.EnforceEncrypted) + .Build(); + +``` + +### Configure authentication + +Currently, the Pulsar C# client supports the TLS (Transport Layer Security) and JWT (JSON Web Token) authentication. + +If you have followed [Authentication using TLS](security-tls-authentication.md), you get a certificate and a key. To use them from the Pulsar C# client, follow these steps: + +1. Create an unencrypted and password-less pfx file. + + ```c# + + openssl pkcs12 -export -keypbe NONE -certpbe NONE -out admin.pfx -inkey admin.key.pem -in admin.cert.pem -passout pass: + + ``` + +2. Use the admin.pfx file to create an X509Certificate2 and pass it to the Pulsar C# client. 
+ + ```c# + + var clientCertificate = new X509Certificate2("admin.pfx"); + var client = PulsarClient.Builder() + .AuthenticateUsingClientCertificate(clientCertificate) + .Build(); + + ``` + +## Producer + +A producer is a process that attaches to a topic and publishes messages to a Pulsar broker for processing. This section describes some configuration examples about the producer. + +### Send data + +This example shows how to send data. + +```c# + +var data = Encoding.UTF8.GetBytes("Hello World"); +await producer.Send(data); + +``` + +### Send messages with customized metadata + +- Send messages with customized metadata by using the builder. + + ```c# + + var data = Encoding.UTF8.GetBytes("Hello World"); + var messageId = await producer.NewMessage() + .Property("SomeKey", "SomeValue") + .Send(data); + + ``` + +- Send messages with customized metadata without using the builder. + + ```c# + + var data = Encoding.UTF8.GetBytes("Hello World"); + var metadata = new MessageMetadata(); + metadata["SomeKey"] = "SomeValue"; + var messageId = await producer.Send(metadata, data); + + ``` + +## Consumer + +A consumer is a process that attaches to a topic through a subscription and then receives messages. This section describes some configuration examples about the consumer. + +### Receive messages + +This example shows how a consumer receives messages from a topic. + +```c# + +await foreach (var message in consumer.Messages()) +{ + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); +} + +``` + +### Acknowledge messages + +Messages can be acknowledged individually or cumulatively. For details about message acknowledgement, see [acknowledgement](concepts-messaging.md#acknowledgement). + +- Acknowledge messages individually. + + ```c# + + await foreach (var message in consumer.Messages()) + { + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); + } + + ``` + +- Acknowledge messages cumulatively. 
+ + ```c# + + await consumer.AcknowledgeCumulative(message); + + ``` + +### Unsubscribe from topics + +This example shows how a consumer unsubscribes from a topic. + +```c# + +await consumer.Unsubscribe(); + +``` + +#### Note + +> A consumer cannot be used and is disposed once the consumer unsubscribes from a topic. + +## Reader + +A reader is actually just a consumer without a cursor. This means that Pulsar does not keep track of your progress and there is no need to acknowledge messages. + +This example shows how a reader receives messages. + +```c# + +await foreach (var message in reader.Messages()) +{ + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); +} + +``` + +## Monitoring + +This section describes how to monitor the producer, consumer, and reader state. + +### Monitor producer + +The following table lists states available for the producer. + +| State | Description | +| ---- | ----| +| Closed | The producer or the Pulsar client has been disposed. | +| Connected | All is well. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | + +This example shows how to monitor the producer state. 
+ +```c# + +private static async ValueTask Monitor(IProducer producer, CancellationToken cancellationToken) +{ + var state = ProducerState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = await producer.StateChangedFrom(state, cancellationToken); + + var stateMessage = state switch + { + ProducerState.Connected => $"The producer is connected", + ProducerState.Disconnected => $"The producer is disconnected", + ProducerState.Closed => $"The producer has closed", + ProducerState.Faulted => $"The producer has faulted", + _ => $"The producer has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (producer.IsFinalState(state)) + return; + } +} + +``` + +### Monitor consumer state + +The following table lists states available for the consumer. + +| State | Description | +| ---- | ----| +| Active | All is well. | +| Inactive | All is well. The subscription type is `Failover` and you are not the active consumer. | +| Closed | The consumer or the Pulsar client has been disposed. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | +| ReachedEndOfTopic | No more messages are delivered. | + +This example shows how to monitor the consumer state. 
+ +```c# + +private static async ValueTask Monitor(IConsumer consumer, CancellationToken cancellationToken) +{ + var state = ConsumerState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = await consumer.StateChangedFrom(state, cancellationToken); + + var stateMessage = state switch + { + ConsumerState.Active => "The consumer is active", + ConsumerState.Inactive => "The consumer is inactive", + ConsumerState.Disconnected => "The consumer is disconnected", + ConsumerState.Closed => "The consumer has closed", + ConsumerState.ReachedEndOfTopic => "The consumer has reached end of topic", + ConsumerState.Faulted => "The consumer has faulted", + _ => $"The consumer has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (consumer.IsFinalState(state)) + return; + } +} + +``` + +### Monitor reader state + +The following table lists states available for the reader. + +| State | Description | +| ---- | ----| +| Closed | The reader or the Pulsar client has been disposed. | +| Connected | All is well. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | +| ReachedEndOfTopic | No more messages are delivered. | + +This example shows how to monitor the reader state. 
+ +```c# + +private static async ValueTask Monitor(IReader reader, CancellationToken cancellationToken) +{ + var state = ReaderState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = await reader.StateChangedFrom(state, cancellationToken); + + var stateMessage = state switch + { + ReaderState.Connected => "The reader is connected", + ReaderState.Disconnected => "The reader is disconnected", + ReaderState.Closed => "The reader has closed", + ReaderState.ReachedEndOfTopic => "The reader has reached end of topic", + ReaderState.Faulted => "The reader has faulted", + _ => $"The reader has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (reader.IsFinalState(state)) + return; + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-go.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-go.md new file mode 100644 index 0000000000000..d35738fce86f0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-go.md @@ -0,0 +1,885 @@ +--- +id: client-libraries-go +title: Pulsar Go client +sidebar_label: "Go" +original_id: client-libraries-go +--- + +> Tips: Currently, the CGo client will be deprecated, if you want to know more about the CGo client, please refer to [CGo client docs](client-libraries-cgo.md) + +You can use Pulsar [Go client](https://github.com/apache/pulsar-client-go) to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang). + +> **API docs available as well** +> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/pulsar-client-go/pulsar). + + +## Installation + +### Install go package + +You can install the `pulsar` library locally using `go get`. 
+ +```bash + +$ go get -u "github.com/apache/pulsar-client-go/pulsar" + +``` + +Once installed locally, you can import it into your project: + +```go + +import "github.com/apache/pulsar-client-go/pulsar" + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +If you have multiple brokers, you can set the URL as below. + +``` + +pulsar://localhost:6650,localhost:6651,localhost:6652 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look something like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). Here's an example: + +```go + +import ( + "log" + "time" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + OperationTimeout: 30 * time.Second, + ConnectionTimeout: 30 * time.Second, + }) + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } + + defer client.Close() +} + +``` + +If you have multiple brokers, you can initiate a client object as below. 
+ +```go + +import ( + "log" + "time" + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650,localhost:6651,localhost:6652", + OperationTimeout: 30 * time.Second, + ConnectionTimeout: 30 * time.Second, + }) + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } + + defer client.Close() +} + +``` + +The following configurable parameters are available for Pulsar clients: + + Name | Description | Default +| :-------- | :---------- |:---------- | +| URL | Configure the service URL for the Pulsar service.

    If you have multiple brokers, you can set multiple Pulsar cluster addresses for a client.

    This parameter is **required**. |None | +| ConnectionTimeout | Timeout for the establishment of a TCP connection | 30s | +| OperationTimeout| Set the operation timeout. Producer-create, subscribe and unsubscribe operations will be retried until this interval, after which the operation will be marked as failed| 30s| +| Authentication | Configure the authentication provider. Example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | no authentication | +| TLSTrustCertsFilePath | Set the path to the trusted TLS certificate file | | +| TLSAllowInsecureConnection | Configure whether the Pulsar client accept untrusted TLS certificate from broker | false | +| TLSValidateHostname | Configure whether the Pulsar client verify the validity of the host name from broker | false | +| ListenerName | Configure the net model for VPC users to connect to the Pulsar broker | | +| MaxConnectionsPerBroker | Max number of connections to a single broker that is kept in the pool | 1 | +| CustomMetricsLabels | Add custom labels to all the metrics reported by this client instance | | +| Logger | Configure the logger used by the client | logrus.StandardLogger | + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object. 
Here's an example: + +```go + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", +}) + +if err != nil { + log.Fatal(err) +} + +_, err = producer.Send(context.Background(), &pulsar.ProducerMessage{ + Payload: []byte("hello"), +}) + +defer producer.Close() + +if err != nil { + fmt.Println("Failed to publish message", err) +} +fmt.Println("Published message") + +``` + +### Producer operations + +Pulsar Go producers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Fetches the producer's [topic](reference-terminology.md#topic)| `string` +`Name()` | Fetches the producer's name | `string` +`Send(context.Context, *ProducerMessage)` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be thrown if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | (MessageID, error) +`SendAsync(context.Context, *ProducerMessage, func(MessageID, *ProducerMessage, error))`| Sends a message asynchronously. The callback is invoked with the message ID, the message, and any error once the publish completes. | +`LastSequenceID()` | Get the last sequence id that was published by this producer. This represents either the automatically assigned or custom sequence id (set on the ProducerMessage) that was published and acknowledged by the broker. | int64 +`Flush()`| Flush all the messages buffered in the client and wait until all messages have been successfully persisted. | error +`Close()` | Closes the producer and releases all resources allocated to it. If `Close()` is called then no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. 
| + +### Producer Example + +#### How to use message router in producer + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: serviceURL, +}) + +if err != nil { + log.Fatal(err) +} +defer client.Close() + +// Only subscribe on the specific partition +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "my-partitioned-topic-partition-2", + SubscriptionName: "my-sub", +}) + +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-partitioned-topic", + MessageRouter: func(msg *ProducerMessage, tm TopicMetadata) int { + fmt.Println("Routing message ", msg, " -- Partitions: ", tm.NumPartitions()) + return 2 + }, +}) + +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +``` + +#### How to use schema interface in producer + +```go + +type testJSON struct { + ID int `json:"id"` + Name string `json:"name"` +} + +``` + +```go + +var ( + exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +) + +``` + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +properties := make(map[string]string) +properties["pulsar"] = "hello" +jsonSchemaWithProperties := NewJSONSchema(exampleSchemaDef, properties) +producer, err := client.CreateProducer(ProducerOptions{ + Topic: "jsonTopic", + Schema: jsonSchemaWithProperties, +}) +assert.Nil(t, err) + +_, err = producer.Send(context.Background(), &ProducerMessage{ + Value: &testJSON{ + ID: 100, + Name: "pulsar", + }, +}) +if err != nil { + log.Fatal(err) +} +producer.Close() + +``` + +#### How to use delay relative in producer + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topicName := 
newTopicName() +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topicName, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: topicName, + SubscriptionName: "subName", + Type: Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +ID, err := producer.Send(context.Background(), &pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("test")), + DeliverAfter: 3 * time.Second, +}) +if err != nil { + log.Fatal(err) +} +fmt.Println(ID) + +ctx, canc := context.WithTimeout(context.Background(), 1*time.Second) +msg, err := consumer.Receive(ctx) +if err != nil { + log.Fatal(err) +} +fmt.Println(msg.Payload()) +canc() + +ctx, canc = context.WithTimeout(context.Background(), 5*time.Second) +msg, err = consumer.Receive(ctx) +if err != nil { + log.Fatal(err) +} +fmt.Println(msg.Payload()) +canc() + +``` + +### Producer configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specifies the topic this producer will publish on. This argument is required when constructing the producer. | | +| Name | Name specify a name for the producer. If not assigned, the system will generate a globally unique name which can be accessed with Producer.ProducerName(). | | +| Properties | Properties attach a set of application defined properties to the producer. These properties will be visible in the topic stats | | +| SendTimeout | SendTimeout set the timeout for a message that is not acknowledged by the server | 30s | +| DisableBlockIfQueueFull | DisableBlockIfQueueFull control whether Send and SendAsync block if producer's message queue is full | false | +| MaxPendingMessages| MaxPendingMessages set the max size of the queue holding the messages pending to receive an acknowledgment from the broker. 
| | +| HashingScheme | HashingScheme change the `HashingScheme` used to chose the partition on where to publish a particular message. | JavaStringHash | +| CompressionType | CompressionType set the compression type for the producer. | not compressed | +| CompressionLevel | Define the desired compression level. Options: Default, Faster and Better | Default | +| MessageRouter | MessageRouter set a custom message routing policy by passing an implementation of MessageRouter | | +| DisableBatching | DisableBatching control whether automatic batching of messages is enabled for the producer. | false | +| BatchingMaxPublishDelay | BatchingMaxPublishDelay set the time period within which the messages sent will be batched | 1ms | +| BatchingMaxMessages | BatchingMaxMessages set the maximum number of messages permitted in a batch. | 1000 | +| BatchingMaxSize | BatchingMaxSize sets the maximum number of bytes permitted in a batch. | 128KB | +| Schema | Schema set a custom schema type by passing an implementation of `Schema` | bytes[] | +| Interceptors | A chain of interceptors. These interceptors are called at some points defined in the `ProducerInterceptor` interface. | None | +| MaxReconnectToBroker | MaxReconnectToBroker set the maximum retry number of reconnectToBroker | ultimate | +| BatcherBuilderType | BatcherBuilderType sets the batch builder type. This is used to create a batch container when batching is enabled. Options: DefaultBatchBuilder and KeyBasedBatchBuilder | DefaultBatchBuilder | + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. 
Here's a basic example that uses the `Receive` method: + +```go + +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "topic-1", + SubscriptionName: "my-sub", + Type: pulsar.Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +for i := 0; i < 10; i++ { + msg, err := consumer.Receive(context.Background()) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Received message msgId: %#v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + + consumer.Ack(msg) +} + +if err := consumer.Unsubscribe(); err != nil { + log.Fatal(err) +} + +``` + +### Consumer operations + +Pulsar Go consumers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Subscription()` | Returns the consumer's subscription name | `string` +`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Returns an error if the unsubscribe operation is somehow unsuccessful. | `error` +`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)` +`Chan()` | Chan returns a channel from which to consume messages. | `<-chan ConsumerMessage` +`Ack(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) | +`AckID(MessageID)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID | +`ReconsumeLater(msg Message, delay time.Duration)` | ReconsumeLater marks a message for redelivery after custom delay | +`Nack(Message)` | Acknowledge the failure to process a single message. | +`NackID(MessageID)` | Acknowledge the failure to process a single message. | +`Seek(msgID MessageID)` | Reset the subscription associated with this consumer to a specific message id. The message id can either be a specific message or represent the first or last messages in the topic. 
| `error` +`SeekByTime(time time.Time)` | Reset the subscription associated with this consumer to a specific message publish time. | `error` +`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | +`Name()` | Name returns the name of consumer | `string` + +### Receive example + +#### How to use regex consumer + +```go + +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) + +defer client.Close() + +p, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topicInRegex, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer p.Close() + +topicsPattern := fmt.Sprintf("persistent://%s/foo.*", namespace) +opts := pulsar.ConsumerOptions{ + TopicsPattern: topicsPattern, + SubscriptionName: "regex-sub", +} +consumer, err := client.Subscribe(opts) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +``` + +#### How to use multi topics Consumer + +```go + +func newTopicName() string { + return fmt.Sprintf("my-topic-%v", time.Now().Nanosecond()) +} + + +topic1 := "topic-1" +topic2 := "topic-2" + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +topics := []string{topic1, topic2} +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topics: topics, + SubscriptionName: "multi-topic-sub", +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +``` + +#### How to use consumer listener + +```go + +import ( + "fmt" + "log" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{URL: "pulsar://localhost:6650"}) + if err != nil { + log.Fatal(err) + } + + defer client.Close() + + channel := make(chan pulsar.ConsumerMessage, 100) + + options := pulsar.ConsumerOptions{ + Topic: "topic-1", + SubscriptionName: "my-subscription", + Type: pulsar.Shared, + } + + options.MessageChannel = channel + + consumer, err := 
client.Subscribe(options) + if err != nil { + log.Fatal(err) + } + + defer consumer.Close() + + // Receive messages from channel. The channel returns a struct which contains message and the consumer from where + // the message was received. It's not necessary here since we have 1 single consumer, but the channel could be + // shared across multiple consumers as well + for cm := range channel { + msg := cm.Message + fmt.Printf("Received message msgId: %v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + + consumer.Ack(msg) + } +} + +``` + +#### How to use consumer receive timeout + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topic := "test-topic-with-no-messages" +ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) +defer cancel() + +// create consumer +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: topic, + SubscriptionName: "my-sub1", + Type: Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +msg, err := consumer.Receive(ctx) +fmt.Println(msg.Payload()) +if err != nil { + log.Fatal(err) +} + +``` + +#### How to use schema in consumer + +```go + +type testJSON struct { + ID int `json:"id"` + Name string `json:"name"` +} + +``` + +```go + +var ( + exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +) + +``` + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +var s testJSON + +consumerJS := NewJSONSchema(exampleSchemaDef, nil) +consumer, err := client.Subscribe(ConsumerOptions{ + Topic: "jsonTopic", + SubscriptionName: "sub-1", + Schema: consumerJS, + SubscriptionInitialPosition: SubscriptionPositionEarliest, +}) +assert.Nil(t, err) +msg, err 
:= consumer.Receive(context.Background()) +assert.Nil(t, err) +err = msg.GetSchemaValue(&s) +if err != nil { + log.Fatal(err) +} + +defer consumer.Close() + +``` + +### Consumer configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specify the topic this consumer will subscribe to. This argument is required when constructing the reader. | | +| Topics | Specify a list of topics this consumer will subscribe on. Either a topic, a list of topics or a topics pattern are required when subscribing| | +| TopicsPattern | Specify a regular expression to subscribe to multiple topics under the same namespace. Either a topic, a list of topics or a topics pattern are required when subscribing | | +| AutoDiscoveryPeriod | Specify the interval in which to poll for new partitions or new topics if using a TopicsPattern. | | +| SubscriptionName | Specify the subscription name for this consumer. This argument is required when subscribing | | +| Name | Set the consumer name | | +| Properties | Properties attach a set of application defined properties to the producer This properties will be visible in the topic stats | | +| Type | Select the subscription type to be used when subscribing to the topic. | Exclusive | +| SubscriptionInitialPosition | InitialPosition at which the cursor will be set when subscribe | Latest | +| DLQ | Configuration for Dead Letter Queue consumer policy. | no DLQ | +| MessageChannel | Sets a `MessageChannel` for the consumer. When a message is received, it will be pushed to the channel for consumption | | +| ReceiverQueueSize | Sets the size of the consumer receive queue. 
| 1000| +| NackRedeliveryDelay | The delay after which to redeliver the messages that failed to be processed | 1min | +| ReadCompacted | If enabled, the consumer will read messages from the compacted topic rather than reading the full message backlog of the topic | false | +| ReplicateSubscriptionState | Mark the subscription as replicated to keep it in sync across clusters | false | +| KeySharedPolicy | Configuration for Key Shared consumer policy. | | +| RetryEnable | Auto retry send messages to default filled DLQPolicy topics | false | +| Interceptors | A chain of interceptors. These interceptors are called at some points defined in the `ConsumerInterceptor` interface. | | +| MaxReconnectToBroker | MaxReconnectToBroker set the maximum retry number of reconnectToBroker. | ultimate | +| Schema | Schema set a custom schema type by passing an implementation of `Schema` | bytes[] | + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example: + +```go + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "topic-1", + StartMessageID: pulsar.EarliestMessageID(), +}) +if err != nil { + log.Fatal(err) +} +defer reader.Close() + +``` + +### Reader operations + +Pulsar Go readers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the reader's [topic](reference-terminology.md#topic) | `string` +`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available. 
| `(Message, error)` +`HasNext()` | Check if there is any message available to read from the current position| (bool, error) +`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error` +`Seek(MessageID)` | Reset the subscription associated with this reader to a specific message ID | `error` +`SeekByTime(time time.Time)` | Reset the subscription associated with this reader to a specific message publish time | `error` + +### Reader example + +#### How to use reader to read 'next' message + +Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages: + +```go + +import ( + "context" + "fmt" + "log" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{URL: "pulsar://localhost:6650"}) + if err != nil { + log.Fatal(err) + } + + defer client.Close() + + reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "topic-1", + StartMessageID: pulsar.EarliestMessageID(), + }) + if err != nil { + log.Fatal(err) + } + defer reader.Close() + + for reader.HasNext() { + msg, err := reader.Next(context.Background()) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Received message msgId: %#v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + } +} + +``` + +In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. 
Here's an example: + +```go + +lastSavedId := // Read last saved message id from external store as byte[] + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.DeserializeMessageID(lastSavedId), +}) + +``` + +#### How to use reader to read specific message + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: lookupURL, +}) + +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topic := "topic-1" +ctx := context.Background() + +// create producer +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topic, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +// send 10 messages +msgIDs := [10]MessageID{} +for i := 0; i < 10; i++ { + msgID, err := producer.Send(ctx, &pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("hello-%d", i)), + }) + assert.NoError(t, err) + assert.NotNil(t, msgID) + msgIDs[i] = msgID +} + +// create reader on 5th message (not included) +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: topic, + StartMessageID: msgIDs[4], +}) + +if err != nil { + log.Fatal(err) +} +defer reader.Close() + +// receive the remaining 5 messages +for i := 5; i < 10; i++ { + msg, err := reader.Next(context.Background()) + if err != nil { + log.Fatal(err) +} + +// create reader on 5th message (included) +readerInclusive, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: topic, + StartMessageID: msgIDs[4], + StartMessageIDInclusive: true, +}) + +if err != nil { + log.Fatal(err) +} +defer readerInclusive.Close() + +``` + +### Reader configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specify the topic this consumer will subscribe to. This argument is required when constructing the reader. | | +| Name | Name set the reader name. | | +| Properties | Attach a set of application defined properties to the reader. 
This properties will be visible in the topic stats | | +| StartMessageID | StartMessageID initial reader positioning is done by specifying a message id. | | +| StartMessageIDInclusive | If true, the reader will start at the `StartMessageID`, included. Default is `false` and the reader will start from the "next" message | false | +| MessageChannel | MessageChannel sets a `MessageChannel` for the consumer When a message is received, it will be pushed to the channel for consumption| | +| ReceiverQueueSize | ReceiverQueueSize sets the size of the consumer receive queue. | 1000 | +| SubscriptionRolePrefix| SubscriptionRolePrefix set the subscription role prefix. | “reader” | +| ReadCompacted | If enabled, the reader will read messages from the compacted topic rather than reading the full message backlog of the topic. ReadCompacted can only be enabled when reading from a persistent topic. | false| + +## Messages + +The Pulsar Go client provides a `ProducerMessage` interface that you can use to construct messages to producer on Pulsar topics. Here's an example message: + +```go + +msg := pulsar.ProducerMessage{ + Payload: []byte("Here is some message data"), + Key: "message-key", + Properties: map[string]string{ + "foo": "bar", + }, + EventTime: time.Now(), + ReplicationClusters: []string{"cluster1", "cluster3"}, +} + +if _, err := producer.send(msg); err != nil { + log.Fatalf("Could not publish message due to: %v", err) +} + +``` + +The following methods parameters are available for `ProducerMessage` objects: + +Parameter | Description +:---------|:----------- +`Payload` | The actual data payload of the message +`Value` | Value and payload is mutually exclusive, `Value interface{}` for schema message. +`Key` | The optional key associated with the message (particularly useful for things like topic compaction) +`OrderingKey` | OrderingKey sets the ordering key of the message. 
+`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message +`EventTime` | The timestamp associated with the message +`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. +`SequenceID` | Set the sequence id to assign to the current message +`DeliverAfter` | Request to deliver the message only after the specified relative delay +`DeliverAt` | Deliver the message only at or after the specified absolute timestamp + +## TLS encryption and authentication + +In order to use [TLS encryption](security-tls-transport.md), you'll need to configure your client to do so: + + * Use `pulsar+ssl` URL type + * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker + * Configure `Authentication` option + +Here's an example: + +```go + +opts := pulsar.ClientOptions{ + URL: "pulsar+ssl://my-cluster.com:6651", + TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr", + Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"), +} + +``` + +## OAuth2 authentication + +To use [OAuth2 authentication](security-oauth2.md), you'll need to configure your client to perform the following operations. +This example shows how to configure OAuth2 authentication. 
+ +```go + +oauth := pulsar.NewAuthenticationOAuth2(map[string]string{ + "type": "client_credentials", + "issuerUrl": "https://dev-kt-aa9ne.us.auth0.com", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "privateKey": "/path/to/privateKey", + "clientId": "0Xx...Yyxeny", + }) +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://my-cluster:6650", + Authentication: oauth, +}) + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-java.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-java.md new file mode 100644 index 0000000000000..a485a198d81d2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-java.md @@ -0,0 +1,1035 @@ +--- +id: client-libraries-java +title: Pulsar Java client +sidebar_label: "Java" +original_id: client-libraries-java +--- + +You can use Pulsar Java client to create Java [producer](#producer), [consumer](#consumer), and [readers](#reader-interface) of messages and to perform [administrative tasks](admin-api-overview.md). The current version of the Java client is **@pulsar:version@**. + +All the methods in [producer](#producer), [consumer](#consumer), and [reader](#reader) of a Java client are thread-safe. + +Javadoc for the Pulsar client is divided into two domains by package as follows. + +Package | Description | Maven Artifact +:-------|:------------|:-------------- +[`org.apache.pulsar.client.api`](/api/client) | The producer and consumer API | [org.apache.pulsar:pulsar-client:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C@pulsar:version@%7Cjar) +[`org.apache.pulsar.client.admin`](/api/admin) | The Java [admin API](admin-api-overview.md) | [org.apache.pulsar:pulsar-client-admin:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-admin%7C@pulsar:version@%7Cjar) +`org.apache.pulsar.client.all` |Includes both `pulsar-client` and `pulsar-client-admin`

    Both `pulsar-client` and `pulsar-client-admin` are shaded packages and they shade dependencies independently. Consequently, the applications using both `pulsar-client` and `pulsar-client-admin` have redundant shaded classes. It would be troublesome if you introduce new dependencies but forget to update shading rules.

    In this case, you can use `pulsar-client-all`, which shades dependencies only one time and reduces the size of dependencies. |[org.apache.pulsar:pulsar-client-all:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-all%7C@pulsar:version@%7Cjar) + +This document focuses only on the client API for producing and consuming messages on Pulsar topics. For how to use the Java admin client, see [Pulsar admin interface](admin-api-overview.md). + +## Installation + +The latest version of the Pulsar Java client library is available via [Maven Central](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C@pulsar:version@%7Cjar). To use the latest version, add the `pulsar-client` library to your build configuration. + +### Maven + +If you use Maven, add the following information to the `pom.xml` file. + +```xml + + +@pulsar:version@ + + + + org.apache.pulsar + pulsar-client + ${pulsar.version} + + +``` + +### Gradle + +If you use Gradle, add the following information to the `build.gradle` file. + +```groovy + +def pulsarVersion = '@pulsar:version@' + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-client', version: pulsarVersion +} + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +You can assign Pulsar protocol URLs to specific clusters and use the `pulsar` scheme. The default port is `6650`. The following is an example of `localhost`. + +```http + +pulsar://localhost:6650 + +``` + +If you have multiple brokers, the URL is as follows. + +```http + +pulsar://localhost:6550,localhost:6651,localhost:6652 + +``` + +A URL for a production Pulsar cluster is as follows. + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you use [TLS](security-tls-authentication.md) authentication, the URL is as follows. 
+ +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Client + +You can instantiate a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object using just a URL for the target Pulsar [cluster](reference-terminology.md#cluster) like this: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +``` + +If you have multiple brokers, you can initiate a PulsarClient like this: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650,localhost:6651,localhost:6652") + .build(); + +``` + +> ### Default broker URLs for standalone clusters +> If you run a cluster in [standalone mode](getting-started-standalone.md), the broker is available at the `pulsar://localhost:6650` URL by default. + +If you create a client, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +| Type | Name |
    Description
    | Default +|---|---|---|--- +String | `serviceUrl` |Service URL provider for Pulsar service | None +String | `authPluginClassName` | Name of the authentication plugin | None +String | `authParams` | String represents parameters for the authentication plugin

    **Example**
    key1:val1,key2:val2|None +long|`operationTimeoutMs`|Operation timeout |30000 +long|`statsIntervalSeconds`|Interval between each stats info

    Stats is activated with positive `statsInterval`

    Set `statsIntervalSeconds` to 1 second at least |60 +int|`numIoThreads`| The number of threads used for handling connections to brokers | 1 +int|`numListenerThreads`|The number of threads used for handling message listeners. The listener thread pool is shared across all the consumers and readers using the "listener" model to get messages. For a given consumer, the listener is always invoked from the same thread to ensure ordering. If you want multiple threads to process a single topic, you need to create a [`shared`](https://pulsar.apache.org/docs/en/next/concepts-messaging/#shared) subscription and multiple consumers for this subscription. This does not ensure ordering.| 1 +boolean|`useTcpNoDelay`|Whether to use TCP no-delay flag on the connection to disable Nagle algorithm |true +boolean |`useTls` |Whether to use TLS encryption on the connection| false +string | `tlsTrustCertsFilePath` |Path to the trusted TLS certificate file|None +boolean|`tlsAllowInsecureConnection`|Whether the Pulsar client accepts untrusted TLS certificate from broker | false +boolean | `tlsHostnameVerificationEnable` | Whether to enable TLS hostname verification|false +int|`concurrentLookupRequest`|The number of concurrent lookup requests allowed to send on each broker connection to prevent overload on broker|5000 +int|`maxLookupRequest`|The maximum number of lookup requests allowed on each broker connection to prevent overload on broker | 50000 +int|`maxNumberOfRejectedRequestPerConnection`|The maximum number of rejected requests of a broker in a certain time frame (30 seconds) after the current connection is closed and the client creates a new connection to connect to a different broker|50 +int|`keepAliveIntervalSeconds`|Seconds of keeping alive interval for each client broker connection|30 +int|`connectionTimeoutMs`|Duration of waiting for a connection to a broker to be established

    If the duration passes without a response from a broker, the connection attempt is dropped|10000 +int|`requestTimeoutMs`|Maximum duration for completing a request |60000 +int|`defaultBackoffIntervalNanos`| Default duration for a backoff interval | TimeUnit.MILLISECONDS.toNanos(100); +long|`maxBackoffIntervalNanos`|Maximum duration for a backoff interval|TimeUnit.SECONDS.toNanos(30) + +Check out the Javadoc for the {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} class for a full list of configurable parameters. + +> In addition to client-level configuration, you can also apply [producer](#configuring-producers) and [consumer](#configuring-consumers) specific configuration as described in sections below. + +## Producer + +In Pulsar, producers write messages to topics. Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object (as in the section [above](#client-configuration)), you can create a {@inject: javadoc:Producer:/client/org/apache/pulsar/client/api/Producer} for a specific Pulsar [topic](reference-terminology.md#topic). + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .create(); + +// You can then send messages to the broker and topic you specified: +producer.send("My message".getBytes()); + +``` + +By default, producers produce messages that consist of byte arrays. You can produce different types by specifying a message [schema](#schemas). + +```java + +Producer stringProducer = client.newProducer(Schema.STRING) + .topic("my-topic") + .create(); +stringProducer.send("My message"); + +``` + +> Make sure that you close your producers, consumers, and clients when you do not need them. 
+ +> ```java +> +> producer.close(); +> consumer.close(); +> client.close(); +> +> +> ``` + +> +> Close operations can also be asynchronous: + +> ```java +> +> producer.closeAsync() +> .thenRun(() -> System.out.println("Producer closed")) +> .exceptionally((ex) -> { +> System.err.println("Failed to close producer: " + ex); +> return null; +> }); +> +> +> ``` + + +### Configure producer + +If you instantiate a `Producer` object by specifying only a topic name as the example above, use the default configuration for producer. + +If you create a producer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +Type | Name|
    Description
    | Default +|---|---|---|--- +String| `topicName`| Topic name| null| +String|`producerName`|Producer name| null +long|`sendTimeoutMs`|Message send timeout in ms.

    If a message is not acknowledged by a server before the `sendTimeout` expires, an error occurs.|30000 +boolean|`blockIfQueueFull`|If it is set to `true`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer block, rather than failing and throwing errors.

    If it is set to `false`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer fail and `ProducerQueueIsFullError` exceptions occur.

    The `MaxPendingMessages` parameter determines the size of the outgoing message queue.|false +int|`maxPendingMessages`|The maximum size of a queue holding pending messages.

    For example, a message waiting to receive an acknowledgment from a [broker](reference-terminology.md#broker).

    By default, when the queue is full, all calls to the `Send` and `SendAsync` methods fail **unless** you set `BlockIfQueueFull` to `true`.|1000 +int|`maxPendingMessagesAcrossPartitions`|The maximum number of pending messages across partitions.

    Use the setting to lower the max pending messages for each partition ({@link #setMaxPendingMessages(int)}) if the total number exceeds the configured value.|50000 +MessageRoutingMode|`messageRoutingMode`|Message routing logic for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics).

    Apply the logic only when setting no key on messages.

    Available options are as follows:

  • `pulsar.RoundRobinDistribution`: round robin

  • `pulsar.UseSinglePartition`: publish all messages to a single partition

  • `pulsar.CustomPartition`: a custom partitioning scheme
  • |`pulsar.RoundRobinDistribution` +HashingScheme|`hashingScheme`|Hashing function determining the partition where you publish a particular message (**partitioned topics only**).

    Available options are as follows:

  • `pulsar.JavaStringHash`: the equivalent of `String.hashCode()` in Java

  • `pulsar.Murmur3_32Hash`: applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function

  • `pulsar.BoostHash`: applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library
  • |`HashingScheme.JavaStringHash` +ProducerCryptoFailureAction|`cryptoFailureAction`|Producer should take action when encryption fails.

  • **FAIL**: if encryption fails, unencrypted messages fail to send.

  • **SEND**: if encryption fails, unencrypted messages are sent.
  • |`ProducerCryptoFailureAction.FAIL` +long|`batchingMaxPublishDelayMicros`|Batching time period of sending messages.|TimeUnit.MILLISECONDS.toMicros(1) +int|batchingMaxMessages|The maximum number of messages permitted in a batch.|1000 +boolean|`batchingEnabled`|Enable batching of messages. |true +CompressionType|`compressionType`|Message data compression type used by a producer.

    Available options:
  • [`LZ4`](https://github.com/lz4/lz4)
  • [`ZLIB`](https://zlib.net/)
  • [`ZSTD`](https://facebook.github.io/zstd/)
  • [`SNAPPY`](https://google.github.io/snappy/)
  • | No compression + +You can configure parameters if you do not want to use the default configuration. + +For a full list, see the Javadoc for the {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder} class. The following is an example. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .batchingMaxPublishDelay(10, TimeUnit.MILLISECONDS) + .sendTimeout(10, TimeUnit.SECONDS) + .blockIfQueueFull(true) + .create(); + +``` + +### Message routing + +When using partitioned topics, you can specify the routing mode whenever you publish messages using a producer. For more information on specifying a routing mode using the Java client, see the [Partitioned Topics](cookbooks-partitioned.md) cookbook. + +### Async send + +You can publish messages [asynchronously](concepts-messaging.md#send-modes) using the Java client. With async send, the producer puts the message in a blocking queue and returns it immediately. Then the client library sends the message to the broker in the background. If the queue is full (max size configurable), the producer is blocked or fails immediately when calling the API, depending on arguments passed to the producer. + +The following is an example. + +```java + +producer.sendAsync("my-async-message".getBytes()).thenAccept(msgId -> { + System.out.println("Message with ID " + msgId + " successfully sent"); +}); + +``` + +As you can see from the example above, async send operations return a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId} wrapped in a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). 
+ +### Configure messages + +In addition to a value, you can set additional items on a given message: + +```java + +producer.newMessage() + .key("my-message-key") + .value("my-async-message".getBytes()) + .property("my-key", "my-value") + .property("my-other-key", "my-other-value") + .send(); + +``` + +You can terminate the builder chain with `sendAsync()` and get a future return. + +## Consumer + +In Pulsar, consumers subscribe to topics and handle messages that producers publish to those topics. You can instantiate a new [consumer](reference-terminology.md#consumer) by first instantiating a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object and passing it a URL for a Pulsar broker (as [above](#client-configuration)). + +Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object, you can create a {@inject: javadoc:Consumer:/client/org/apache/pulsar/client/api/Consumer} by specifying a [topic](reference-terminology.md#topic) and a [subscription](concepts-messaging.md#subscription-modes). + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscribe(); + +``` + +The `subscribe` method will auto subscribe the consumer to the specified topic and subscription. One way to make the consumer listen on the topic is to set up a `while` loop. In this example loop, the consumer listens for messages, prints the contents of any received message, and then [acknowledges](reference-terminology.md#acknowledgment-ack) that the message has been processed. If the processing logic fails, you can use [negative acknowledgement](reference-terminology.md#acknowledgment-ack) to redeliver the message later. 
+ +```java + +while (true) { + // Wait for a message + Message msg = consumer.receive(); + + try { + // Do something with the message + System.out.println("Message received: " + new String(msg.getData())); + + // Acknowledge the message so that it can be deleted by the message broker + consumer.acknowledge(msg); + } catch (Exception e) { + // Message failed to process, redeliver later + consumer.negativeAcknowledge(msg); + } +} + +``` + +If you don't want to block your main thread and rather listen constantly for new messages, consider using a `MessageListener`. + +```java + +MessageListener myMessageListener = (consumer, msg) -> { + try { + System.out.println("Message received: " + new String(msg.getData())); + consumer.acknowledge(msg); + } catch (Exception e) { + consumer.negativeAcknowledge(msg); + } +} + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .messageListener(myMessageListener) + .subscribe(); + +``` + +### Configure consumer + +If you instantiate a `Consumer` object by specifying only a topic and subscription name as in the example above, the consumer uses the default configuration. + +When you create a consumer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +Type | Name|
    Description
    | Default +|---|---|---|--- +Set<String>| `topicNames`| Topic name| Sets.newTreeSet() +Pattern| `topicsPattern`| Topic pattern |None +String| `subscriptionName`| Subscription name| None +SubscriptionType| `subscriptionType`| Subscription type

    Four subscription types are available:
  • Exclusive
  • Failover
  • Shared
  • Key_Shared
  • |SubscriptionType.Exclusive +int | `receiverQueueSize` | Size of a consumer's receiver queue.

    For example, the number of messages accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.| 1000 +long|`acknowledgementsGroupTimeMicros`|Group a consumer acknowledgment for a specified time.

    By default, a consumer uses 100ms grouping time to send out acknowledgments to a broker.

    Setting a group time of 0 sends out acknowledgments immediately.

    A longer ack group time is more efficient at the expense of a slight increase in message re-deliveries after a failure.|TimeUnit.MILLISECONDS.toMicros(100) +long|`negativeAckRedeliveryDelayMicros`|Delay to wait before redelivering messages that failed to be processed.

    When an application uses {@link Consumer#negativeAcknowledge(Message)}, failed messages are redelivered after a fixed timeout. |TimeUnit.MINUTES.toMicros(1) +int |`maxTotalReceiverQueueSizeAcrossPartitions`|The max total receiver queue size across partitions.

    This setting reduces the receiver queue size for individual partitions if the total receiver queue size exceeds this value.|50000 +String|`consumerName`|Consumer name|null +long|`ackTimeoutMillis`|Timeout of unacked messages|0 +long|`tickDurationMillis`|Granularity of the ack-timeout redelivery.

    Using a higher `tickDurationMillis` reduces the memory overhead to track messages when setting ack-timeout to a bigger value (for example, 1 hour).|1000 +int|`priorityLevel`|Priority level for a consumer to which a broker gives more priority while dispatching messages in Shared subscription type.

    The broker follows descending priorities. For example, 0=max-priority, 1, 2,...

    In shared subscription type, the broker **first dispatches messages to the max priority level consumers if they have permits**. Otherwise, the broker considers next priority level consumers.

    **Example 1**

    If a subscription has consumerA with `priorityLevel` 0 and consumerB with `priorityLevel` 1, then the broker **only dispatches messages to consumerA until it runs out of permits** and then starts dispatching messages to consumerB.

    **Example 2**

    Consumer, Priority Level, Permits
    C1, 0, 2
    C2, 0, 1
    C3, 0, 1
    C4, 1, 2
    C5, 1, 1

    Order in which a broker dispatches messages to consumers is: C1, C2, C3, C1, C4, C5, C4.|0 +ConsumerCryptoFailureAction|`cryptoFailureAction`|Consumer should take action when it receives a message that can not be decrypted.

  • **FAIL**: this is the default option to fail messages until crypto succeeds.

  • **DISCARD**: silently acknowledge the message and do not deliver it to the application.

  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

    If the message is also compressed, decompression fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in the batch.

    The delivered encrypted message contains an {@link EncryptionContext}, which holds the encryption and compression information that the application can use to decrypt the consumed message payload.
  • |
  • ConsumerCryptoFailureAction.FAIL
  • +SortedMap|`properties`|A name or value property of this consumer.

    `properties` is application defined metadata attached to a consumer.

    When getting a topic stats, associate this metadata with the consumer stats for easier identification.|new TreeMap() +boolean|`readCompacted`|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic message backlog that has been compacted. Beyond that point, messages are sent as normal.

    You can enable `readCompacted` only on subscriptions to persistent topics that have a single active consumer (such as failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +SubscriptionInitialPosition|`subscriptionInitialPosition`|Initial position at which to set the cursor when subscribing to a topic for the first time.|SubscriptionInitialPosition.Latest +int|`patternAutoDiscoveryPeriod`|Topic auto discovery period when using a pattern for topic's consumer.

    The default and minimum value is 1 minute.|1 +RegexSubscriptionMode|`regexSubscriptionMode`|When subscribing to a topic using a regular expression, you can pick a certain type of topics.

  • **PersistentOnly**: only subscribe to persistent topics.

  • **NonPersistentOnly**: only subscribe to non-persistent topics.

  • **AllTopics**: subscribe to both persistent and non-persistent topics.
  • |RegexSubscriptionMode.PersistentOnly +DeadLetterPolicy|`deadLetterPolicy`|Dead letter policy for consumers.

    By default, some messages may be redelivered many times, possibly without ever stopping.

    By using the dead letter mechanism, messages have the max redelivery count. **When exceeding the maximum number of redeliveries, messages are sent to the Dead Letter Topic and acknowledged automatically**.

    You can enable the dead letter mechanism by setting `deadLetterPolicy`.

    **Example**

    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10).build())
    .subscribe();


    Default dead letter topic name is `{TopicName}-{Subscription}-DLQ`.

    To set a custom dead letter topic name:
    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10)
    .deadLetterTopic("your-topic-name").build())
    .subscribe();


    When specifying the dead letter policy while not specifying `ackTimeoutMillis`, you can set the ack timeout to 30000 milliseconds.|None +boolean|`autoUpdatePartitions`|If `autoUpdatePartitions` is enabled, a consumer automatically subscribes to newly added partitions.

    **Note**: this is only for partitioned consumers.|true +boolean|`replicateSubscriptionState`|If `replicateSubscriptionState` is enabled, a subscription state is replicated to geo-replicated clusters.|false + +You can configure parameters if you do not want to use the default configuration. For a full list, see the Javadoc for the {@inject: javadoc:ConsumerBuilder:/client/org/apache/pulsar/client/api/ConsumerBuilder} class. + +The following is an example. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .ackTimeout(10, TimeUnit.SECONDS) + .subscriptionType(SubscriptionType.Exclusive) + .subscribe(); + +``` + +### Async receive + +The `receive` method receives messages synchronously (the consumer process is blocked until a message is available). You can also use [async receive](concepts-messaging.md#receive-modes), which returns a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) object immediately once a new message is available. + +The following is an example. + +```java + +CompletableFuture asyncMessage = consumer.receiveAsync(); + +``` + +Async receive operations return a {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} wrapped inside of a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). + +### Batch receive + +Use `batchReceive` to receive multiple messages for each call. + +The following is an example. + +```java + +Messages messages = consumer.batchReceive(); +for (Object message : messages) { + // do something +} +consumer.acknowledge(messages) + +``` + +:::note + +Batch receive policy limits the number and bytes of messages in a single batch. You can specify a timeout to wait for enough messages. +The batch receive is completed if any of the following condition is met: enough number of messages, bytes of messages, wait timeout. 
+ +```java + +Consumer consumer = client.newConsumer() +.topic("my-topic") +.subscriptionName("my-subscription") +.batchReceivePolicy(BatchReceivePolicy.builder() +.maxNumMessages(100) +.maxNumBytes(1024 * 1024) +.timeout(200, TimeUnit.MILLISECONDS) +.build()) +.subscribe(); + +``` + +The default batch receive policy is: + +```java + +BatchReceivePolicy.builder() +.maxNumMessage(-1) +.maxNumBytes(10 * 1024 * 1024) +.timeout(100, TimeUnit.MILLISECONDS) +.build(); + +``` + +::: + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously using [multi-topic subscriptions](concepts-messaging.md#multi-topic-subscriptions). To use multi-topic subscriptions you can supply either a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. + +The followings are some examples. + +```java + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +ConsumerBuilder consumerBuilder = pulsarClient.newConsumer() + .subscriptionName(subscription); + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("public/default/.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(allTopicsInNamespace) + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("public/default/foo.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(someTopicsInNamespace) + .subscribe(); + +``` + +In the above example, the consumer subscribes to the `persistent` topics that can match the topic name pattern. 
If you want the consumer subscribes to all `persistent` and `non-persistent` topics that can match the topic name pattern, set `subscriptionTopicsMode` to `RegexSubscriptionMode.AllTopics`. + +```java + +Pattern pattern = Pattern.compile("public/default/.*"); +pulsarClient.newConsumer() + .subscriptionName("my-sub") + .topicsPattern(pattern) + .subscriptionTopicsMode(RegexSubscriptionMode.AllTopics) + .subscribe(); + +``` + +:::note + +By default, the `subscriptionTopicsMode` of the consumer is `PersistentOnly`. Available options of `subscriptionTopicsMode` are `PersistentOnly`, `NonPersistentOnly`, and `AllTopics`. + +::: + +You can also subscribe to an explicit list of topics (across namespaces if you wish): + +```java + +List topics = Arrays.asList( + "topic-1", + "topic-2", + "topic-3" +); + +Consumer multiTopicConsumer = consumerBuilder + .topics(topics) + .subscribe(); + +// Alternatively: +Consumer multiTopicConsumer = consumerBuilder + .topic( + "topic-1", + "topic-2", + "topic-3" + ) + .subscribe(); + +``` + +You can also subscribe to multiple topics asynchronously using the `subscribeAsync` method rather than the synchronous `subscribe` method. The following is an example. + +```java + +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default.*"); +consumerBuilder + .topics(topics) + .subscribeAsync() + .thenAccept(this::receiveMessageFromConsumer); + +private void receiveMessageFromConsumer(Object consumer) { + ((Consumer)consumer).receiveAsync().thenAccept(message -> { + // Do something with the received message + receiveMessageFromConsumer(consumer); + }); +} + +``` + +### Subscription types + +Pulsar has various [subscription types](concepts-messaging#subscription-types) to match different scenarios. A topic can have multiple subscriptions with different subscription types. However, a subscription can only have one subscription type at a time. 
+ +A subscription is identical with the subscription name; a subscription name can specify only one subscription type at a time. To change the subscription type, you should first stop all consumers of this subscription. + +Different subscription types have different message distribution modes. This section describes the differences of subscription types and how to use them. + +In order to better describe their differences, assuming you have a topic named "my-topic", and the producer has published 10 messages. + +```java + +Producer producer = client.newProducer(Schema.STRING) + .topic("my-topic") + .enableBatching(false) + .create(); +// 3 messages with "key-1", 3 messages with "key-2", 2 messages with "key-3" and 2 messages with "key-4" +producer.newMessage().key("key-1").value("message-1-1").send(); +producer.newMessage().key("key-1").value("message-1-2").send(); +producer.newMessage().key("key-1").value("message-1-3").send(); +producer.newMessage().key("key-2").value("message-2-1").send(); +producer.newMessage().key("key-2").value("message-2-2").send(); +producer.newMessage().key("key-2").value("message-2-3").send(); +producer.newMessage().key("key-3").value("message-3-1").send(); +producer.newMessage().key("key-3").value("message-3-2").send(); +producer.newMessage().key("key-4").value("message-4-1").send(); +producer.newMessage().key("key-4").value("message-4-2").send(); + +``` + +#### Exclusive + +Create a new consumer and subscribe with the `Exclusive` subscription type. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Exclusive) + .subscribe() + +``` + +Only the first consumer is allowed to the subscription, other consumers receive an error. The first consumer receives all 10 messages, and the consuming order is the same as the producing order. 
+ +:::note + +If the topic is a partitioned topic, the first consumer subscribes to all partitioned topics, other consumers are not assigned with partitions and receive an error. + +::: + +#### Failover + +Create new consumers and subscribe with the `Failover` subscription type. + +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +//consumer1 is the active consumer, consumer2 is the standby consumer. +//consumer1 receives 5 messages and then crashes, consumer2 takes over as an active consumer. + +``` + +Multiple consumers can attach to the same subscription, yet only the first consumer is active, and others are standby. When the active consumer is disconnected, messages will be dispatched to one of the standby consumers, and that standby consumer then becomes the active consumer. + +If the first active consumer is disconnected after receiving 5 messages, the standby consumer becomes the active consumer. consumer1 will receive: + +``` + +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-2", "message-2-1") +("key-2", "message-2-2") + +``` + +consumer2 will receive: + +``` + +("key-2", "message-2-3") +("key-3", "message-3-1") +("key-3", "message-3-2") +("key-4", "message-4-1") +("key-4", "message-4-2") + +``` + +:::note + +If a topic is a partitioned topic, each partition has only one active consumer, messages of one partition are distributed to only one consumer, and messages of multiple partitions are distributed to multiple consumers. + +::: + +#### Shared + +Create new consumers and subscribe with `Shared` subscription type. 
+ +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers. + +``` + +In shared subscription type, multiple consumers can attach to the same subscription and messages are delivered in a round robin distribution across consumers. + +If a broker dispatches only one message at a time, consumer1 receives the following information. + +``` + +("key-1", "message-1-1") +("key-1", "message-1-3") +("key-2", "message-2-2") +("key-3", "message-3-1") +("key-4", "message-4-1") + +``` + +consumer2 receives the following information. + +``` + +("key-1", "message-1-2") +("key-2", "message-2-1") +("key-2", "message-2-3") +("key-3", "message-3-2") +("key-4", "message-4-2") + +``` + +`Shared` subscription is different from `Exclusive` and `Failover` subscription types. `Shared` subscription has better flexibility, but cannot provide order guarantee. + +#### Key_shared + +This is a new subscription type since 2.4.0 release. Create new consumers and subscribe with `Key_Shared` subscription type. + +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers. + +``` + +`Key_Shared` subscription is like `Shared` subscription: all consumers can attach to the same subscription. But it is different from `Shared` subscription: messages with the same key are delivered to only one consumer in order. 
The possible distribution of messages between different consumers (by default we do not know in advance which keys will be assigned to a consumer, but a key will only be assigned to a consumer at the same time). + +consumer1 receives the following information. + +``` + +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-3", "message-3-1") +("key-3", "message-3-2") + +``` + +consumer2 receives the following information. + +``` + +("key-2", "message-2-1") +("key-2", "message-2-2") +("key-2", "message-2-3") +("key-4", "message-4-1") +("key-4", "message-4-2") + +``` + +If batching is enabled at the producer side, messages with different keys are added to a batch by default. The broker will dispatch the batch to the consumer, so the default batch mechanism may break the Key_Shared subscription guaranteed message distribution semantics. The producer needs to use the `KeyBasedBatcher`. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .batcherBuilder(BatcherBuilder.KEY_BASED) + .create(); + +``` + +Or the producer can disable batching. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .enableBatching(false) + .create(); + +``` + +:::note + +If the message key is not specified, messages without key are dispatched to one consumer in order by default. + +::: + +## Reader + +With the [reader interface](concepts-clients.md#reader-interface), Pulsar clients can "manually position" themselves within a topic and reading all messages from a specified message onward. The Pulsar API for Java enables you to create {@inject: javadoc:Reader:/client/org/apache/pulsar/client/api/Reader} objects by specifying a topic and a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId}. + +The following is an example. 
+ +```java + +byte[] msgIdBytes = // Some message ID byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +while (true) { + Message message = reader.readNext(); + // Process message +} + +``` + +In the example above, a `Reader` object is instantiated for a specific topic and message (by ID); the reader iterates over each message in the topic after the message identified by `msgIdBytes` (how that value is obtained depends on the application). + +The code sample above shows pointing the `Reader` object to a specific message (by ID), but you can also use `MessageId.earliest` to point to the earliest available message on the topic or `MessageId.latest` to point to the most recent available message. + +### Configure reader +When you create a reader, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +| Type | Name |
    Description
    | Default +|---|---|---|--- +String|`topicName`|Topic name. |None +int|`receiverQueueSize`|Size of a consumer's receiver queue.

    For example, the number of messages that can be accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.|1000 +ReaderListener<T>|`readerListener`|A listener that is called for message received.|None +String|`readerName`|Reader name.|null +String| `subscriptionName`|Subscription name|When there is a single topic, the default subscription name is `"reader-" + 10-digit UUID`.
    When there are multiple topics, the default subscription name is `"multiTopicsReader-" + 10-digit UUID`. +String|`subscriptionRolePrefix`|Prefix of subscription role. |null +CryptoKeyReader|`cryptoKeyReader`|Interface that abstracts the access to a key store.|null +ConsumerCryptoFailureAction|`cryptoFailureAction`|Consumer should take action when it receives a message that can not be decrypted.

  • **FAIL**: this is the default option to fail messages until crypto succeeds.

  • **DISCARD**: silently acknowledge the message and do not deliver it to the application.

  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

    The message decompression fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in the batch.

    The delivered encrypted message contains {@link EncryptionContext}, which holds the encryption and compression information that the application can use to decrypt the consumed message payload.
  • |
  • ConsumerCryptoFailureAction.FAIL
  • +boolean|`readCompacted`|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (for example, failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +boolean|`resetIncludeHead`|If set to true, the first message to be returned is the one specified by `messageId`.

    If set to false, the first message to be returned is the one next to the message specified by `messageId`.|false + +### Sticky key range reader + +In sticky key range reader, broker will only dispatch messages which hash of the message key contains by the specified key hash range. Multiple key hash ranges can be specified on a reader. + +The following is an example to create a sticky key range reader. + +```java + +pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.earliest) + .keyHashRange(Range.of(0, 10000), Range.of(20001, 30000)) + .create(); + +``` + +Total hash range size is 65536, so the max end of the range should be less than or equal to 65535. + +## Schema + +In Pulsar, all message data consists of byte arrays "under the hood." [Message schemas](schema-get-started.md) enable you to use other types of data when constructing and handling messages (from simple types like strings to more complex, application-specific types). If you construct, say, a [producer](#producers) without specifying a schema, then the producer can only produce messages of type `byte[]`. The following is an example. + +```java + +Producer producer = client.newProducer() + .topic(topic) + .create(); + +``` + +The producer above is equivalent to a `Producer` (in fact, you should *always* explicitly specify the type). If you'd like to use a producer for a different type of data, you'll need to specify a **schema** that informs Pulsar which data type will be transmitted over the [topic](reference-terminology.md#topic). 
+ +### AvroBaseStructSchema example + +Let's say that you have a `SensorReading` class that you'd like to transmit over a Pulsar topic: + +```java + +public class SensorReading { + public float temperature; + + public SensorReading(float temperature) { + this.temperature = temperature; + } + + // A no-arg constructor is required + public SensorReading() { + } + + public float getTemperature() { + return temperature; + } + + public void setTemperature(float temperature) { + this.temperature = temperature; + } +} + +``` + +You could then create a `Producer` (or `Consumer`) like this: + +```java + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-readings") + .create(); + +``` + +The following schema formats are currently available for Java: + +* No schema or the byte array schema (which can be applied using `Schema.BYTES`): + + ```java + + Producer bytesProducer = client.newProducer(Schema.BYTES) + .topic("some-raw-bytes-topic") + .create(); + + ``` + + Or, equivalently: + + ```java + + Producer bytesProducer = client.newProducer() + .topic("some-raw-bytes-topic") + .create(); + + ``` + +* `String` for normal UTF-8-encoded string data. Apply the schema using `Schema.STRING`: + + ```java + + Producer stringProducer = client.newProducer(Schema.STRING) + .topic("some-string-topic") + .create(); + + ``` + +* Create JSON schemas for POJOs using `Schema.JSON`. The following is an example. + + ```java + + Producer pojoProducer = client.newProducer(Schema.JSON(MyPojo.class)) + .topic("some-pojo-topic") + .create(); + + ``` + +* Generate Protobuf schemas using `Schema.PROTOBUF`. The following example shows how to create the Protobuf schema and use it to instantiate a new producer: + + ```java + + Producer protobufProducer = client.newProducer(Schema.PROTOBUF(MyProtobuf.class)) + .topic("some-protobuf-topic") + .create(); + + ``` + +* Define Avro schemas with `Schema.AVRO`. 
The following code snippet demonstrates how to create and use Avro schema. + + ```java + + Producer avroProducer = client.newProducer(Schema.AVRO(MyAvro.class)) + .topic("some-avro-topic") + .create(); + + ``` + +### ProtobufNativeSchema example + +For example of ProtobufNativeSchema, see [`SchemaDefinition` in `Complex type`](schema-understand.md#complex-type). + +## Authentication + +Pulsar currently supports three authentication schemes: [TLS](security-tls-authentication.md), [Athenz](security-athenz.md), and [Oauth2](security-oauth2.md). You can use the Pulsar Java client with all of them. + +### TLS Authentication + +To use [TLS](security-tls-authentication.md), `enableTls` method is deprecated and you need to use "pulsar+ssl://" in serviceUrl to enable, point your Pulsar client to a TLS cert path, and provide paths to cert and key files. + +The following is an example. + +```java + +Map authParams = new HashMap(); +authParams.put("tlsCertFile", "/path/to/client-cert.pem"); +authParams.put("tlsKeyFile", "/path/to/client-key.pem"); + +Authentication tlsAuth = AuthenticationFactory + .create(AuthenticationTls.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(tlsAuth) + .build(); + +``` + +### Athenz + +To use [Athenz](security-athenz.md) as an authentication provider, you need to [use TLS](#tls-authentication) and provide values for four parameters in a hash: + +* `tenantDomain` +* `tenantService` +* `providerDomain` +* `privateKey` + +You can also set an optional `keyId`. The following is an example. 
+ +```java + +Map authParams = new HashMap(); +authParams.put("tenantDomain", "shopping"); // Tenant domain name +authParams.put("tenantService", "some_app"); // Tenant service name +authParams.put("providerDomain", "pulsar"); // Provider domain name +authParams.put("privateKey", "file:///path/to/private.pem"); // Tenant private key path +authParams.put("keyId", "v1"); // Key id for the tenant private key (optional, default: "0") + +Authentication athenzAuth = AuthenticationFactory + .create(AuthenticationAthenz.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(athenzAuth) + .build(); + +``` + +> #### Supported pattern formats +> The `privateKey` parameter supports the following three pattern formats: +> * `file:///path/to/file` +> * `file:/path/to/file` +> * `data:application/x-pem-file;base64,` + +### Oauth2 + +The following example shows how to use [Oauth2](security-oauth2.md) as an authentication provider for the Pulsar Java client. + +You can use the factory method to configure authentication for Pulsar Java client. + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactoryOAuth2.clientCredentials(this.issuerUrl, this.credentialsUrl, this.audience)) + .build(); + +``` + +In addition, you can also use the encoded parameters to configure authentication for Pulsar Java client. 
+ +```java + +Authentication auth = AuthenticationFactory + .create(AuthenticationOAuth2.class.getName(), "{"type":"client_credentials","privateKey":"...","issuerUrl":"...","audience":"..."}"); +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication(auth) + .build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-node.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-node.md new file mode 100644 index 0000000000000..e24032946bdcd --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-node.md @@ -0,0 +1,643 @@ +--- +id: client-libraries-node +title: The Pulsar Node.js client +sidebar_label: "Node.js" +original_id: client-libraries-node +--- + +The Pulsar Node.js client can be used to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Node.js. + +All the methods in [producers](#producers), [consumers](#consumers), and [readers](#readers) of a Node.js client are thread-safe. + +For 1.3.0 or later versions, [type definitions](https://github.com/apache/pulsar-client-node/blob/master/index.d.ts) used in TypeScript are available. + +## Installation + +You can install the [`pulsar-client`](https://www.npmjs.com/package/pulsar-client) library via [npm](https://www.npmjs.com/). + +### Requirements +Pulsar Node.js client library is based on the C++ client library. +Follow [these instructions](client-libraries-cpp.md#compilation) and install the Pulsar C++ client library. + +### Compatibility + +Compatibility between each version of the Node.js client and the C++ client is as follows: + +| Node.js client | C++ client | +| :------------- | :------------- | +| 1.0.0 | 2.3.0 or later | +| 1.1.0 | 2.4.0 or later | +| 1.2.0 | 2.5.0 or later | + +If an incompatible version of the C++ client is installed, you may fail to build or run this library. 
+ +### Installation using npm + +Install the `pulsar-client` library via [npm](https://www.npmjs.com/): + +```shell + +$ npm install pulsar-client + +``` + +:::note + +Also, this library works only in Node.js 10.x or later because it uses the [`node-addon-api`](https://github.com/nodejs/node-addon-api) module to wrap the C++ library. + +::: + +## Connection URLs +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here is an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you are using [TLS encryption](security-tls-transport.md) or [TLS Authentication](security-tls-authentication.md), the URL looks like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you first need a client object. You can create a client instance using a `new` operator and the `Client` method, passing in a client options object (more on configuration [below](#client-configuration)). + +Here is an example: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + await client.close(); +})(); + +``` + +### Client configuration + +The following configurable parameters are available for Pulsar clients: + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `serviceUrl` | The connection URL for the Pulsar cluster. See [above](#connection-urls) for more info. | | +| `authentication` | Configure the authentication provider. (default: no authentication). See [TLS Authentication](security-tls-authentication.md) for more info. 
| | +| `operationTimeoutSeconds` | The timeout for Node.js client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries occur until this threshold is reached, at which point the operation fails. | 30 | +| `ioThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker). | 1 | +| `messageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)). | 1 | +| `concurrentLookupRequest` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 50000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 50000 | +| `tlsTrustCertsFilePath` | The file path for the trusted TLS certificate. | | +| `tlsValidateHostname` | The boolean value of setup whether to enable TLS hostname verification. | `false` | +| `tlsAllowInsecureConnection` | The boolean value of setup whether the Pulsar client accepts untrusted TLS certificate from broker. | `false` | +| `statsIntervalInSeconds` | Interval between each stat info. Stats is activated with positive statsInterval. The value should be set to 1 second at least | 600 | +| `log` | A function that is used for logging. | `console.log` | + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Node.js producers using a producer configuration object. + +Here is an example: + +```JavaScript + +const producer = await client.createProducer({ + topic: 'my-topic', +}); + +await producer.send({ + data: Buffer.from("Hello, Pulsar"), +}); + +await producer.close(); + +``` + +> #### Promise operation +> When you create a new Pulsar producer, the operation returns `Promise` object and get producer instance or an error through executor function. 
+> In this example, using await operator instead of executor function. + +### Producer operations + +Pulsar Node.js producers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `send(Object)` | Publishes a [message](#messages) to the producer's topic. When the message is successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object whose result is the message ID runs executor function. | `Promise` | +| `flush()` | Sends message from send queue to Pulsar broker. When the message is successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object runs executor function. | `Promise` | +| `close()` | Closes the producer and releases all resources allocated to it. Once `close()` is called, no more messages are accepted from the publisher. This method returns a Promise object. It runs the executor function when all pending publish requests are persisted by Pulsar. If an error is thrown, no pending writes are retried. | `Promise` | +| `getProducerName()` | Getter method of the producer name. | `string` | +| `getTopic()` | Getter method of the name of the topic. | `string` | + +### Producer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer publishes messages. The topic format is `` or `//`. For example, `sample/ns1/my-topic`. | | +| `producerName` | A name for the producer. If you do not explicitly assign a name, Pulsar automatically generates a globally unique name. If you choose to explicitly assign a name, it needs to be unique across *all* Pulsar clusters, otherwise the creation operation throws an error. | | +| `sendTimeoutMs` | When publishing a message to a topic, the producer waits for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). 
If a message is not acknowledged within the threshold set by this parameter, an error is thrown. If you set `sendTimeoutMs` to -1, the timeout is set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30000 | +| `initialSequenceId` | The initial sequence ID of the message. When producer send message, add sequence ID to message. The ID is increased each time to send. | | +| `maxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `send` method fails *unless* `blockIfQueueFull` is set to `true`. | 1000 | +| `maxPendingMessagesAcrossPartitions` | The maximum size of the sum of partition's pending queue. | 50000 | +| `blockIfQueueFull` | If set to `true`, the producer's `send` method waits when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `maxPendingMessages` parameter); if set to `false` (the default), `send` operations fails and throw a error when the queue is full. | `false` | +| `messageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-messaging.md#partitioned-topics)). This logic is applied only when no key is set on messages. The available options are: round robin (`RoundRobinDistribution`), or publishing all messages to a single partition (`UseSinglePartition`, the default). | `UseSinglePartition` | +| `hashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). 
The available options are: `JavaStringHash` (the equivalent of `String.hashCode()` in Java), `Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library). | `BoostHash` | +| `compressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), and [`Zlib`](https://zlib.net/), [ZSTD](https://github.com/facebook/zstd/), [SNAPPY](https://github.com/google/snappy/). | Compression None | +| `batchingEnabled` | If set to `true`, the producer send message as batch. | `true` | +| `batchingMaxPublishDelayMs` | The maximum time of delay sending message in batching. | 10 | +| `batchingMaxMessages` | The maximum size of sending message in each time of batching. | 1000 | +| `properties` | The metadata of producer. | | + +### Producer example + +This example creates a Node.js producer for the `my-topic` topic and sends 10 messages to that topic: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a producer + const producer = await client.createProducer({ + topic: 'my-topic', + }); + + // Send messages + for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); + } + await producer.flush(); + + await producer.close(); + await client.close(); +})(); + +``` + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Node.js consumers using a consumer configuration object. 
+ +Here is an example: + +```JavaScript + +const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', +}); + +const msg = await consumer.receive(); +console.log(msg.getData().toString()); +consumer.acknowledge(msg); + +await consumer.close(); + +``` + +> #### Promise operation +> When you create a new Pulsar consumer, the operation returns `Promise` object and get consumer instance or an error through executor function. +> In this example, using await operator instead of executor function. + +### Consumer operations + +Pulsar Node.js consumers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `receive()` | Receives a single message from the topic. When the message is available, the Promise object run executor function and get message object. | `Promise` | +| `receive(Number)` | Receives a single message from the topic with specific timeout in milliseconds. | `Promise` | +| `acknowledge(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message object. | `void` | +| `acknowledgeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID object. | `void` | +| `acknowledgeCumulative(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `acknowledgeCumulative` method returns void, and send the ack to the broker asynchronously. After that, the messages are *not* redelivered to the consumer. Cumulative acking can not be used with a [shared](concepts-messaging.md#shared) subscription type. | `void` | +| `acknowledgeCumulativeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message ID. 
| `void` | +| `negativeAcknowledge(Message)`| [Negatively acknowledges](reference-terminology.md#negative-acknowledgment-nack) a message to the Pulsar broker by message object. | `void` | +| `negativeAcknowledgeId(MessageId)` | [Negatively acknowledges](reference-terminology.md#negative-acknowledgment-nack) a message to the Pulsar broker by message ID object. | `void` | +| `close()` | Closes the consumer, disabling its ability to receive messages from the broker. | `Promise` | +| `unsubscribe()` | Unsubscribes the subscription. | `Promise` | + +### Consumer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar topic on which the consumer establishes a subscription and listen for messages. | | +| `topics` | The array of topics. | | +| `topicsPattern` | The regular expression for topics. | | +| `subscription` | The subscription name for this consumer. | | +| `subscriptionType` | Available options are `Exclusive`, `Shared`, `Key_Shared`, and `Failover`. | `Exclusive` | +| `subscriptionInitialPosition` | Initial position at which to set cursor when subscribing to a topic at first time. | `SubscriptionInitialPosition.Latest` | +| `ackTimeoutMs` | Acknowledge timeout in milliseconds. | 0 | +| `nAckRedeliverTimeoutMs` | Delay to wait before redelivering messages that failed to be processed. | 60000 | +| `receiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000 | +| `receiverQueueSizeAcrossPartitions` | Set the max total receiver queue size across partitions. This setting is used to reduce the receiver queue size for individual partitions if the total exceeds this value. | 50000 | +| `consumerName` | The name of consumer. 
Currently(v2.4.1), [failover](concepts-messaging.md#failover) mode use consumer name in ordering. | | +| `properties` | The metadata of consumer. | | +| `listener`| A listener that is called for a message received. | | +| `readCompacted`| If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic's message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics that have a single active consumer (like failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`. | false | + +### Consumer example + +This example creates a Node.js consumer with the `my-subscription` subscription on the `my-topic` topic, receives messages, prints the content that arrive, and acknowledges each message to the Pulsar broker for 10 times: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a consumer + const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', + subscriptionType: 'Exclusive', + }); + + // Receive messages + for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); + } + + await consumer.close(); + await client.close(); +})(); + +``` + +Instead a consumer can be created with `listener` to process messages. + +```JavaScript + +// Create a consumer +const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', + subscriptionType: 'Exclusive', + listener: (msg, msgConsumer) => { + console.log(msg.getData().toString()); + msgConsumer.acknowledge(msg); + }, +}); + +``` + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recently unacked message). You can [configure](#reader-configuration) Node.js readers using a reader configuration object. 
+ +Here is an example: + +```JavaScript + +const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), +}); + +const msg = await reader.readNext(); +console.log(msg.getData().toString()); + +await reader.close(); + +``` + +### Reader operations + +Pulsar Node.js readers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `readNext()` | Receives the next message on the topic (analogous to the `receive` method for [consumers](#consumer-operations)). When the message is available, the Promise object run executor function and get message object. | `Promise` | +| `readNext(Number)` | Receives a single message from the topic with specific timeout in milliseconds. | `Promise` | +| `hasNext()` | Return whether the broker has next message in target topic. | `Boolean` | +| `close()` | Closes the reader, disabling its ability to receive messages from the broker. | `Promise` | + +### Reader configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader establishes a subscription and listen for messages. | | +| `startMessageId` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `Pulsar.MessageId.earliest` (the earliest available message on the topic), `Pulsar.MessageId.latest` (the latest available message on the topic), or a message ID object for a position that is not earliest or latest. | | +| `receiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `readNext`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 | +| `readerName` | The name of the reader. 
| | +| `subscriptionRolePrefix` | The subscription role prefix. | | +| `readCompacted` | If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic's message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics that have a single active consumer (like failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`. | `false` | + + +### Reader example + +This example creates a Node.js reader with the `my-topic` topic, reads messages, and prints the content that arrive for 10 times: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, + }); + + // Create a reader + const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), + }); + + // read messages + for (let i = 0; i < 10; i += 1) { + const msg = await reader.readNext(); + console.log(msg.getData().toString()); + } + + await reader.close(); + await client.close(); +})(); + +``` + +## Messages + +In Pulsar Node.js client, you have to construct producer message object for producer. + +Here is an example message: + +```JavaScript + +const msg = { + data: Buffer.from('Hello, Pulsar'), + partitionKey: 'key1', + properties: { + 'foo': 'bar', + }, + eventTimestamp: Date.now(), + replicationClusters: [ + 'cluster1', + 'cluster2', + ], +} + +await producer.send(msg); + +``` + +The following keys are available for producer message objects: + +| Parameter | Description | +| :-------- | :---------- | +| `data` | The actual data payload of the message. | +| `properties` | A Object for any application-specific metadata attached to the message. | +| `eventTimestamp` | The timestamp associated with the message. | +| `sequenceId` | The sequence ID of the message. | +| `partitionKey` | The optional key associated with the message (particularly useful for things like topic compaction). | +| `replicationClusters` | The clusters to which this message is replicated. 
Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. | +| `deliverAt` | The absolute timestamp at or after which the message is delivered. | | +| `deliverAfter` | The relative delay after which the message is delivered. | | + +### Message object operations + +In Pulsar Node.js client, you can receive (or read) message object as consumer (or reader). + +The message object have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `getTopicName()` | Getter method of topic name. | `String` | +| `getProperties()` | Getter method of properties. | `Array` | +| `getData()` | Getter method of message data. | `Buffer` | +| `getMessageId()` | Getter method of [message id object](#message-id-object-operations). | `Object` | +| `getPublishTimestamp()` | Getter method of publish timestamp. | `Number` | +| `getEventTimestamp()` | Getter method of event timestamp. | `Number` | +| `getRedeliveryCount()` | Getter method of redelivery count. | `Number` | +| `getPartitionKey()` | Getter method of partition key. | `String` | + +### Message ID object operations + +In Pulsar Node.js client, you can get message id object from message object. + +The message id object have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `serialize()` | Serialize the message id into a Buffer for storing. | `Buffer` | +| `toString()` | Get message id as String. | `String` | + +The client has static method of message id object. You can access it as `Pulsar.MessageId.someStaticMethod` too. + +The following static methods are available for the message id object: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `earliest()` | MessageId representing the earliest, or oldest available message stored in the topic. 
| `Object` | +| `latest()` | MessageId representing the latest, or last published message in the topic. | `Object` | +| `deserialize(Buffer)` | Deserialize a message id object from a Buffer. | `Object` | + +## End-to-end encryption + +[End-to-end encryption](https://pulsar.apache.org/docs/en/next/cookbooks-encryption/#docsNav) allows applications to encrypt messages at producers and decrypt at consumers. + +### Configuration + +If you want to use the end-to-end encryption feature in the Node.js client, you need to configure `publicKeyPath` and `privateKeyPath` for both producer and consumer. + +``` + +publicKeyPath: "./public.pem" +privateKeyPath: "./private.pem" + +``` + +### Tutorial + +This section provides step-by-step instructions on how to use the end-to-end encryption feature in the Node.js client. + +**Prerequisite** + +- Pulsar C++ client 2.7.1 or later + +**Step** + +1. Create both public and private key pairs. + + **Input** + + ```shell + + openssl genrsa -out private.pem 2048 + openssl rsa -in private.pem -pubout -out public.pem + + ``` + +2. Create a producer to send encrypted messages. + + **Input** + + ```nodejs + + const Pulsar = require('pulsar-client'); + + (async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, + }); + + // Create a producer + const producer = await client.createProducer({ + topic: 'persistent://public/default/my-topic', + sendTimeoutMs: 30000, + batchingEnabled: true, + publicKeyPath: "./public.pem", + privateKeyPath: "./private.pem", + encryptionKey: "encryption-key" + }); + + console.log(producer.ProducerConfig) + // Send messages + for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); + } + await producer.flush(); + + await producer.close(); + await client.close(); + })(); + + ``` + +3. Create a consumer to receive encrypted messages. 
+ + **Input** + + ```nodejs + + const Pulsar = require('pulsar-client'); + + (async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://172.25.0.3:6650', + operationTimeoutSeconds: 30 + }); + + // Create a consumer + const consumer = await client.subscribe({ + topic: 'persistent://public/default/my-topic', + subscription: 'sub1', + subscriptionType: 'Shared', + ackTimeoutMs: 10000, + publicKeyPath: "./public.pem", + privateKeyPath: "./private.pem" + }); + + console.log(consumer) + // Receive messages + for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); + } + + await consumer.close(); + await client.close(); + })(); + + ``` + +4. Run the consumer to receive encrypted messages. + + **Input** + + ```shell + + node consumer.js + + ``` + +5. In a new terminal tab, run the producer to produce encrypted messages. + + **Input** + + ```shell + + node producer.js + + ``` + + Now you can see the producer sends messages and the consumer receives messages successfully. + + **Output** + + This is from the producer side. + + ``` + + Sent message: my-message-0 + Sent message: my-message-1 + Sent message: my-message-2 + Sent message: my-message-3 + Sent message: my-message-4 + Sent message: my-message-5 + Sent message: my-message-6 + Sent message: my-message-7 + Sent message: my-message-8 + Sent message: my-message-9 + + ``` + + This is from the consumer side. 
+ + ``` + + my-message-0 + my-message-1 + my-message-2 + my-message-3 + my-message-4 + my-message-5 + my-message-6 + my-message-7 + my-message-8 + my-message-9 + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-python.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-python.md new file mode 100644 index 0000000000000..394f6d28db2d7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-python.md @@ -0,0 +1,483 @@ +--- +id: client-libraries-python +title: Pulsar Python client +sidebar_label: "Python" +original_id: client-libraries-python +--- + +Pulsar Python client library is a wrapper over the existing [C++ client library](client-libraries-cpp.md) and exposes all of the [same features](/api/cpp). You can find the code in the [`python` subdirectory](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/python) of the C++ client code. + +All the methods in producer, consumer, and reader of a Python client are thread-safe. + +[pdoc](https://github.com/BurntSushi/pdoc)-generated API docs for the Python client are available [here](/api/python). + +## Install + +You can install the [`pulsar-client`](https://pypi.python.org/pypi/pulsar-client) library either via [PyPi](https://pypi.python.org/pypi), using [pip](#installation-using-pip), or by building the library from source. 
+ +### Install using pip + +To install the `pulsar-client` library as a pre-built package using the [pip](https://pip.pypa.io/en/stable/) package manager: + +```shell + +$ pip install pulsar-client==@pulsar:version_number@ + +``` + +### Optional dependencies + +To support aspects like pulsar functions or Avro serialization, additional optional components can be installed alongside the `pulsar-client` library + +```shell + +# avro serialization +$ pip install pulsar-client[avro]=='@pulsar:version_number@' + +# functions runtime +$ pip install pulsar-client[functions]=='@pulsar:version_number@' + +# all optional components +$ pip install pulsar-client[all]=='@pulsar:version_number@' + +``` + +Installation via PyPi is available for the following Python versions: + +Platform | Supported Python versions +:--------|:------------------------- +MacOS
    10.13 (High Sierra), 10.14 (Mojave)
    | 2.7, 3.7 +Linux | 2.7, 3.4, 3.5, 3.6, 3.7, 3.8 + +### Install from source + +To install the `pulsar-client` library by building from source, follow [instructions](client-libraries-cpp.md#compilation) and compile the Pulsar C++ client library. That builds the Python binding for the library. + +To install the built Python bindings: + +```shell + +$ git clone https://github.com/apache/pulsar +$ cd pulsar/pulsar-client-cpp/python +$ sudo python setup.py install + +``` + +## API Reference + +The complete Python API reference is available at [api/python](/api/python). + +## Examples + +You can find a variety of Python code examples for the `pulsar-client` library. + +### Producer example + +The following example creates a Python producer for the `my-topic` topic and sends 10 messages on that topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') + +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('Hello-%d' % i).encode('utf-8')) + +client.close() + +``` + +### Consumer example + +The following example creates a consumer with the `my-subscription` subscription name on the `my-topic` topic, receives incoming messages, prints the content and ID of messages that arrive, and acknowledges each message to the Pulsar broker. + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') + +consumer = client.subscribe('my-topic', 'my-subscription') + +while True: + msg = consumer.receive() + try: + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) + +client.close() + +``` + +This example shows how to configure negative acknowledgement. 
+ +```python + +from pulsar import Client, schema +client = Client('pulsar://localhost:6650') +consumer = client.subscribe('negative_acks','test',schema=schema.StringSchema()) +producer = client.create_producer('negative_acks',schema=schema.StringSchema()) +for i in range(10): + print('send msg "hello-%d"' % i) + producer.send_async('hello-%d' % i, callback=None) +producer.flush() +for i in range(10): + msg = consumer.receive() + consumer.negative_acknowledge(msg) + print('receive and nack msg "%s"' % msg.data()) +for i in range(10): + msg = consumer.receive() + consumer.acknowledge(msg) + print('receive and ack msg "%s"' % msg.data()) +try: + # No more messages expected + msg = consumer.receive(100) +except: + print("no more msg") + pass + +``` + +### Reader interface example + +You can use the Pulsar Python API to use the Pulsar [reader interface](concepts-clients.md#reader-interface). Here's an example: + +```python + +# MessageId taken from a previously fetched message +msg_id = msg.message_id() + +reader = client.create_reader('my-topic', msg_id) + +while True: + msg = reader.read_next() + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # No acknowledgment + +``` + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously. To use multi-topic subscriptions, you can supply a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. + +The following is an example. 
+ +```python + +import re +consumer = client.subscribe(re.compile('persistent://public/default/topic-*'), 'my-subscription') +while True: + msg = consumer.receive() + try: + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) +client.close() + +``` + +## Schema + +### Declare and validate schema + +You can declare a schema by passing a class that inherits +from `pulsar.schema.Record` and defines the fields as +class variables. For example: + +```python + +from pulsar.schema import * + +class Example(Record): + a = String() + b = Integer() + c = Boolean() + +``` + +With this simple schema definition, you can create producers, consumers and readers instances that refer to that. + +```python + +producer = client.create_producer( + topic='my-topic', + schema=AvroSchema(Example) ) + +producer.send(Example(a='Hello', b=1)) + +``` + +After creating the producer, the Pulsar broker validates that the existing topic schema is indeed of "Avro" type and that the format is compatible with the schema definition of the `Example` class. + +If there is a mismatch, an exception occurs in the producer creation. + +Once a producer is created with a certain schema definition, +it will only accept objects that are instances of the declared +schema class. 
+ +Similarly, for a consumer/reader, the consumer will return an +object, instance of the schema record class, rather than the raw +bytes: + +```python + +consumer = client.subscribe( + topic='my-topic', + subscription_name='my-subscription', + schema=AvroSchema(Example) ) + +while True: + msg = consumer.receive() + ex = msg.value() + try: + print("Received message a={} b={} c={}".format(ex.a, ex.b, ex.c)) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) + +``` + +### Supported schema types + +You can use different builtin schema types in Pulsar. All the definitions are in the `pulsar.schema` package. + +| Schema | Notes | +| ------ | ----- | +| `BytesSchema` | Get the raw payload as a `bytes` object. No serialization/deserialization are performed. This is the default schema mode | +| `StringSchema` | Encode/decode payload as a UTF-8 string. Uses `str` objects | +| `JsonSchema` | Require record definition. Serializes the record into standard JSON payload | +| `AvroSchema` | Require record definition. Serializes in AVRO format | + +### Schema definition reference + +The schema definition is done through a class that inherits from `pulsar.schema.Record`. + +This class has a number of fields which can be of either +`pulsar.schema.Field` type or another nested `Record`. All the +fields are specified in the `pulsar.schema` package. The fields +are matching the AVRO fields types. + +| Field Type | Python Type | Notes | +| ---------- | ----------- | ----- | +| `Boolean` | `bool` | | +| `Integer` | `int` | | +| `Long` | `int` | | +| `Float` | `float` | | +| `Double` | `float` | | +| `Bytes` | `bytes` | | +| `String` | `str` | | +| `Array` | `list` | Need to specify record type for items. | +| `Map` | `dict` | Key is always `String`. Need to specify value type. | + +Additionally, any Python `Enum` type can be used as a valid field type. 
+ +#### Fields parameters + +When adding a field, you can use these parameters in the constructor. + +| Argument | Default | Notes | +| ---------- | --------| ----- | +| `default` | `None` | Set a default value for the field. Eg: `a = Integer(default=5)` | +| `required` | `False` | Mark the field as "required". It is set in the schema accordingly. | + +#### Schema definition examples + +##### Simple definition + +```python + +class Example(Record): + a = String() + b = Integer() + c = Array(String()) + i = Map(String()) + +``` + +##### Using enums + +```python + +from enum import Enum + +class Color(Enum): + red = 1 + green = 2 + blue = 3 + +class Example(Record): + name = String() + color = Color + +``` + +##### Complex types + +```python + +class MySubRecord(Record): + x = Integer() + y = Long() + z = String() + +class Example(Record): + a = String() + sub = MySubRecord() + +``` + +##### Set namespace for Avro schema + +:::note + +This setting is only available in 2.8.2 and later versions. + +::: + +Set the namespace for Avro Record schema using the special field `_avro_namespace`. +```python +class NamespaceDemo(Record): + _avro_namespace = 'xxx.xxx.xxx' + x = String() + y = Integer() +``` + +The schema definition is like this. +``` +{ + 'name': 'NamespaceDemo', 'namespace': 'xxx.xxx.xxx', 'type': 'record', 'fields': [ + {'name': 'x', 'type': ['null', 'string']}, + {'name': 'y', 'type': ['null', 'int']} + ] +} +``` + + +## End-to-end encryption + +[End-to-end encryption](https://pulsar.apache.org/docs/en/next/cookbooks-encryption/#docsNav) allows applications to encrypt messages at producers and decrypt messages at consumers. + +### Configuration + +To use the end-to-end encryption feature in the Python client, you need to configure `publicKeyPath` and `privateKeyPath` for both producer and consumer. 
+ +``` + +publicKeyPath: "./public.pem" +privateKeyPath: "./private.pem" + +``` + +### Tutorial + +This section provides step-by-step instructions on how to use the end-to-end encryption feature in the Python client. + +**Prerequisite** + +- Pulsar Python client 2.7.1 or later + +**Step** + +1. Create both public and private key pairs. + + **Input** + + ```shell + + openssl genrsa -out private.pem 2048 + openssl rsa -in private.pem -pubout -out public.pem + + ``` + +2. Create a producer to send encrypted messages. + + **Input** + + ```python + + import pulsar + + publicKeyPath = "./public.pem" + privateKeyPath = "./private.pem" + crypto_key_reader = pulsar.CryptoKeyReader(publicKeyPath, privateKeyPath) + client = pulsar.Client('pulsar://localhost:6650') + producer = client.create_producer(topic='encryption', encryption_key='encryption', crypto_key_reader=crypto_key_reader) + producer.send('encryption message'.encode('utf8')) + print('sent message') + producer.close() + client.close() + + ``` + +3. Create a consumer to receive encrypted messages. + + **Input** + + ```python + + import pulsar + + publicKeyPath = "./public.pem" + privateKeyPath = "./private.pem" + crypto_key_reader = pulsar.CryptoKeyReader(publicKeyPath, privateKeyPath) + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe(topic='encryption', subscription_name='encryption-sub', crypto_key_reader=crypto_key_reader) + msg = consumer.receive() + print("Received msg '{}' id = '{}'".format(msg.data(), msg.message_id())) + consumer.close() + client.close() + + ``` + +4. Run the consumer to receive encrypted messages. + + **Input** + + ```shell + + python consumer.py + + ``` + +5. In a new terminal tab, run the producer to produce encrypted messages. + + **Input** + + ```shell + + python producer.py + + ``` + + Now you can see the producer sends messages and the consumer receives messages successfully. + + **Output** + + This is from the producer side. 
+ + ``` + + sent message + + ``` + + This is from the consumer side. + + ``` + + Received msg 'b'encryption message'' id = '(0,0,-1,-1)' + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries-websocket.md b/site2/website/versioned_docs/version-2.8.x/client-libraries-websocket.md new file mode 100644 index 0000000000000..f905e413f4ca1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries-websocket.md @@ -0,0 +1,621 @@ +--- +id: client-libraries-websocket +title: Pulsar WebSocket API +sidebar_label: "WebSocket" +original_id: client-libraries-websocket +--- + +Pulsar [WebSocket](https://developer.mozilla.org/en-US/docs/Web/API/WebSockets_API) API provides a simple way to interact with Pulsar using languages that do not have an official [client library](getting-started-clients.md). Through WebSocket, you can publish and consume messages and use features available on the [Client Features Matrix](https://github.com/apache/pulsar/wiki/Client-Features-Matrix) page. + + +> You can use Pulsar WebSocket API with any WebSocket client library. See examples for Python and Node.js [below](#client-examples). + +## Running the WebSocket service + +The standalone variant of Pulsar that we recommend using for [local development](getting-started-standalone.md) already has the WebSocket service enabled. + +In non-standalone mode, there are two ways to deploy the WebSocket service: + +* [embedded](#embedded-with-a-pulsar-broker) with a Pulsar broker +* as a [separate component](#as-a-separate-component) + +### Embedded with a Pulsar broker + +In this mode, the WebSocket service will run within the same HTTP service that's already running in the broker. To enable this mode, set the [`webSocketServiceEnabled`](reference-configuration.md#broker-webSocketServiceEnabled) parameter in the [`conf/broker.conf`](reference-configuration.md#broker) configuration file in your installation. 
+ +```properties + +webSocketServiceEnabled=true + +``` + +### As a separate component + +In this mode, the WebSocket service will be run from a Pulsar [broker](reference-terminology.md#broker) as a separate service. Configuration for this mode is handled in the [`conf/websocket.conf`](reference-configuration.md#websocket) configuration file. You'll need to set *at least* the following parameters: + +* [`configurationStoreServers`](reference-configuration.md#websocket-configurationStoreServers) +* [`webServicePort`](reference-configuration.md#websocket-webServicePort) +* [`clusterName`](reference-configuration.md#websocket-clusterName) + +Here's an example: + +```properties + +configurationStoreServers=zk1:2181,zk2:2181,zk3:2181 +webServicePort=8080 +clusterName=my-cluster + +``` + +### Security settings + +To enable TLS encryption on WebSocket service: + +```properties + +tlsEnabled=true +tlsAllowInsecureConnection=false +tlsCertificateFilePath=/path/to/client-websocket.cert.pem +tlsKeyFilePath=/path/to/client-websocket.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +### Starting the broker + +When the configuration is set, you can start the service using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) tool: + +```shell + +$ bin/pulsar-daemon start websocket + +``` + +## API Reference + +Pulsar's WebSocket API offers three endpoints for [producing](#producer-endpoint) messages, [consuming](#consumer-endpoint) messages and [reading](#reader-endpoint) messages. + +All exchanges via the WebSocket API use JSON. + +### Authentication + +#### Browser javascript WebSocket client + +Use the query param `token` transport the authentication token. 
+ +```http + +ws://broker-service-url:8080/path?token=token + +``` + +### Producer endpoint + +The producer endpoint requires you to specify a tenant, namespace, and topic in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/producer/persistent/:tenant/:namespace/:topic + +``` + +##### Query param + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`sendTimeoutMillis` | long | no | Send timeout (default: 30 secs) +`batchingEnabled` | boolean | no | Enable batching of messages (default: false) +`batchingMaxMessages` | int | no | Maximum number of messages permitted in a batch (default: 1000) +`maxPendingMessages` | int | no | Set the max size of the internal-queue holding the messages (default: 1000) +`batchingMaxPublishDelay` | long | no | Time period within which the messages will be batched (default: 10ms) +`messageRoutingMode` | string | no | Message [routing mode](https://pulsar.apache.org/api/client/index.html?org/apache/pulsar/client/api/ProducerConfiguration.MessageRoutingMode.html) for the partitioned producer: `SinglePartition`, `RoundRobinPartition` +`compressionType` | string | no | Compression [type](https://pulsar.apache.org/api/client/index.html?org/apache/pulsar/client/api/CompressionType.html): `LZ4`, `ZLIB` +`producerName` | string | no | Specify the name for the producer. Pulsar will enforce only one producer with same name can be publishing on a topic +`initialSequenceId` | long | no | Set the baseline for the sequence ids for messages published by the producer. 
+`hashingScheme` | string | no | [Hashing function](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.HashingScheme.html) to use when publishing on a partitioned topic: `JavaStringHash`, `Murmur3_32Hash` +`token` | string | no | Authentication token, this is used for the browser javascript client + + +#### Publishing a message + +```json + +{ + "payload": "SGVsbG8gV29ybGQ=", + "properties": {"key1": "value1", "key2": "value2"}, + "context": "1" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`payload` | string | yes | Base-64 encoded payload +`properties` | key-value pairs | no | Application-defined properties +`context` | string | no | Application-defined request identifier +`key` | string | no | For partitioned topics, decides which partition to use +`replicationClusters` | array | no | Restrict replication to this list of [clusters](reference-terminology.md#cluster), specified by name + + +##### Example success response + +```json + +{ + "result": "ok", + "messageId": "CAAQAw==", + "context": "1" + } + +``` + +##### Example failure response + +```json + + { + "result": "send-error:3", + "errorMsg": "Failed to de-serialize from JSON", + "context": "1" + } + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`result` | string | yes | `ok` if successful or an error message if unsuccessful +`messageId` | string | yes | Message ID assigned to the published message +`context` | string | no | Application-defined request identifier + + +### Consumer endpoint + +The consumer endpoint requires you to specify a tenant, namespace, and topic, as well as a subscription, in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/consumer/persistent/:tenant/:namespace/:topic/:subscription + +``` + +##### Query param + +Key | Type | Required? 
| Explanation +:---|:-----|:----------|:----------- +`ackTimeoutMillis` | long | no | Set the timeout for unacked messages (default: 0) +`subscriptionType` | string | no | [Subscription type](https://pulsar.apache.org/api/client/index.html?org/apache/pulsar/client/api/SubscriptionType.html): `Exclusive`, `Failover`, `Shared`, `Key_Shared` +`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000) +`consumerName` | string | no | Consumer name +`priorityLevel` | int | no | Define a [priority](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setPriorityLevel-int-) for the consumer +`maxRedeliverCount` | int | no | Define a [maxRedeliverCount](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#deadLetterPolicy-org.apache.pulsar.client.api.DeadLetterPolicy-) for the consumer (default: 0). Activates [Dead Letter Topic](https://github.com/apache/pulsar/wiki/PIP-22%3A-Pulsar-Dead-Letter-Topic) feature. +`deadLetterTopic` | string | no | Define a [deadLetterTopic](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#deadLetterPolicy-org.apache.pulsar.client.api.DeadLetterPolicy-) for the consumer (default: {topic}-{subscription}-DLQ). Activates [Dead Letter Topic](https://github.com/apache/pulsar/wiki/PIP-22%3A-Pulsar-Dead-Letter-Topic) feature. +`pullMode` | boolean | no | Enable pull mode (default: false). See "Flow Control" below. +`negativeAckRedeliveryDelay` | int | no | When a message is negatively acknowledged, the delay time before the message is redelivered (in milliseconds). The default value is 60000. +`token` | string | no | Authentication token, this is used for the browser javascript client + +NB: these parameter (except `pullMode`) apply to the internal consumer of the WebSocket service. 
+So messages will be subject to the redelivery settings as soon as they get into the receive queue, +even if the client doesn't consume on the WebSocket. + +##### Receiving messages + +Server will push messages on the WebSocket session: + +```json + +{ + "messageId": "CAAQAw==", + "payload": "SGVsbG8gV29ybGQ=", + "properties": {"key1": "value1", "key2": "value2"}, + "publishTime": "2016-08-30 16:45:57.785", + "redeliveryCount": 4 +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId` | string | yes | Message ID +`payload` | string | yes | Base-64 encoded payload +`publishTime` | string | yes | Publish timestamp +`redeliveryCount` | number | yes | Number of times this message was already delivered +`properties` | key-value pairs | no | Application-defined properties +`key` | string | no | Original routing key set by producer + +#### Acknowledging the message + +Consumer needs to acknowledge the successful processing of the message to +have the Pulsar broker delete it. + +```json + +{ + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Negatively acknowledging messages + +```json + +{ + "type": "negativeAcknowledge", + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Flow control + +##### Push Mode + +By default (`pullMode=false`), the consumer endpoint will use the `receiverQueueSize` parameter both to size its +internal receive queue and to limit the number of unacknowledged messages that are passed to the WebSocket client. +In this mode, if you don't send acknowledgements, the Pulsar WebSocket service will stop sending messages after reaching +`receiverQueueSize` unacked messages sent to the WebSocket client.
+ +##### Pull Mode + +If you set `pullMode` to `true`, the WebSocket client will need to send `permit` commands to permit the +Pulsar WebSocket service to send more messages. + +```json + +{ + "type": "permit", + "permitMessages": 100 +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. Must be `permit` +`permitMessages`| int | yes | Number of messages to permit + +NB: in this mode it's possible to acknowledge messages in a different connection. + +#### Check if reach end of topic + +Consumer can check if it has reached end of topic by sending `isEndOfTopic` request. + +**Request** + +```json + +{ + "type": "isEndOfTopic" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. Must be `isEndOfTopic` + +**Response** + +```json + +{ + "endOfTopic": "true/false" + } + +``` + +### Reader endpoint + +The reader endpoint requires you to specify a tenant, namespace, and topic in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/reader/persistent/:tenant/:namespace/:topic + +``` + +##### Query param + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`readerName` | string | no | Reader name +`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000) +`messageId` | int or enum | no | Message ID to start from, `earliest` or `latest` (default: `latest`) +`token` | string | no | Authentication token, this is used for the browser javascript client + +##### Receiving messages + +Server will push messages on the WebSocket session: + +```json + +{ + "messageId": "CAAQAw==", + "payload": "SGVsbG8gV29ybGQ=", + "properties": {"key1": "value1", "key2": "value2"}, + "publishTime": "2016-08-30 16:45:57.785", + "redeliveryCount": 4 +} + +``` + +Key | Type | Required? 
| Explanation +:---|:-----|:----------|:----------- +`messageId` | string | yes | Message ID +`payload` | string | yes | Base-64 encoded payload +`publishTime` | string | yes | Publish timestamp +`redeliveryCount` | number | yes | Number of times this message was already delivered +`properties` | key-value pairs | no | Application-defined properties +`key` | string | no | Original routing key set by producer + +#### Acknowledging the message + +**In WebSocket**, Reader needs to acknowledge the successful processing of the message to +have the Pulsar WebSocket service update the number of pending messages. +If you don't send acknowledgements, Pulsar WebSocket service will stop sending messages after reaching the pendingMessages limit. + +```json + +{ + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Check if reach end of topic + +Consumer can check if it has reached end of topic by sending `isEndOfTopic` request. + +**Request** + +```json + +{ + "type": "isEndOfTopic" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. Must be `isEndOfTopic` + +**Response** + +```json + +{ + "endOfTopic": "true/false" + } + +``` + +### Error codes + +In case of error the server will close the WebSocket session using the +following error codes: + +Error Code | Error Message +:----------|:------------- +1 | Failed to create producer +2 | Failed to subscribe +3 | Failed to deserialize from JSON +4 | Failed to serialize to JSON +5 | Failed to authenticate client +6 | Client is not authorized +7 | Invalid payload encoding +8 | Unknown error + +> The application is responsible for re-establishing a new WebSocket session after a backoff period. + +## Client examples + +Below you'll find code examples for the Pulsar WebSocket API in [Python](#python) and [Node.js](#nodejs). 
+ +### Python + +This example uses the [`websocket-client`](https://pypi.python.org/pypi/websocket-client) package. You can install it using [pip](https://pypi.python.org/pypi/pip): + +```shell + +$ pip install websocket-client + +``` + +You can also download it from [PyPI](https://pypi.python.org/pypi/websocket-client). + +#### Python producer + +Here's an example Python producer that sends a simple message to a Pulsar [topic](reference-terminology.md#topic): + +```python + +import websocket, base64, json + +# If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/producer/persistent/public/default/my-topic' + +ws = websocket.create_connection(TOPIC) + +# Send one message as JSON +ws.send(json.dumps({ + 'payload' : base64.b64encode('Hello World'), + 'properties': { + 'key1' : 'value1', + 'key2' : 'value2' + }, + 'context' : 5 +})) + +response = json.loads(ws.recv()) +if response['result'] == 'ok': + print 'Message published successfully' +else: + print 'Failed to publish message:', response +ws.close() + +``` + +#### Python consumer + +Here's an example Python consumer that listens on a Pulsar topic and prints the message ID whenever a message arrives: + +```python + +import websocket, base64, json + +# If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. 
+enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/consumer/persistent/public/default/my-topic/my-sub' + +ws = websocket.create_connection(TOPIC) + +while True: + msg = json.loads(ws.recv()) + if not msg: break + + print "Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload'])) + + # Acknowledge successful processing + ws.send(json.dumps({'messageId' : msg['messageId']})) + +ws.close() + +``` + +#### Python reader + +Here's an example Python reader that listens on a Pulsar topic and prints the message ID whenever a message arrives: + +```python + +import websocket, base64, json + +# If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/reader/persistent/public/default/my-topic' +ws = websocket.create_connection(TOPIC) + +while True: + msg = json.loads(ws.recv()) + if not msg: break + + print "Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload'])) + + # Acknowledge successful processing + ws.send(json.dumps({'messageId' : msg['messageId']})) + +ws.close() + +``` + +### Node.js + +This example uses the [`ws`](https://websockets.github.io/ws/) package. You can install it using [npm](https://www.npmjs.com/): + +```shell + +$ npm install ws + +``` + +#### Node.js producer + +Here's an example Node.js producer that sends a simple message to a Pulsar topic: + +```javascript + +const WebSocket = require('ws'); + +// If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 
'wss' : 'ws'}://localhost:8080/ws/v2/producer/persistent/public/default/my-topic`; +const ws = new WebSocket(topic); + +var message = { + "payload" : new Buffer("Hello World").toString('base64'), + "properties": { + "key1" : "value1", + "key2" : "value2" + }, + "context" : "1" +}; + +ws.on('open', function() { + // Send one message + ws.send(JSON.stringify(message)); +}); + +ws.on('message', function(message) { + console.log('received ack: %s', message); +}); + +``` + +#### Node.js consumer + +Here's an example Node.js consumer that listens on the same topic used by the producer above: + +```javascript + +const WebSocket = require('ws'); + +// If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 'wss' : 'ws'}://localhost:8080/ws/v2/consumer/persistent/public/default/my-topic/my-sub`; +const ws = new WebSocket(topic); + +ws.on('message', function(message) { + var receiveMsg = JSON.parse(message); + console.log('Received: %s - payload: %s', message, new Buffer(receiveMsg.payload, 'base64').toString()); + var ackMsg = {"messageId" : receiveMsg.messageId}; + ws.send(JSON.stringify(ackMsg)); +}); + +``` + +#### NodeJS reader + +```javascript + +const WebSocket = require('ws'); + +// If set enableTLS to true, your have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 
'wss' : 'ws'}://localhost:8080/ws/v2/reader/persistent/public/default/my-topic`; +const ws = new WebSocket(topic); + +ws.on('message', function(message) { + var receiveMsg = JSON.parse(message); + console.log('Received: %s - payload: %s', message, new Buffer(receiveMsg.payload, 'base64').toString()); + var ackMsg = {"messageId" : receiveMsg.messageId}; + ws.send(JSON.stringify(ackMsg)); +}); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/client-libraries.md b/site2/website/versioned_docs/version-2.8.x/client-libraries.md new file mode 100644 index 0000000000000..00d128c514040 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/client-libraries.md @@ -0,0 +1,35 @@ +--- +id: client-libraries +title: Pulsar client libraries +sidebar_label: "Overview" +original_id: client-libraries +--- + +Pulsar supports the following client libraries: + +- [Java client](client-libraries-java.md) +- [Go client](client-libraries-go.md) +- [Python client](client-libraries-python.md) +- [C++ client](client-libraries-cpp.md) +- [Node.js client](client-libraries-node.md) +- [WebSocket client](client-libraries-websocket.md) +- [C# client](client-libraries-dotnet.md) + +## Feature matrix +Pulsar client feature matrix for different languages is listed on [Client Features Matrix](https://github.com/apache/pulsar/wiki/Client-Features-Matrix) page. + +## Third-party clients + +Besides the official released clients, multiple projects on developing Pulsar clients are available in different languages. + +> If you have developed a new Pulsar client, feel free to submit a pull request and add your client to the list below. 
+ +| Language | Project | Maintainer | License | Description | +|----------|---------|------------|---------|-------------| +| Go | [pulsar-client-go](https://github.com/Comcast/pulsar-client-go) | [Comcast](https://github.com/Comcast) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | A native golang client | +| Go | [go-pulsar](https://github.com/t2y/go-pulsar) | [t2y](https://github.com/t2y) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | +| Haskell | [supernova](https://github.com/cr-org/supernova) | [Chatroulette](https://github.com/cr-org) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Native Pulsar client for Haskell | +| Scala | [neutron](https://github.com/cr-org/neutron) | [Chatroulette](https://github.com/cr-org) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Purely functional Apache Pulsar client for Scala built on top of Fs2 | +| Scala | [pulsar4s](https://github.com/sksamuel/pulsar4s) | [sksamuel](https://github.com/sksamuel) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Idiomatic, typesafe, and reactive Scala client for Apache Pulsar | +| Rust | [pulsar-rs](https://github.com/wyyerd/pulsar-rs) | [Wyyerd Group](https://github.com/wyyerd) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Future-based Rust bindings for Apache Pulsar | +| .NET | [pulsar-client-dotnet](https://github.com/fsharplang-ru/pulsar-client-dotnet) | [Lanayx](https://github.com/Lanayx) | [![GitHub](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT) | Native .NET client for C#/F#/VB | diff --git
a/site2/website/versioned_docs/version-2.8.x/concepts-architecture-overview.md b/site2/website/versioned_docs/version-2.8.x/concepts-architecture-overview.md new file mode 100644 index 0000000000000..f3e75c3e307e0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-architecture-overview.md @@ -0,0 +1,172 @@ +--- +id: concepts-architecture-overview +title: Architecture Overview +sidebar_label: "Architecture" +original_id: concepts-architecture-overview +--- + +At the highest level, a Pulsar instance is composed of one or more Pulsar clusters. Clusters within an instance can [replicate](concepts-replication.md) data amongst themselves. + +In a Pulsar cluster: + +* One or more brokers handles and load balances incoming messages from producers, dispatches messages to consumers, communicates with the Pulsar configuration store to handle various coordination tasks, stores messages in BookKeeper instances (aka bookies), relies on a cluster-specific ZooKeeper cluster for certain tasks, and more. +* A BookKeeper cluster consisting of one or more bookies handles [persistent storage](#persistent-storage) of messages. +* A ZooKeeper cluster specific to that cluster handles coordination tasks between Pulsar clusters. + +The diagram below provides an illustration of a Pulsar cluster: + +![Pulsar architecture diagram](/assets/pulsar-system-architecture.png) + +At the broader instance level, an instance-wide ZooKeeper cluster called the configuration store handles coordination tasks involving multiple clusters, for example [geo-replication](concepts-replication.md). + +## Brokers + +The Pulsar message broker is a stateless component that's primarily responsible for running two other components: + +* An HTTP server that exposes a {@inject: rest:REST:/} API for both administrative tasks and [topic lookup](concepts-clients.md#client-setup-phase) for producers and consumers. 
The producers connect to the brokers to publish messages and the consumers connect to the brokers to consume the messages. +* A dispatcher, which is an asynchronous TCP server over a custom [binary protocol](developing-binary-protocol.md) used for all data transfers + +Messages are typically dispatched out of a [managed ledger](#managed-ledgers) cache for the sake of performance, *unless* the backlog exceeds the cache size. If the backlog grows too large for the cache, the broker will start reading entries from BookKeeper. + +Finally, to support geo-replication on global topics, the broker manages replicators that tail the entries published in the local region and republish them to the remote region using the Pulsar [Java client library](client-libraries-java.md). + +> For a guide to managing Pulsar brokers, see the [brokers](admin-api-brokers.md) guide. + +## Clusters + +A Pulsar instance consists of one or more Pulsar *clusters*. Clusters, in turn, consist of: + +* One or more Pulsar [brokers](#brokers) +* A ZooKeeper quorum used for cluster-level configuration and coordination +* An ensemble of bookies used for [persistent storage](#persistent-storage) of messages + +Clusters can replicate amongst themselves using [geo-replication](concepts-replication.md). + +> For a guide to managing Pulsar clusters, see the [clusters](admin-api-clusters.md) guide. + +## Metadata store + +The Pulsar metadata store maintains all the metadata of a Pulsar cluster, such as topic metadata, schema, broker load data, and so on. Pulsar uses [Apache ZooKeeper](https://zookeeper.apache.org/) for metadata storage, cluster configuration, and coordination. The Pulsar metadata store can be deployed on a separate ZooKeeper cluster or deployed on an existing ZooKeeper cluster. You can use one ZooKeeper cluster for both Pulsar metadata store and BookKeeper metadata store. 
If you want to deploy Pulsar brokers connected to an existing BookKeeper cluster, you need to deploy separate ZooKeeper clusters for Pulsar metadata store and BookKeeper metadata store respectively. + +In a Pulsar instance: + +* A configuration store quorum stores configuration for tenants, namespaces, and other entities that need to be globally consistent. +* Each cluster has its own local ZooKeeper ensemble that stores cluster-specific configuration and coordination such as which brokers are responsible for which topics as well as ownership metadata, broker load reports, BookKeeper ledger metadata, and more. + +## Configuration store + +The configuration store maintains all the configurations of a Pulsar instance, such as clusters, tenants, namespaces, partitioned topic related configurations, and so on. A Pulsar instance can have a single local cluster, multiple local clusters, or multiple cross-region clusters. Consequently, the configuration store can share the configurations across multiple clusters under a Pulsar instance. The configuration store can be deployed on a separate ZooKeeper cluster or deployed on an existing ZooKeeper cluster. + +## Persistent storage + +Pulsar provides guaranteed message delivery for applications. If a message successfully reaches a Pulsar broker, it will be delivered to its intended target. + +This guarantee requires that non-acknowledged messages are stored in a durable manner until they can be delivered to and acknowledged by consumers. This mode of messaging is commonly called *persistent messaging*. In Pulsar, N copies of all messages are stored and synced on disk, for example 4 copies across two servers with mirrored [RAID](https://en.wikipedia.org/wiki/RAID) volumes on each server. + +### Apache BookKeeper + +Pulsar uses a system called [Apache BookKeeper](http://bookkeeper.apache.org/) for persistent message storage. 
BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that provides a number of crucial advantages for Pulsar: + +* It enables Pulsar to utilize many independent logs, called [ledgers](#ledgers). Multiple ledgers can be created for topics over time. +* It offers very efficient storage for sequential data that handles entry replication. +* It guarantees read consistency of ledgers in the presence of various system failures. +* It offers even distribution of I/O across bookies. +* It's horizontally scalable in both capacity and throughput. Capacity can be immediately increased by adding more bookies to a cluster. +* Bookies are designed to handle thousands of ledgers with concurrent reads and writes. By using multiple disk devices---one for journal and another for general storage---bookies are able to isolate the effects of read operations from the latency of ongoing write operations. + +In addition to message data, *cursors* are also persistently stored in BookKeeper. Cursors are [subscription](reference-terminology.md#subscription) positions for [consumers](reference-terminology.md#consumer). BookKeeper enables Pulsar to store consumer position in a scalable fashion. + +At the moment, Pulsar supports persistent message storage. This accounts for the `persistent` in all topic names. Here's an example: + +```http + +persistent://my-tenant/my-namespace/my-topic + +``` + +> Pulsar also supports ephemeral ([non-persistent](concepts-messaging.md#non-persistent-topics)) message storage. + + +You can see an illustration of how brokers and bookies interact in the diagram below: + +![Brokers and bookies](/assets/broker-bookie.png) + + +### Ledgers + +A ledger is an append-only data structure with a single writer that is assigned to multiple BookKeeper storage nodes, or bookies. Ledger entries are replicated to multiple bookies.
Ledgers themselves have very simple semantics: + +* A Pulsar broker can create a ledger, append entries to the ledger, and close the ledger. +* After the ledger has been closed---either explicitly or because the writer process crashed---it can then be opened only in read-only mode. +* Finally, when entries in the ledger are no longer needed, the whole ledger can be deleted from the system (across all bookies). + +#### Ledger read consistency + +The main strength of BookKeeper is that it guarantees read consistency in ledgers in the presence of failures. Since the ledger can only be written to by a single process, that process is free to append entries very efficiently, without need to obtain consensus. After a failure, the ledger will go through a recovery process that will finalize the state of the ledger and establish which entry was last committed to the log. After that point, all readers of the ledger are guaranteed to see the exact same content. + +#### Managed ledgers + +Given that BookKeeper ledgers provide a single log abstraction, a library was developed on top of the ledger called the *managed ledger* that represents the storage layer for a single topic. A managed ledger represents the abstraction of a stream of messages with a single writer that keeps appending at the end of the stream and multiple cursors that are consuming the stream, each with its own associated position. + +Internally, a single managed ledger uses multiple BookKeeper ledgers to store the data. There are two reasons to have multiple ledgers: + +1. After a failure, a ledger is no longer writable and a new one needs to be created. +2. A ledger can be deleted when all cursors have consumed the messages it contains. This allows for periodic rollover of ledgers. + +### Journal storage + +In BookKeeper, *journal* files contain BookKeeper transaction logs.
Before making an update to a [ledger](#ledgers), a bookie needs to ensure that a transaction describing the update is written to persistent (non-volatile) storage. A new journal file is created once the bookie starts or the older journal file reaches the journal file size threshold (configured using the [`journalMaxSizeMB`](reference-configuration.md#bookkeeper-journalMaxSizeMB) parameter). + +## Pulsar proxy + +One way for Pulsar clients to interact with a Pulsar [cluster](#clusters) is by connecting to Pulsar message [brokers](#brokers) directly. In some cases, however, this kind of direct connection is either infeasible or undesirable because the client doesn't have direct access to broker addresses. If you're running Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform, for example, then direct client connections to brokers are likely not possible. + +The **Pulsar proxy** provides a solution to this problem by acting as a single gateway for all of the brokers in a cluster. If you run the Pulsar proxy (which, again, is optional), all client connections with the Pulsar cluster will flow through the proxy rather than communicating with brokers. + +> For the sake of performance and fault tolerance, you can run as many instances of the Pulsar proxy as you'd like. + +Architecturally, the Pulsar proxy gets all the information it requires from ZooKeeper. When starting the proxy on a machine, you only need to provide ZooKeeper connection strings for the cluster-specific and instance-wide configuration store clusters. Here's an example: + +```bash + +$ bin/pulsar proxy \ + --zookeeper-servers zk-0,zk-1,zk-2 \ + --configuration-store-servers zk-0,zk-1,zk-2 + +``` + +> #### Pulsar proxy docs +> For documentation on using the Pulsar proxy, see the [Pulsar proxy admin documentation](administration-proxy.md). 
+ + +Some important things to know about the Pulsar proxy: + +* Connecting clients don't need to provide *any* specific configuration to use the Pulsar proxy. You won't need to update the client configuration for existing applications beyond updating the IP used for the service URL (for example if you're running a load balancer over the Pulsar proxy). +* [TLS encryption](security-tls-transport.md) and [authentication](security-tls-authentication.md) are supported by the Pulsar proxy. + +## Service discovery + +[Clients](getting-started-clients.md) connecting to Pulsar brokers need to be able to communicate with an entire Pulsar instance using a single URL. Pulsar provides a built-in service discovery mechanism that you can set up using the instructions in the [Deploying a Pulsar instance](deploy-bare-metal.md#service-discovery-setup) guide. + +You can use your own service discovery system if you'd like. If you use your own system, there is just one requirement: when a client performs an HTTP request to an endpoint, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to *some* active broker in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means. + +The diagram below illustrates Pulsar service discovery: + +![alt-text](/assets/pulsar-service-discovery.png) + +In this diagram, the Pulsar cluster is addressable via a single DNS name: `pulsar-cluster.acme.com`. A [Python client](client-libraries-python.md), for example, could access this Pulsar cluster like this: + +```python + +from pulsar import Client + +client = Client('pulsar://pulsar-cluster.acme.com:6650') + +``` + +:::note + +In Pulsar, each topic is handled by only one broker. Initial requests from a client to read, update or delete a topic are sent to a broker that may not be the topic owner. If the broker cannot handle the request for this topic, it redirects the request to the appropriate broker.
+ +::: + diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-authentication.md b/site2/website/versioned_docs/version-2.8.x/concepts-authentication.md new file mode 100644 index 0000000000000..f6307890c904a --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-authentication.md @@ -0,0 +1,9 @@ +--- +id: concepts-authentication +title: Authentication and Authorization +sidebar_label: "Authentication and Authorization" +original_id: concepts-authentication +--- + +Pulsar supports a pluggable [authentication](security-overview.md) mechanism which can be configured at the proxy and/or the broker. Pulsar also supports a pluggable [authorization](security-authorization.md) mechanism. These mechanisms work together to identify the client and its access rights on topics, namespaces and tenants. + diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-clients.md b/site2/website/versioned_docs/version-2.8.x/concepts-clients.md new file mode 100644 index 0000000000000..4040624f7d636 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-clients.md @@ -0,0 +1,92 @@ +--- +id: concepts-clients +title: Pulsar Clients +sidebar_label: "Clients" +original_id: concepts-clients +--- + +Pulsar exposes a client API with language bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md), [C++](client-libraries-cpp.md) and [C#](client-libraries-dotnet.md). The client API optimizes and encapsulates Pulsar's client-broker communication protocol and exposes a simple and intuitive API for use by applications. + +Under the hood, the current official Pulsar client libraries support transparent reconnection and/or connection failover to brokers, queuing of messages until acknowledged by the broker, and heuristics such as connection retries with backoff. 
+ +> **Custom client libraries** +> If you'd like to create your own client library, we recommend consulting the documentation on Pulsar's custom [binary protocol](developing-binary-protocol.md). + + +## Client setup phase + +Before an application creates a producer/consumer, the Pulsar client library needs to initiate a setup phase including two steps: + +1. The client attempts to determine the owner of the topic by sending an HTTP lookup request to the broker. The request could reach one of the active brokers which, by looking at the (cached) ZooKeeper metadata, knows who is serving the topic or, in case nobody is serving it, tries to assign it to the least loaded broker. +1. Once the client library has the broker address, it creates a TCP connection (or reuses an existing connection from the pool) and authenticates it. Within this connection, client and broker exchange binary commands from a custom protocol. At this point the client sends a command to create producer/consumer to the broker, which will comply after having validated the authorization policy. + +Whenever the TCP connection breaks, the client immediately re-initiates this setup phase and keeps trying with exponential backoff to re-establish the producer or consumer until the operation succeeds. + +## Reader interface + +In Pulsar, the "standard" [consumer interface](concepts-messaging.md#consumers) involves using consumers to listen on [topics](reference-terminology.md#topic), process incoming messages, and finally acknowledge those messages when they are processed. Whenever a new subscription is created, it is initially positioned at the end of the topic (by default), and consumers associated with that subscription begin reading with the first message created afterwards. Whenever a consumer connects to a topic using a pre-existing subscription, it begins reading from the earliest message un-acked within that subscription.
In summary, with the consumer interface, subscription cursors are automatically managed by Pulsar in response to [message acknowledgements](concepts-messaging.md#acknowledgement). + +The **reader interface** for Pulsar enables applications to manually manage cursors. When you use a reader to connect to a topic---rather than a consumer---you need to specify *which* message the reader begins reading from when it connects to a topic. When connecting to a topic, the reader interface enables you to begin with: + +* The **earliest** available message in the topic +* The **latest** available message in the topic +* Some other message between the earliest and the latest. If you select this option, you'll need to explicitly provide a message ID. Your application will be responsible for "knowing" this message ID in advance, perhaps fetching it from a persistent data store or cache. + +The reader interface is helpful for use cases like using Pulsar to provide effectively-once processing semantics for a stream processing system. For this use case, it's essential that the stream processing system be able to "rewind" topics to a specific message and begin reading there. The reader interface provides Pulsar clients with the low-level abstraction necessary to "manually position" themselves within a topic. + +Internally, the reader interface is implemented as a consumer using an exclusive, non-durable subscription to the topic with a randomly-allocated name. + +[ **IMPORTANT** ] + +Unlike subscription/consumer, readers are non-durable in nature and do not prevent data in a topic from being deleted, thus it is ***strongly*** advised that [data retention](cookbooks-retention-expiry.md) be configured. If data retention for a topic is not configured for an adequate amount of time, messages that the reader has not yet read might be deleted. This causes the readers to essentially skip messages.
Configuring the data retention for a topic guarantees the reader with a certain duration to read a message. + +Please also note that a reader can have a "backlog", but the metric is only used for users to know how behind the reader is. The metric is not considered for any backlog quota calculations. + +![The Pulsar consumer and reader interfaces](/assets/pulsar-reader-consumer-interfaces.png) + +Here's a Java example that begins reading from the earliest available message on a topic: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageId; +import org.apache.pulsar.client.api.Reader; + +// Create a reader on a topic and for a specific message (and onward) +Reader reader = pulsarClient.newReader() + .topic("reader-api-test") + .startMessageId(MessageId.earliest) + .create(); + +while (true) { + Message message = reader.readNext(); + + // Process the message +} + +``` + +To create a reader that reads from the latest available message: + +```java + +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.latest) + .create(); + +``` + +To create a reader that reads from some message between the earliest and the latest: + +```java + +byte[] msgIdBytes = // Some byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-messaging.md b/site2/website/versioned_docs/version-2.8.x/concepts-messaging.md new file mode 100644 index 0000000000000..c388fc9ca4cf7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-messaging.md @@ -0,0 +1,700 @@ +--- +id: concepts-messaging +title: Messaging +sidebar_label: "Messaging" +original_id: concepts-messaging +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar is built on the 
[publish-subscribe](https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern) pattern (often abbreviated to pub-sub). In this pattern, [producers](#producers) publish messages to [topics](#topics). [Consumers](#consumers) [subscribe](#subscription-types) to those topics, process incoming messages, and send an acknowledgement when processing is complete. + +When a subscription is created, Pulsar [retains](concepts-architecture-overview.md#persistent-storage) all messages, even if the consumer is disconnected. Retained messages are discarded only when a consumer acknowledges that those messages are processed successfully. + +## Messages + +Messages are the basic "unit" of Pulsar. The following table lists the components of messages. + +Component | Description +:---------|:------- +Value / data payload | The data carried by the message. All Pulsar messages contain raw bytes, although message data can also conform to data [schemas](schema-get-started.md). +Key | Messages are optionally tagged with keys, which is useful for things like [topic compaction](concepts-topic-compaction.md). +Properties | An optional key/value map of user-defined properties. +Producer name | The name of the producer who produces the message. If you do not specify a producer name, the default name is used. +Sequence ID | Each Pulsar message belongs to an ordered sequence on its topic. The sequence ID of the message is its order in that sequence. +Publish time | The timestamp of when the message is published. The timestamp is automatically applied by the producer. +Event time | An optional timestamp attached to a message by applications. For example, applications attach a timestamp on when the message is processed. If nothing is set to event time, the value is `0`. +TypedMessageBuilder | It is used to construct a message. You can set message properties such as the message key, message value with `TypedMessageBuilder`.
    When you set `TypedMessageBuilder`, set the key as a string. If you set the key as other types, for example, an AVRO object, the key is sent as bytes, and it is difficult to get the AVRO object back on the consumer. + +The default size of a message is 5 MB. You can configure the max size of a message with the following configurations. + +- In the `broker.conf` file. + + ```bash + + # The max size of a message (in bytes). + maxMessageSize=5242880 + + ``` + +- In the `bookkeeper.conf` file. + + ```bash + + # The max size of the netty frame (in bytes). Any messages received larger than this value are rejected. The default value is 5 MB. + nettyMaxFrameSizeBytes=5253120 + + ``` + +> For more information on Pulsar message contents, see Pulsar [binary protocol](developing-binary-protocol.md). + +## Producers + +A producer is a process that attaches to a topic and publishes messages to a Pulsar [broker](reference-terminology.md#broker). The Pulsar broker process the messages. + +### Send modes + +Producers send messages to brokers synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:-----------|-----------| +| Sync send | The producer waits for an acknowledgement from the broker after sending every message. If the acknowledgment is not received, the producer treats the sending operation as a failure. | +| Async send | The producer puts a message in a blocking queue and returns immediately. The client library sends the message to the broker in the background. If the queue is full (you can [configure](reference-configuration.md#broker) the maximum size), the producer is blocked or fails immediately when calling the API, depending on arguments passed to the producer. | + +### Access mode + +You can have different types of access modes on topics for producers. + +|Access mode | Description +|---|--- +`Shared`|Multiple producers can publish on a topic.

    This is the **default** setting. +`Exclusive`|Only one producer can publish on a topic.

    If there is already a producer connected, other producers trying to publish on this topic get errors immediately.

    The “old” producer is evicted and a “new” producer is selected to be the next exclusive producer if the “old” producer experiences a network partition with the broker. +`WaitForExclusive`|If there is already a producer connected, the producer creation is pending (rather than timing out) until the producer gets the `Exclusive` access.

    The producer that succeeds in becoming the exclusive one is treated as the leader. Consequently, if you want to implement the leader election scheme for your application, you can use this access mode. + +:::note + +Once an application creates a producer with the `Exclusive` or `WaitForExclusive` access mode successfully, the instance of the application is guaranteed to be the **only one writer** on the topic. Other producers trying to produce on this topic get errors immediately or have to wait until they get the `Exclusive` access. +For more information, see [PIP 68: Exclusive Producer](https://github.com/apache/pulsar/wiki/PIP-68:-Exclusive-Producer). + +::: + +You can set producer access mode through Java Client API. For more information, see `ProducerAccessMode` in [ProducerBuilder.java](https://github.com/apache/pulsar/blob/fc5768ca3bbf92815d142fe30e6bfad70a1b4fc6/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/ProducerBuilder.java). + + +### Compression + +You can compress messages published by producers during transportation. Pulsar currently supports the following types of compression: + +* [LZ4](https://github.com/lz4/lz4) +* [ZLIB](https://zlib.net/) +* [ZSTD](https://facebook.github.io/zstd/) +* [SNAPPY](https://google.github.io/snappy/) + +### Batching + +When batching is enabled, the producer accumulates and sends a batch of messages in a single request. The batch size is defined by the maximum number of messages and the maximum publish latency. Therefore, the backlog size represents the total number of batches instead of the total number of messages. + +In Pulsar, batches are tracked and stored as single units rather than as individual messages. Consumer unbundles a batch into individual messages. However, scheduled messages (configured through the `deliverAt` or the `deliverAfter` parameter) are always sent as individual messages even batching is enabled. 
+ +In general, a batch is acknowledged when all of its messages are acknowledged by a consumer. It means unexpected failures, negative acknowledgements, or acknowledgement timeouts can result in redelivery of all messages in a batch, even if some of the messages are acknowledged. + +To avoid redelivering acknowledged messages in a batch to the consumer, Pulsar introduces batch index acknowledgement since Pulsar 2.6.0. When batch index acknowledgement is enabled, the consumer filters out the batch index that has been acknowledged and sends the batch index acknowledgement request to the broker. The broker maintains the batch index acknowledgement status and tracks the acknowledgement status of each batch index to avoid dispatching acknowledged messages to the consumer. When all indexes of the batch message are acknowledged, the batch message is deleted. + +By default, batch index acknowledgement is disabled (`acknowledgmentAtBatchIndexLevelEnabled=false`). You can enable batch index acknowledgement by setting the `acknowledgmentAtBatchIndexLevelEnabled` parameter to `true` at the broker side. Enabling batch index acknowledgement results in more memory overheads. + +### Chunking +When you enable chunking, read the following instructions. +- Batching and chunking cannot be enabled simultaneously. To enable chunking, you must disable batching in advance. +- Chunking is only supported for persisted topics. +- Chunking is only supported for the exclusive and failover subscription types. + +When chunking is enabled (`chunkingEnabled=true`), if the message size is greater than the allowed maximum publish-payload size, the producer splits the original message into chunked messages and publishes them with chunked metadata to the broker separately and in order. At the broker side, the chunked messages are stored in the managed-ledger in the same way as that of ordinary messages. 
The only difference is that the consumer needs to buffer the chunked messages and combine them into the real message when all chunked messages have been collected. The chunked messages in the managed-ledger can be interwoven with ordinary messages. If the producer fails to publish all the chunks of a message, the consumer can expire incomplete chunks if it fails to receive all chunks within the expiry time. By default, the expiry time is set to one minute. + +The consumer consumes the chunked messages and buffers them until the consumer receives all the chunks of a message. And then the consumer stitches chunked messages together and places them into the receiver-queue. Clients consume messages from the receiver-queue. Once the consumer consumes the entire large message and acknowledges it, the consumer internally sends acknowledgement of all the chunk messages associated to that large message. You can set the `maxPendingChunkedMessage` parameter on the consumer. When the threshold is reached, the consumer drops the unchunked messages by silently acknowledging them or asking the broker to redeliver them later by marking them unacknowledged. + +The broker does not require any changes to support chunking for non-shared subscription. The broker only uses `chunkedMessageRate` to record chunked message rate on the topic. + +#### Handle chunked messages with one producer and one ordered consumer + +As shown in the following figure, a topic has one producer which publishes large message payloads in chunked messages along with regular non-chunked messages. The producer publishes message M1 in three chunks M1-C1, M1-C2 and M1-C3. The broker stores all the three chunked messages in the managed-ledger and dispatches to the ordered (exclusive/failover) consumer in the same order. The consumer buffers all the chunked messages in memory until it receives all the chunked messages, combines them into one message and then hands over the original message M1 to the client.
+ +![](/assets/chunking-01.png) + +#### Handle chunked messages with multiple producers and one ordered consumer + +When multiple publishers publish chunked messages into a single topic, the broker stores all the chunked messages coming from different publishers in the same managed-ledger. As shown below, Producer 1 publishes message M1 in three chunks M1-C1, M1-C2 and M1-C3. Producer 2 publishes message M2 in three chunks M2-C1, M2-C2 and M2-C3. All chunked messages of the specific message are still in order but might not be consecutive in the managed-ledger. This brings some memory pressure to the consumer because the consumer keeps separate buffer for each large message to aggregate all chunks of the large message and combine them into one message. + +![](/assets/chunking-02.png) + +## Consumers + +A consumer is a process that attaches to a topic via a subscription and then receives messages. + +A consumer sends a [flow permit request](developing-binary-protocol.md#flow-control) to a broker to get messages. There is a queue at the consumer side to receive messages pushed from the broker. You can configure the queue size with the [`receiverQueueSize`](client-libraries-java.md#configure-consumer) parameter. The default size is `1000`). Each time `consumer.receive()` is called, a message is dequeued from the buffer. + +### Receive modes + +Messages are received from [brokers](reference-terminology.md#broker) either synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:--------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sync receive | A sync receive is blocked until a message is available. 
| +| Async receive | An async receive returns immediately with a future value—for example, a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) in Java—that completes once a new message is available. | + +### Listeners + +Client libraries provide listener implementation for consumers. For example, the [Java client](client-libraries-java.md) provides a {@inject: javadoc:MesssageListener:/client/org/apache/pulsar/client/api/MessageListener} interface. In this interface, the `received` method is called whenever a new message is received. + +### Acknowledgement + +When a consumer consumes a message successfully, the consumer sends an acknowledgement request to the broker. This message is permanently stored, and then deleted only after all the subscriptions have acknowledged it. If you want to store the message that has been acknowledged by a consumer, you need to configure the [message retention policy](concepts-messaging.md#message-retention-and-expiry). + +For a batch message, if batch index acknowledgement is enabled, the broker maintains the batch index acknowledgement status and tracks the acknowledgement status of each batch index to avoid dispatching acknowledged messages to the consumer. When all indexes of the batch message are acknowledged, the batch message is deleted. For details about the batch index acknowledgement, see [batching](#batching). + +Messages can be acknowledged in the following two ways: + +- Messages are acknowledged individually. With individual acknowledgement, the consumer needs to acknowledge each message and sends an acknowledgement request to the broker. +- Messages are acknowledged cumulatively. With cumulative acknowledgement, the consumer only needs to acknowledge the last message it received. All messages in the stream up to (and including) the provided message are not re-delivered to that consumer. 
+ +:::note + +Cumulative acknowledgement cannot be used in [Shared subscription type](#subscription-types), because this subscription type involves multiple consumers which have access to the same subscription. In Shared subscription type, messages are acknowledged individually. + +::: + +### Negative acknowledgement + +When a consumer does not consume a message successfully at a time, and wants to consume the message again, the consumer sends a negative acknowledgement to the broker, and then the broker redelivers the message. + +Messages are negatively acknowledged either individually or cumulatively, depending on the consumption subscription type. + +In the exclusive and failover subscription types, consumers only negatively acknowledge the last message they receive. + +In the shared and Key_Shared subscription types, you can negatively acknowledge messages individually. + +Be aware that negative acknowledgment on ordered subscription types, such as Exclusive, Failover and Key_Shared, can cause failed messages to arrive consumers out of the original order. + +:::note + +If batching is enabled, other messages and the negatively acknowledged messages in the same batch are redelivered to the consumer. + +::: + +### Acknowledgement timeout + +If a message is not consumed successfully, and you want to trigger the broker to redeliver the message automatically, you can adopt the unacknowledged message automatic re-delivery mechanism. Client tracks the unacknowledged messages within the entire `acktimeout` time range, and sends a `redeliver unacknowledged messages` request to the broker automatically when the acknowledgement timeout is specified. + +:::note + +If batching is enabled, other messages and the unacknowledged messages in the same batch are redelivered to the consumer. + +::: + +:::note + +Prefer negative acknowledgements over acknowledgement timeout. 
Negative acknowledgement controls the re-delivery of individual messages with more precision, and avoids invalid redeliveries when the message processing time exceeds the acknowledgement timeout. + +::: + +### Dead letter topic + +Dead letter topic enables you to consume new messages when some messages cannot be consumed successfully by a consumer. In this mechanism, messages that fail to be consumed are stored in a separate topic, which is called dead letter topic. You can decide how to handle messages in the dead letter topic. + +The following example shows how to enable dead letter topic in a Java client using the default dead letter topic: + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .build()) + .subscribe(); + +``` + +The default dead letter topic uses this format: + +``` + +<topicname>-<subscriptionname>-DLQ + +``` + + +If you want to specify the name of the dead letter topic, use this Java client example: + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .deadLetterTopic("your-topic-name") + .build()) + .subscribe(); + +``` + +Dead letter topic depends on message re-delivery. Messages are redelivered either due to [acknowledgement timeout](#acknowledgement-timeout) or [negative acknowledgement](#negative-acknowledgement). If you are going to use negative acknowledgement on a message, make sure it is negatively acknowledged before the acknowledgement timeout. + +:::note + +Currently, dead letter topic is enabled in the shared and Key_Shared subscription types.
+ +::: + +### Retry letter topic + +For many online business systems, a message is re-consumed due to exception occurs in the business logic processing. To configure the delay time for re-consuming the failed messages, you can configure the producer to send messages to both the business topic and the retry letter topic, and enable automatic retry on the consumer. When automatic retry is enabled on the consumer, a message is stored in the retry letter topic if the messages are not consumed, and therefore the consumer automatically consumes the failed messages from the retry letter topic after a specified delay time. + +By default, automatic retry is disabled. You can set `enableRetry` to `true` to enable automatic retry on the consumer. + +This example shows how to consume messages from a retry letter topic. + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .enableRetry(true) + .receiverQueueSize(100) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .retryLetterTopic("persistent://my-property/my-ns/my-subscription-custom-Retry") + .build()) + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + +``` + +## Topics + +As in other pub-sub systems, topics in Pulsar are named channels for transmitting messages from producers to consumers. Topic names are URLs that have a well-defined structure: + +```http + +{persistent|non-persistent}://tenant/namespace/topic + +``` + +Topic name component | Description +:--------------------|:----------- +`persistent` / `non-persistent` | This identifies the type of topic. Pulsar supports two kind of topics: [persistent](concepts-architecture-overview.md#persistent-storage) and [non-persistent](#non-persistent-topics). The default is persistent, so if you do not specify a type, the topic is persistent. 
With persistent topics, all messages are durably persisted on disks (if the broker is not standalone, messages are durably persisted on multiple disks), whereas data for non-persistent topics is not persisted to storage disks. +`tenant` | The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters. +`namespace` | The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the [namespace](#namespaces) level. Each tenant has one or multiple namespaces. +`topic` | The final part of the name. Topic names have no special meaning in a Pulsar instance. + +> **No need to explicitly create new topics** +> You do not need to explicitly create topics in Pulsar. If a client attempts to write or receive messages to/from a topic that does not yet exist, Pulsar creates that topic under the namespace provided in the [topic name](#topics) automatically. +> If no tenant or namespace is specified when a client creates a topic, the topic is created in the default tenant and namespace. You can also create a topic in a specified tenant and namespace, such as `persistent://my-tenant/my-namespace/my-topic`. `persistent://my-tenant/my-namespace/my-topic` means the `my-topic` topic is created in the `my-namespace` namespace of the `my-tenant` tenant. + +## Namespaces + +A namespace is a logical nomenclature within a tenant. A tenant creates multiple namespaces via the [admin API](admin-api-namespaces.md#create). For instance, a tenant with different applications can create a separate namespace for each application. A namespace allows the application to create and manage a hierarchy of topics. The topic `my-tenant/app1` is a namespace for the application `app1` for `my-tenant`. You can create any number of [topics](#topics) under the namespace. 
+ +## Subscriptions + +A subscription is a named configuration rule that determines how messages are delivered to consumers. Four subscription types are available in Pulsar: [exclusive](#exclusive), [shared](#shared), [failover](#failover), and [key_shared](#key_shared). These types are illustrated in the figure below. + +![Subscription types](/assets/pulsar-subscription-types.png) + +> **Pub-Sub or Queuing** +> In Pulsar, you can use different subscriptions flexibly. +> * If you want to achieve traditional "fan-out pub-sub messaging" among consumers, specify a unique subscription name for each consumer. It is exclusive subscription type. +> * If you want to achieve "message queuing" among consumers, share the same subscription name among multiple consumers(shared, failover, key_shared). +> * If you want to achieve both effects simultaneously, combine exclusive subscription type with other subscription types for consumers. + +### Subscription types +When a subscription has no consumers, its subscription type is undefined. The type of a subscription is defined when a consumer connects to it, and the type can be changed by restarting all consumers with a different configuration. + +#### Exclusive + +In *exclusive* type, only a single consumer is allowed to attach to the subscription. If multiple consumers subscribe to a topic using the same subscription, an error occurs. + +In the diagram below, only **Consumer A-0** is allowed to consume messages. + +> Exclusive is the default subscription type. + +![Exclusive subscriptions](/assets/pulsar-exclusive-subscriptions.png) + +#### Failover + +In *Failover* type, multiple consumers can attach to the same subscription. A master consumer is picked for non-partitioned topic or each partition of partitioned topic and receives messages. When the master consumer disconnects, all (non-acknowledged and subsequent) messages are delivered to the next consumer in line. 
+ +For partitioned topics, the broker sorts consumers by priority level and lexicographical order of consumer name. Then the broker tries to evenly assign topics to consumers with the highest priority level. + +For non-partitioned topics, the broker picks consumers in the order they subscribe to the non-partitioned topic. + +In the diagram below, **Consumer-B-0** is the master consumer while **Consumer-B-1** would be the next consumer in line to receive messages if **Consumer-B-0** is disconnected. + +![Failover subscriptions](/assets/pulsar-failover-subscriptions.png) + +#### Shared + +In *shared* or *round robin* mode, multiple consumers can attach to the same subscription. Messages are delivered in a round robin distribution across consumers, and any given message is delivered to only one consumer. When a consumer disconnects, all the messages that were sent to it and not acknowledged will be rescheduled for sending to the remaining consumers. + +In the diagram below, **Consumer-C-1** and **Consumer-C-2** are able to subscribe to the topic, but **Consumer-C-3** and others could as well. + +> **Limitations of Shared type** +> When using Shared type, be aware that: +> * Message ordering is not guaranteed. +> * You cannot use cumulative acknowledgment with Shared type. + +![Shared subscriptions](/assets/pulsar-shared-subscriptions.png) + +#### Key_Shared + +In *Key_Shared* type, multiple consumers can attach to the same subscription. Messages are delivered in a distribution across consumers, and messages with the same key or the same ordering key are delivered to only one consumer. No matter how many times the message is re-delivered, it is delivered to the same consumer. When a consumer connects or disconnects, the consumer that serves some message keys changes. + +> **Limitations of Key_Shared type** +> When you use Key_Shared type, be aware that: +> * You need to specify a key or orderingKey for messages.
+> * You cannot use cumulative acknowledgment with Key_Shared type. +> * Your producers should disable batching or use a key-based batch builder. + +![Key_Shared subscriptions](/assets/pulsar-key-shared-subscriptions.png) + +**You can disable Key_Shared subscription in the `broker.conf` file.** + +### Subscription modes + +#### What is a subscription mode + +The subscription mode indicates the cursor type. + +- When a subscription is created, an associated cursor is created to record the last consumed position. +- When a consumer of the subscription restarts, it can continue consuming from the last message it consumed. + +Subscription mode | Description | Note +|---|---|--- +`Durable`|The cursor is durable, which retains messages and persists the current position.

    If a broker restarts from a failure, it can recover the cursor from the persistent storage (BookKeeper), so that messages can continue to be consumed from the last consumed position.|`Durable` is the **default** subscription mode. +`NonDurable`|The cursor is non-durable.

    Once a broker stops, the cursor is lost and can never be recovered, so that messages **can not** continue to be consumed from the last consumed position.|Reader’s subscription mode is `NonDurable` in nature and it does not prevent data in a topic from being deleted. Reader’s subscription mode **can not** be changed. + +A [subscription](#subscriptions) can have one or more consumers. When a consumer subscribes to a topic, it must specify the subscription name. A durable subscription and a non-durable subscription can have the same name, they are independent of each other. If a consumer specifies a subscription which does not exist before, the subscription is automatically created. + +#### When to use + +By default, messages of a topic without any durable subscriptions are marked as deleted. If you want to prevent the messages being marked as deleted, you can create a durable subscription for this topic. In this case, only acknowledged messages are marked as deleted. For more information, see [message retention and expiry](cookbooks-retention-expiry.md). + +#### How to use + +After a consumer is created, the default subscription mode of the consumer is `Durable`. You can change the subscription mode to `NonDurable` by making changes to the consumer’s configuration. + +````mdx-code-block + + + + +```java + + Consumer consumer = pulsarClient.newConsumer() + .topic("my-topic") + .subscriptionName("my-sub") + .subscriptionMode(SubscriptionMode.Durable) + .subscribe(); + +``` + + + + +```java + + Consumer consumer = pulsarClient.newConsumer() + .topic("my-topic") + .subscriptionName("my-sub") + .subscriptionMode(SubscriptionMode.NonDurable) + .subscribe(); + +``` + + + + +```` + +For how to create, check, or delete a durable subscription, see [manage subscriptions](admin-api-topics.md/#manage-subscriptions). 
+ +## Multi-topic subscriptions + +When a consumer subscribes to a Pulsar topic, by default it subscribes to one specific topic, such as `persistent://public/default/my-topic`. As of Pulsar version 1.23.0-incubating, however, Pulsar consumers can simultaneously subscribe to multiple topics. You can define a list of topics in two ways: + +* On the basis of a [**reg**ular **ex**pression](https://en.wikipedia.org/wiki/Regular_expression) (regex), for example `persistent://public/default/finance-.*` +* By explicitly defining a list of topics + +> When subscribing to multiple topics by regex, all topics must be in the same [namespace](#namespaces). + +When subscribing to multiple topics, the Pulsar client automatically makes a call to the Pulsar API to discover the topics that match the regex pattern/list, and then subscribe to all of them. If any of the topics do not exist, the consumer auto-subscribes to them once the topics are created. + +> **No ordering guarantees across multiple topics** +> When a producer sends messages to a single topic, all messages are guaranteed to be read from that topic in the same order. However, these guarantees do not hold across multiple topics. So when a producer sends message to multiple topics, the order in which messages are read from those topics is not guaranteed to be the same. + +The following are multi-topic subscription examples for Java. 
+ +```java + +import java.util.regex.Pattern; + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient pulsarClient = // Instantiate Pulsar client object + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*"); +Consumer allTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(allTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*"); +Consumer someTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(someTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +``` + +For code examples, see [Java](client-libraries-java.md#multi-topic-subscriptions). + +## Partitioned topics + +Normal topics are served only by a single broker, which limits the maximum throughput of the topic. *Partitioned topics* are a special type of topic that are handled by multiple brokers, thus allowing for higher throughput. + +A partitioned topic is actually implemented as N internal topics, where N is the number of partitions. When publishing messages to a partitioned topic, each message is routed to one of several brokers. The distribution of partitions across brokers is handled automatically by Pulsar. + +The diagram below illustrates this: + +![](/assets/partitioning.png) + +The **Topic1** topic has five partitions (**P0** through **P4**) split across three brokers. Because there are more partitions than brokers, two brokers handle two partitions a piece, while the third handles only one (again, Pulsar handles this distribution of partitions automatically). + +Messages for this topic are broadcast to two consumers. 
The [routing mode](#routing-modes) determines which partition each message should be published to, while the [subscription type](#subscription-types) determines which messages go to which consumers. + +Decisions about routing and subscription types can be made separately in most cases. In general, throughput concerns should guide partitioning/routing decisions while subscription decisions should be guided by application semantics. + +There is no difference between partitioned topics and normal topics in terms of how subscription types work, as partitioning only determines what happens between when a message is published by a producer and processed and acknowledged by a consumer. + +Partitioned topics need to be explicitly created via the [admin API](admin-api-overview.md). The number of partitions can be specified when creating the topic. + +### Routing modes + +When publishing to partitioned topics, you must specify a *routing mode*. The routing mode determines which partition---that is, which internal topic---each message should be published to. + +There are three {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} available: + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer will publish messages across all partitions in round-robin fashion to achieve maximum throughput. Please note that round-robin is not done per individual message but rather it's set to the same boundary of batching delay, to ensure batching is effective. If a key is specified on the message, the partitioned producer will hash the key and assign the message to a particular partition. This is the default mode. +`SinglePartition` | If no key is provided, the producer will randomly pick one single partition and publish all the messages into that partition. If a key is specified on the message, the partitioned producer will hash the key and assign the message to a particular partition.
+`CustomPartition` | Use custom message router implementation that will be called to determine the partition for a particular message. You can create a custom routing mode by using the [Java client](client-libraries-java.md) and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface. + +### Ordering guarantee + +The ordering of messages is related to MessageRoutingMode and Message Key. Usually, users want a Per-key-partition ordering guarantee. + +If there is a key attached to a message, the messages will be routed to corresponding partitions based on the hashing scheme specified by {@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} in {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder}, when using either `SinglePartition` or `RoundRobinPartition` mode. + +Ordering guarantee | Description | Routing Mode and Key +:------------------|:------------|:------------ +Per-key-partition | All the messages with the same key will be in order and be placed in same partition. | Use either `SinglePartition` or `RoundRobinPartition` mode, and Key is provided by each message. +Per-producer | All the messages from the same producer will be in order. | Use `SinglePartition` mode, and no Key is provided for each message. + +### Hashing scheme + +{@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} is an enum that represents the set of standard hashing functions available when choosing the partition to use for a particular message. + +There are 2 types of standard hashing functions available: `JavaStringHash` and `Murmur3_32Hash`. +The default hashing function for producer is `JavaStringHash`. +Please note that `JavaStringHash` is not useful when producers can be from clients in multiple different languages. In this use case, it is recommended to use `Murmur3_32Hash`.
+ + + +## Non-persistent topics + + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](concepts-architecture-overview.md#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. When using non-persistent delivery, killing a Pulsar broker or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http + +non-persistent://tenant/namespace/topic + +``` + +> For more info on using non-persistent topics, see the [Non-persistent messaging cookbook](cookbooks-non-persistent.md). + +In non-persistent topics, brokers immediately deliver messages to all connected subscribers *without persisting them* in [BookKeeper](concepts-architecture-overview.md#persistent-storage). If a subscriber is disconnected, the broker will not be able to deliver those in-transit messages, and subscribers will never be able to receive those messages again. Eliminating the persistent storage step makes messaging on non-persistent topics slightly faster than on persistent topics in some cases, but with the caveat that some of the core benefits of Pulsar are lost. + +> With non-persistent topics, message data lives only in memory. If a message broker fails or message data can otherwise not be retrieved from memory, your message data may be lost. Use non-persistent topics only if you're *certain* that your use case requires it and can sustain it. + +By default, non-persistent topics are enabled on Pulsar brokers. You can disable them in the broker's [configuration](reference-configuration.md#broker-enableNonPersistentTopics). 
You can manage non-persistent topics using the `pulsar-admin topics` command. For more information, see [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/). + +### Performance + +Non-persistent messaging is usually faster than persistent messaging because brokers don't persist messages and immediately send acks back to the producer as soon as that message is delivered to connected brokers. Producers thus see comparatively low publish latency with non-persistent topic. + +### Client API + +Producers and consumers can connect to non-persistent topics in the same way as persistent topics, with the crucial difference that the topic name must start with `non-persistent`. All three subscription types---[exclusive](#exclusive), [shared](#shared), and [failover](#failover)---are supported for non-persistent topics. + +Here's an example [Java consumer](client-libraries-java.md#consumers) for a non-persistent topic: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +String npTopic = "non-persistent://public/default/my-topic"; +String subscriptionName = "my-subscription-name"; + +Consumer consumer = client.newConsumer() + .topic(npTopic) + .subscriptionName(subscriptionName) + .subscribe(); + +``` + +Here's an example [Java producer](client-libraries-java.md#producer) for the same non-persistent topic: + +```java + +Producer producer = client.newProducer() + .topic(npTopic) + .create(); + +``` + + +## System topic + +System topic is a predefined topic for internal use within Pulsar. It can be either persistent or non-persistent topic. + +System topics serve to implement certain features and eliminate dependencies on third-party components, such as transactions, heartbeat detections, topic-level policies, and resource group services. System topics empower the implementation of these features to be simplified, dependent, and flexible. 
Take heartbeat detection as an example: you can leverage the system topic for healthcheck to internally enable a producer/reader to produce/consume messages under the heartbeat namespace, which can detect whether the current service is still alive. + +There are diverse system topics depending on namespaces. The following table outlines the available system topics for each specific namespace. + +| Namespace | TopicName | Domain | Count | Usage | +|-----------|-----------|--------|-------|-------| +| pulsar/system | `transaction_coordinator_assign_${id}` | Persistent | Default 16 | Transaction coordinator | +| pulsar/system | `_transaction_log${tc_id}` | Persistent | Default 16 | Transaction log | +| pulsar/system | `resource-usage` | Non-persistent | Default 4 | Resource group service | +| host/port | `heartbeat` | Persistent | 1 | Heartbeat detection | +| User-defined-ns | [`__change_events`](concepts-multi-tenancy.md#namespace-change-events-and-topic-level-policies) | Persistent | Default 4 | Topic events | +| User-defined-ns | `__transaction_buffer_snapshot` | Persistent | One per namespace | Transaction buffer snapshots | +| User-defined-ns | `${topicName}__transaction_pending_ack` | Persistent | One per every topic subscription acknowledged with transactions | Acknowledgements with transactions | + +:::note + +* You cannot create any system topics. +* By default, system topics are disabled. To enable system topics, you need to change the following configurations in the `conf/broker.conf` or `conf/standalone.conf` file. + + ```conf + systemTopicEnabled=true + topicLevelPoliciesEnabled=true + ``` + +::: + + +## Message retention and expiry + +By default, Pulsar message brokers: + +* immediately delete *all* messages that have been acknowledged by a consumer, and +* [persistently store](concepts-architecture-overview.md#persistent-storage) all unacknowledged messages in a message backlog.
+ +Pulsar has two features, however, that enable you to override this default behavior: + +* Message **retention** enables you to store messages that have been acknowledged by a consumer +* Message **expiry** enables you to set a time to live (TTL) for messages that have not yet been acknowledged + +> All message retention and expiry is managed at the [namespace](#namespaces) level. For a how-to, see the [Message retention and expiry](cookbooks-retention-expiry.md) cookbook. + +The diagram below illustrates both concepts: + +![Message retention and expiry](/assets/retention-expiry.png) + +With message retention, shown at the top, a retention policy applied to all topics in a namespace dictates that some messages are durably stored in Pulsar even though they've already been acknowledged. Acknowledged messages that are not covered by the retention policy are deleted. Without a retention policy, *all* of the acknowledged messages would be deleted. + +With message expiry, shown at the bottom, some messages are deleted, even though they haven't been acknowledged, because they've expired according to the TTL applied to the namespace (for example because a TTL of 5 minutes has been applied and the messages haven't been acknowledged but are 10 minutes old). + +## Message deduplication + +Message duplication occurs when a message is [persisted](concepts-architecture-overview.md#persistent-storage) by Pulsar more than once. Message deduplication is an optional Pulsar feature that prevents unnecessary message duplication by processing each message only once, even if the message is received more than once. + +The following diagram illustrates what happens when message deduplication is disabled vs. enabled: + +![Pulsar message deduplication](/assets/message-deduplication.png) + + +Message deduplication is disabled in the scenario shown at the top. 
Here, a producer publishes message 1 on a topic; the message reaches a Pulsar broker and is [persisted](concepts-architecture-overview.md#persistent-storage) to BookKeeper. The producer then sends message 1 again (in this case due to some retry logic), and the message is received by the broker and stored in BookKeeper again, which means that duplication has occurred. + +In the second scenario at the bottom, the producer publishes message 1, which is received by the broker and persisted, as in the first scenario. When the producer attempts to publish the message again, however, the broker knows that it has already seen message 1 and thus does not persist the message. + +> Message deduplication is handled at the namespace level or the topic level. For more instructions, see the [message deduplication cookbook](cookbooks-deduplication.md). + + +### Producer idempotency + +The other available approach to message deduplication is to ensure that each message is *only produced once*. This approach is typically called **producer idempotency**. The drawback of this approach is that it defers the work of message deduplication to the application. In Pulsar, this is handled at the [broker](reference-terminology.md#broker) level, so you do not need to modify your Pulsar client code. Instead, you only need to make administrative changes. For details, see [Managing message deduplication](cookbooks-deduplication.md). + +### Deduplication and effectively-once semantics + +Message deduplication makes Pulsar an ideal messaging system to be used in conjunction with stream processing engines (SPEs) and other systems seeking to provide effectively-once processing semantics. Messaging systems that do not offer automatic message deduplication require the SPE or other system to guarantee deduplication, which means that strict message ordering comes at the cost of burdening the application with the responsibility of deduplication. 
With Pulsar, strict ordering guarantees come at no application-level cost. + +> You can find more in-depth information in [this post](https://www.splunk.com/en_us/blog/it/exactly-once-is-not-exactly-the-same.html). + +## Delayed message delivery +Delayed message delivery enables you to consume a message later rather than immediately. In this mechanism, a message is stored in BookKeeper, `DelayedDeliveryTracker` maintains the time index(time -> messageId) in memory after published to a broker, and it is delivered to a consumer once the specific delayed time is passed. + +Delayed message delivery only works in Shared subscription type. In Exclusive and Failover subscription types, the delayed message is dispatched immediately. + +The diagram below illustrates the concept of delayed message delivery: + +![Delayed Message Delivery](/assets/message_delay.png) + +A broker saves a message without any check. When a consumer consumes a message, if the message is set to delay, then the message is added to `DelayedDeliveryTracker`. A subscription checks and gets timeout messages from `DelayedDeliveryTracker`. + +### Broker +Delayed message delivery is enabled by default. You can change it in the broker configuration file as below: + +``` + +# Whether to enable the delayed delivery for messages. +# If disabled, messages are immediately delivered and there is no tracking overhead. +delayedDeliveryEnabled=true + +# Control the ticking time for the retry of delayed message delivery, +# affecting the accuracy of the delivery time compared to the scheduled time. +# Default is 1 second. 
+delayedDeliveryTickTimeMillis=1000 + +``` + +### Producer +The following is an example of delayed message delivery for a producer in Java: + +```java + +// message to be delivered at the configured delay interval +producer.newMessage().deliverAfter(3L, TimeUnit.MINUTES).value("Hello Pulsar!").send(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-multi-tenancy.md b/site2/website/versioned_docs/version-2.8.x/concepts-multi-tenancy.md new file mode 100644 index 0000000000000..93a59557b2efc --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-multi-tenancy.md @@ -0,0 +1,67 @@ +--- +id: concepts-multi-tenancy +title: Multi Tenancy +sidebar_label: "Multi Tenancy" +original_id: concepts-multi-tenancy +--- + +Pulsar was created from the ground up as a multi-tenant system. To support multi-tenancy, Pulsar has a concept of tenants. Tenants can be spread across clusters and can each have their own [authentication and authorization](security-overview.md) scheme applied to them. They are also the administrative unit at which storage quotas, [message TTL](cookbooks-retention-expiry.md#time-to-live-ttl), and isolation policies can be managed. + +The multi-tenant nature of Pulsar is reflected most visibly in topic URLs, which have this structure: + +```http + +persistent://tenant/namespace/topic + +``` + +As you can see, the tenant is the most basic unit of categorization for topics (more fundamental than the namespace and topic name). + +## Tenants + +To each tenant in a Pulsar instance you can assign: + +* An [authorization](security-authorization.md) scheme +* The set of [clusters](reference-terminology.md#cluster) to which the tenant's configuration applies + +## Namespaces + +Tenants and namespaces are two key concepts of Pulsar to support multi-tenancy. + +* Pulsar is provisioned for specified tenants with appropriate capacity allocated to the tenant. +* A namespace is the administrative unit nomenclature within a tenant.
The configuration policies set on a namespace apply to all the topics created in that namespace. A tenant may create multiple namespaces via self-administration using the REST API and the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. For instance, a tenant with different applications can create a separate namespace for each application. + +Names for topics in the same namespace will look like this: + +```http + +persistent://tenant/app1/topic-1 + +persistent://tenant/app1/topic-2 + +persistent://tenant/app1/topic-3 + +``` + +### Namespace change events and topic-level policies + +Pulsar is a multi-tenant event streaming system. Administrators can manage the tenants and namespaces by setting policies at different levels. However, the policies, such as retention policy and storage quota policy, are only available at a namespace level. In many use cases, users need to set a policy at the topic level. The namespace change events approach is proposed for supporting topic-level policies in an efficient way. In this approach, Pulsar is used as an event log to store namespace change events (such as topic policy changes). This approach has a few benefits: +- Avoid using ZooKeeper and introducing more loads to ZooKeeper. +- Use Pulsar as an event log for propagating the policy cache. It can scale efficiently. +- Use Pulsar SQL to query the namespace changes and audit the system. + +Each namespace has a [system topic](concepts-messaging.md#system-topic) named `__change_events`. This system topic stores change events for a given namespace. The following figure illustrates how to leverage it to update topic-level policies. + +![Leverage the system topic to update topic-level policies](/assets/system-topic-for-topic-level-policies.svg) + +1. Pulsar Admin clients communicate with the Admin Restful API to update topic-level policies. +2. 
Any broker that receives the Admin HTTP request publishes a topic policy change event to the corresponding system topic (`__change_events`) of the namespace. +3. Each broker that owns a namespace bundle(s) subscribes to the system topic (`__change_events`) to receive the change events of the namespace. +4. Each broker applies the change events to its policy cache. +5. Once the policy cache is updated, the broker sends the response back to the Pulsar Admin clients. + +:::note + +By default, the system topic is disabled. To enable topic-level policy (`topicLevelPoliciesEnabled`=`true`), you need to enable the system topic by setting `systemTopicEnabled` to `true` in the `conf/broker.conf` or `conf/standalone.conf` file. + +::: \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-multiple-advertised-listeners.md b/site2/website/versioned_docs/version-2.8.x/concepts-multiple-advertised-listeners.md new file mode 100644 index 0000000000000..f2e1ae0aadc7c --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-multiple-advertised-listeners.md @@ -0,0 +1,44 @@ +--- +id: concepts-multiple-advertised-listeners +title: Multiple advertised listeners +sidebar_label: "Multiple advertised listeners" +original_id: concepts-multiple-advertised-listeners +--- + +When a Pulsar cluster is deployed in the production environment, you may need to expose multiple advertised addresses for the broker. For example, when you deploy a Pulsar cluster in Kubernetes and want other clients, which are not in the same Kubernetes cluster, to connect to the Pulsar cluster, you need to assign a broker URL to external clients. But clients in the same Kubernetes cluster can still connect to the Pulsar cluster through the internal network of Kubernetes.
+ +## Advertised listeners + +To ensure clients in both internal and external networks can connect to a Pulsar cluster, Pulsar introduces `advertisedListeners` and `internalListenerName` configuration options into the [broker configuration file](reference-configuration.md#broker) to ensure that the broker supports exposing multiple advertised listeners and supports the separation of internal and external network traffic. + +- The `advertisedListeners` is used to specify multiple advertised listeners. The broker uses the listener as the broker identifier in the load manager and the bundle owner data. The `advertisedListeners` is formatted as `<listener_name>:pulsar://<host>:<port>, <listener_name>:pulsar+ssl://<host>:<port>`. You can set up the `advertisedListeners` like +`advertisedListeners=internal:pulsar://192.168.1.11:6660,internal:pulsar+ssl://192.168.1.11:6651`. + +- The `internalListenerName` is used to specify the internal service URL that the broker uses. You can specify the `internalListenerName` by choosing one of the `advertisedListeners`. The broker uses the listener name of the first advertised listener as the `internalListenerName` if the `internalListenerName` is absent. + +After setting up the `advertisedListeners`, clients can choose one of the listeners as the service URL to create a connection to the broker as long as the network is accessible. However, if the client creates producers or consumers on a topic, the client must send a lookup request to the broker to get the owner broker, then connect to the owner broker to publish messages or consume messages. Therefore, you must allow the client to get the corresponding service URL with the same advertised listener name as the one used by the client. This helps keep the client side simple and secure. + +## Use multiple advertised listeners + +This example shows how a Pulsar client uses multiple advertised listeners. + +1. Configure multiple advertised listeners in the broker configuration file.
+ +```shell + +advertisedListeners={listenerName}:pulsar://xxxx:6650, +{listenerName}:pulsar+ssl://xxxx:6651 + +``` + +2. Specify the listener name for the client. + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://xxxx:6650") + .listenerName("external") + .build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-overview.md b/site2/website/versioned_docs/version-2.8.x/concepts-overview.md new file mode 100644 index 0000000000000..e8a2f4b9d321a --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-overview.md @@ -0,0 +1,31 @@ +--- +id: concepts-overview +title: Pulsar Overview +sidebar_label: "Overview" +original_id: concepts-overview +--- + +Pulsar is a multi-tenant, high-performance solution for server-to-server messaging. Originally developed by Yahoo, Pulsar is under the stewardship of the [Apache Software Foundation](https://www.apache.org/). + +Key features of Pulsar are listed below: + +* Native support for multiple clusters in a Pulsar instance, with seamless [geo-replication](administration-geo.md) of messages across clusters. +* Very low publish and end-to-end latency. +* Seamless scalability to over a million topics. +* A simple [client API](concepts-clients.md) with bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). +* Multiple [subscription types](concepts-messaging.md#subscription-types) ([exclusive](concepts-messaging.md#exclusive), [shared](concepts-messaging.md#shared), and [failover](concepts-messaging.md#failover)) for topics. +* Guaranteed message delivery with [persistent message storage](concepts-architecture-overview.md#persistent-storage) provided by [Apache BookKeeper](http://bookkeeper.apache.org/). +* A serverless light-weight computing framework [Pulsar Functions](functions-overview.md) offers the capability for stream-native data processing. 
+* A serverless connector framework [Pulsar IO](io-overview.md), which is built on Pulsar Functions, makes it easier to move data in and out of Apache Pulsar. +* [Tiered Storage](concepts-tiered-storage.md) offloads data from hot/warm storage to cold/long-term storage (such as S3 and GCS) when the data is aging out. + +## Contents + +- [Messaging Concepts](concepts-messaging.md) +- [Architecture Overview](concepts-architecture-overview.md) +- [Pulsar Clients](concepts-clients.md) +- [Geo Replication](concepts-replication.md) +- [Multi Tenancy](concepts-multi-tenancy.md) +- [Authentication and Authorization](concepts-authentication.md) +- [Topic Compaction](concepts-topic-compaction.md) +- [Tiered Storage](concepts-tiered-storage.md) diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-proxy-sni-routing.md b/site2/website/versioned_docs/version-2.8.x/concepts-proxy-sni-routing.md new file mode 100644 index 0000000000000..7eee6df5512a2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-proxy-sni-routing.md @@ -0,0 +1,180 @@ +--- +id: concepts-proxy-sni-routing +title: Proxy support with SNI routing +sidebar_label: "Proxy support with SNI routing" +original_id: concepts-proxy-sni-routing +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +A proxy server is an intermediary server that forwards requests from multiple clients to different servers across the Internet. The proxy server acts as a "traffic cop" in both forward and reverse proxy scenarios, and benefits your system such as load balancing, performance, security, auto-scaling, and so on. + +The proxy in Pulsar acts as a reverse proxy, and creates a gateway in front of brokers. Proxies such as Apache Traffic Server (ATS), HAProxy, Nginx, and Envoy are not supported in Pulsar. These proxy-servers support **SNI routing**. SNI routing is used to route traffic to a destination without terminating the SSL connection. 
Layer 4 routing provides greater transparency because the outbound connection is determined by examining the destination address in the client TCP packets. + +Pulsar clients (Java, C++, Python) support [SNI routing protocol](https://github.com/apache/pulsar/wiki/PIP-60:-Support-Proxy-server-with-SNI-routing), so you can connect to brokers through the proxy. This document walks you through how to set up the ATS proxy, enable SNI routing, and connect Pulsar client to the broker through the ATS proxy. + +## ATS-SNI Routing in Pulsar +To support [layer-4 SNI routing](https://docs.trafficserver.apache.org/en/latest/admin-guide/layer-4-routing.en.html) with ATS, the inbound connection must be a TLS connection. Pulsar client supports SNI routing protocol on TLS connection, so when Pulsar clients connect to broker through ATS proxy, Pulsar uses ATS as a reverse proxy. + +Pulsar supports SNI routing for geo-replication, so brokers can connect to brokers in other clusters through the ATS proxy. + +This section explains how to set up and use ATS as a reverse proxy, so Pulsar clients can connect to brokers through the ATS proxy using the SNI routing protocol on TLS connection. + +### Set up ATS Proxy for layer-4 SNI routing +To support layer 4 SNI routing, you need to configure the `records.conf` and `ssl_server_name.conf` files. + +![Pulsar client SNI](/assets/pulsar-sni-client.png) + +The [records.config](https://docs.trafficserver.apache.org/en/latest/admin-guide/files/records.config.en.html) file is located in the `/usr/local/etc/trafficserver/` directory by default. The file lists configurable variables used by the ATS. + +To configure the `records.config` files, complete the following steps. +1. Update TLS port (`http.server_ports`) on which proxy listens, and update proxy certs (`ssl.client.cert.path` and `ssl.client.cert.filename`) to secure TLS tunneling. +2. Configure server ports (`http.connect_ports`) used for tunneling to the broker. 
If Pulsar brokers are listening on `4443` and `6651` ports, add the brokers service port in the `http.connect_ports` configuration. + +The following is an example. + +``` + +# PROXY TLS PORT +CONFIG proxy.config.http.server_ports STRING 4443:ssl 4080 +# PROXY CERTS FILE PATH +CONFIG proxy.config.ssl.client.cert.path STRING /proxy-cert.pem +# PROXY KEY FILE PATH +CONFIG proxy.config.ssl.client.cert.filename STRING /proxy-key.pem + + +# The range of origin server ports that can be used for tunneling via CONNECT. # Traffic Server allows tunnels only to the specified ports. Supports both wildcards (*) and ranges (e.g. 0-1023). +CONFIG proxy.config.http.connect_ports STRING 4443 6651 + +``` + +The `ssl_server_name` file is used to configure TLS connection handling for inbound and outbound connections. The configuration is determined by the SNI values provided by the inbound connection. The file consists of a set of configuration items, and each is identified by an SNI value (`fqdn`). When an inbound TLS connection is made, the SNI value from the TLS negotiation is matched with the items specified in this file. If the values match, the values specified in that item override the default values. + +The following example shows mapping of the inbound SNI hostname coming from the client, and the actual broker service URL where request should be redirected. For example, if the client sends the SNI header `pulsar-broker1`, the proxy creates a TLS tunnel by redirecting request to the `pulsar-broker1:6651` service URL. 
+ +``` + +server_config = { + { + fqdn = 'pulsar-broker-vip', + # Forward to Pulsar broker which is listening on 6651 + tunnel_route = 'pulsar-broker-vip:6651' + }, + { + fqdn = 'pulsar-broker1', + # Forward to Pulsar broker-1 which is listening on 6651 + tunnel_route = 'pulsar-broker1:6651' + }, + { + fqdn = 'pulsar-broker2', + # Forward to Pulsar broker-2 which is listening on 6651 + tunnel_route = 'pulsar-broker2:6651' + }, +} + +``` + +After you configure the `ssl_server_name.config` and `records.config` files, the ATS-proxy server handles SNI routing and creates a TCP tunnel between the client and the broker. + +### Configure Pulsar-client with SNI routing +ATS SNI-routing works only with TLS. You need to enable TLS for the ATS proxy and brokers first, configure the SNI routing protocol, and then connect Pulsar clients to brokers through ATS proxy. Pulsar clients support SNI routing by connecting to the proxy, and sending the target broker URL in the SNI header. This process is handled internally. You only need to configure the following proxy configuration initially when you create a Pulsar client to use the SNI routing protocol. 
+ +````mdx-code-block + + + + +```java + +String brokerServiceUrl = “pulsar+ssl://pulsar-broker-vip:6651/”; +String proxyUrl = “pulsar+ssl://ats-proxy:443”; +ClientBuilder clientBuilder = PulsarClient.builder() + .serviceUrl(brokerServiceUrl) + .tlsTrustCertsFilePath(TLS_TRUST_CERT_FILE_PATH) + .enableTls(true) + .allowTlsInsecureConnection(false) + .proxyServiceUrl(proxyUrl, ProxyProtocol.SNI) + .operationTimeout(1000, TimeUnit.MILLISECONDS); + +Map authParams = new HashMap(); +authParams.put("tlsCertFile", TLS_CLIENT_CERT_FILE_PATH); +authParams.put("tlsKeyFile", TLS_CLIENT_KEY_FILE_PATH); +clientBuilder.authentication(AuthenticationTls.class.getName(), authParams); + +PulsarClient pulsarClient = clientBuilder.build(); + +``` + + + + +```c++ + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/cacert.pem"); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create( + "/path/to/client-cert.pem", "/path/to/client-key.pem");); + +Client client("pulsar+ssl://ats-proxy:443", config); + +``` + + + + +```python + +from pulsar import Client, AuthenticationTLS + +auth = AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://ats-proxy:443", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) + +``` + + + + +```` + +### Pulsar geo-replication with SNI routing +You can use the ATS proxy for geo-replication. Pulsar brokers can connect to brokers in geo-replication by using SNI routing. To enable SNI routing for broker connection cross clusters, you need to configure SNI proxy URL to the cluster metadata. If you have configured SNI proxy URL in the cluster metadata, you can connect to broker cross clusters through the proxy over SNI routing. 
+ +![Pulsar client SNI](/assets/pulsar-sni-geo.png) + +In this example, a Pulsar cluster is deployed into two separate regions, `us-west` and `us-east`. Both regions are configured with ATS proxy, and brokers in each region run behind the ATS proxy. We configure the cluster metadata for both clusters, so brokers in one cluster can use SNI routing and connect to brokers in other clusters through the ATS proxy. + +(a) Configure the cluster metadata for `us-east` with `us-east` broker service URL and `us-east` ATS proxy URL with SNI proxy-protocol. + +``` + +./pulsar-admin clusters update \ +--broker-url-secure pulsar+ssl://east-broker-vip:6651 \ +--url http://east-broker-vip:8080 \ +--proxy-protocol SNI \ +--proxy-url pulsar+ssl://east-ats-proxy:443 + +``` + +(b) Configure the cluster metadata for `us-west` with `us-west` broker service URL and `us-west` ATS proxy URL with SNI proxy-protocol. + +``` + +./pulsar-admin clusters update \ +--broker-url-secure pulsar+ssl://west-broker-vip:6651 \ +--url http://west-broker-vip:8080 \ +--proxy-protocol SNI \ +--proxy-url pulsar+ssl://west-ats-proxy:443 + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-replication.md b/site2/website/versioned_docs/version-2.8.x/concepts-replication.md new file mode 100644 index 0000000000000..799f0eb4d92c6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-replication.md @@ -0,0 +1,9 @@ +--- +id: concepts-replication +title: Geo Replication +sidebar_label: "Geo Replication" +original_id: concepts-replication +--- + +Pulsar enables messages to be produced and consumed in different geo-locations. For instance, your application may be publishing data in one region or market and you would like to process it for consumption in other regions or markets. [Geo-replication](administration-geo.md) in Pulsar enables you to do that. 
+ diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-tiered-storage.md b/site2/website/versioned_docs/version-2.8.x/concepts-tiered-storage.md new file mode 100644 index 0000000000000..f6988e53a8cd4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-tiered-storage.md @@ -0,0 +1,18 @@ +--- +id: concepts-tiered-storage +title: Tiered Storage +sidebar_label: "Tiered Storage" +original_id: concepts-tiered-storage +--- + +Pulsar's segment oriented architecture allows for topic backlogs to grow very large, effectively without limit. However, this can become expensive over time. + +One way to alleviate this cost is to use Tiered Storage. With tiered storage, older messages in the backlog can be moved from BookKeeper to a cheaper storage mechanism, while still allowing clients to access the backlog as if nothing had changed. + +![Tiered Storage](/assets/pulsar-tiered-storage.png) + +> Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. + +Pulsar currently supports S3, Google Cloud Storage (GCS), and filesystem for [long term store](https://pulsar.apache.org/docs/en/cookbooks-tiered-storage/). Offloading to long term storage is triggered via a REST API or command line interface. The user passes in the amount of topic data they wish to retain on BookKeeper, and the broker will copy the backlog data to long term storage. The original data will then be deleted from BookKeeper after a configured delay (4 hours by default). + +> For a guide for setting up tiered storage, see the [Tiered storage cookbook](cookbooks-tiered-storage.md). 
diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-topic-compaction.md b/site2/website/versioned_docs/version-2.8.x/concepts-topic-compaction.md new file mode 100644 index 0000000000000..34b7ed7fbbd31 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-topic-compaction.md @@ -0,0 +1,37 @@ +--- +id: concepts-topic-compaction +title: Topic Compaction +sidebar_label: "Topic Compaction" +original_id: concepts-topic-compaction +--- + +Pulsar was built with highly scalable [persistent storage](concepts-architecture-overview.md#persistent-storage) of message data as a primary objective. Pulsar topics enable you to persistently store as many unacknowledged messages as you need while preserving message ordering. By default, Pulsar stores *all* unacknowledged/unprocessed messages produced on a topic. Accumulating many unacknowledged messages on a topic is necessary for many Pulsar use cases but it can also be very time intensive for Pulsar consumers to "rewind" through the entire log of messages. + +> For a more practical guide to topic compaction, see the [Topic compaction cookbook](cookbooks-compaction.md). + +For some use cases consumers don't need a complete "image" of the topic log. They may only need a few values to construct a more "shallow" image of the log, perhaps even just the most recent value. For these kinds of use cases Pulsar offers **topic compaction**. When you run compaction on a topic, Pulsar goes through a topic's backlog and removes messages that are *obscured* by later messages, i.e. it goes through the topic on a per-key basis and leaves only the most recent message associated with that key. + +Pulsar's topic compaction feature: + +* Allows for faster "rewind" through topic logs +* Applies only to [persistent topics](concepts-architecture-overview.md#persistent-storage) +* Triggered automatically when the backlog reaches a certain size or can be triggered manually via the command line. 
See the [Topic compaction cookbook](cookbooks-compaction.md) +* Is conceptually and operationally distinct from [retention and expiry](concepts-messaging.md#message-retention-and-expiry). Topic compaction *does*, however, respect retention. If retention has removed a message from the message backlog of a topic, the message will also not be readable from the compacted topic ledger. + +> #### Topic compaction example: the stock ticker +> An example use case for a compacted Pulsar topic would be a stock ticker topic. On a stock ticker topic, each message bears a timestamped dollar value for stocks for purchase (with the message key holding the stock symbol, e.g. `AAPL` or `GOOG`). With a stock ticker you may care only about the most recent value(s) of the stock and have no interest in historical data (i.e. you don't need to construct a complete image of the topic's sequence of messages per key). Compaction would be highly beneficial in this case because it would keep consumers from needing to rewind through obscured messages. + + +## How topic compaction works + +When topic compaction is triggered [via the CLI](cookbooks-compaction.md), Pulsar will iterate over the entire topic from beginning to end. For each key that it encounters the compaction routine will keep a record of the latest occurrence of that key. + +After that, the broker will create a new [BookKeeper ledger](concepts-architecture-overview.md#ledgers) and make a second iteration through each message on the topic. For each message, if the key matches the latest occurrence of that key, then the key's data payload, message ID, and metadata will be written to the newly created ledger. If the key doesn't match the latest then the message will be skipped and left alone. If any given message has an empty payload, it will be skipped and considered deleted (akin to the concept of [tombstones](https://en.wikipedia.org/wiki/Tombstone_(data_store)) in key-value databases). 
At the end of this second iteration through the topic, the newly created BookKeeper ledger is closed and two things are written to the topic's metadata: the ID of the BookKeeper ledger and the message ID of the last compacted message (this is known as the **compaction horizon** of the topic). Once this metadata is written compaction is complete. + +After the initial compaction operation, the Pulsar [broker](reference-terminology.md#broker) that owns the topic is notified whenever any future changes are made to the compaction horizon and compacted backlog. When such changes occur: + +* Clients (consumers and readers) that have read compacted enabled will attempt to read messages from a topic and either: + * Read from the topic like normal (if the message ID is greater than or equal to the compaction horizon) or + * Read beginning at the compaction horizon (if the message ID is lower than the compaction horizon) + + diff --git a/site2/website/versioned_docs/version-2.8.x/concepts-transactions.md b/site2/website/versioned_docs/version-2.8.x/concepts-transactions.md new file mode 100644 index 0000000000000..08490ba06b5d7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/concepts-transactions.md @@ -0,0 +1,30 @@ +--- +id: transactions +title: Transactions +sidebar_label: "Overview" +original_id: transactions +--- + +Transactional semantics enable event streaming applications to consume, process, and produce messages in one atomic operation. In Pulsar, a producer or consumer can work with messages across multiple topics and partitions and ensure those messages are processed as a single unit. + +The following concepts help you understand Pulsar transactions. + +## Transaction coordinator and transaction log +The transaction coordinator maintains the topics and subscriptions that interact in a transaction. When a transaction is committed, the transaction coordinator interacts with the topic owner broker to complete the transaction. 
+ +The transaction coordinator maintains the entire life cycle of transactions, and prevents a transaction from entering an incorrect status. + +The transaction coordinator handles transaction timeout, and ensures that the transaction is aborted after a transaction timeout. + +All the transaction metadata is persisted in the transaction log. The transaction log is backed by a Pulsar topic. After the transaction coordinator crashes, it can restore the transaction metadata from the transaction log. + +## Transaction ID +The transaction ID (TxnID) identifies a unique transaction in Pulsar. The transaction ID is 128-bit. The highest 16 bits are reserved for the ID of the transaction coordinator, and the remaining bits are used for monotonically increasing numbers in each transaction coordinator. The TxnID makes it easy to locate a transaction that has crashed. + +## Transaction buffer +Messages produced within a transaction are stored in the transaction buffer. The messages in the transaction buffer are not materialized (visible) to consumers until the transactions are committed. The messages in the transaction buffer are discarded when the transactions are aborted. + +## Pending acknowledge state +Message acknowledgements within a transaction are maintained by the pending acknowledge state before the transaction completes. If a message is in the pending acknowledge state, the message cannot be acknowledged by other transactions until the message is removed from the pending acknowledge state. + +The pending acknowledge state is persisted to the pending acknowledge log. The pending acknowledge log is backed by a Pulsar topic. A new broker can restore the state from the pending acknowledge log to ensure the acknowledgement is not lost. 
diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-bookkeepermetadata.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-bookkeepermetadata.md new file mode 100644 index 0000000000000..b0fa98dc3b65d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-bookkeepermetadata.md @@ -0,0 +1,21 @@ +--- +id: cookbooks-bookkeepermetadata +title: BookKeeper Ledger Metadata +original_id: cookbooks-bookkeepermetadata +--- + +Pulsar stores data on BookKeeper ledgers, you can understand the contents of a ledger by inspecting the metadata attached to the ledger. +Such metadata are stored on ZooKeeper and they are readable using BookKeeper APIs. + +Description of current metadata: + +| Scope | Metadata name | Metadata value | +| ------------- | ------------- | ------------- | +| All ledgers | application | 'pulsar' | +| All ledgers | component | 'managed-ledger', 'schema', 'compacted-topic' | +| Managed ledgers | pulsar/managed-ledger | name of the ledger | +| Cursor | pulsar/cursor | name of the cursor | +| Compacted topic | pulsar/compactedTopic | name of the original topic | +| Compacted topic | pulsar/compactedTo | id of the last compacted message | + + diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-compaction.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-compaction.md new file mode 100644 index 0000000000000..dfa314727241a --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-compaction.md @@ -0,0 +1,142 @@ +--- +id: cookbooks-compaction +title: Topic compaction +sidebar_label: "Topic compaction" +original_id: cookbooks-compaction +--- + +Pulsar's [topic compaction](concepts-topic-compaction.md#compaction) feature enables you to create **compacted** topics in which older, "obscured" entries are pruned from the topic, allowing for faster reads through the topic's history (which messages are deemed obscured/outdated/irrelevant will depend on your use case). 
+ +To use compaction: + +* You need to give messages keys, as topic compaction in Pulsar takes place on a *per-key basis* (i.e. messages are compacted based on their key). For a stock ticker use case, the stock symbol---e.g. `AAPL` or `GOOG`---could serve as the key (more on this [below](#when-should-i-use-compacted-topics)). Messages without keys will be left alone by the compaction process. +* Compaction can be configured to run [automatically](#configuring-compaction-to-run-automatically), or you can manually [trigger](#triggering-compaction-manually) compaction using the Pulsar administrative API. +* Your consumers must be [configured](#consumer-configuration) to read from compacted topics ([Java consumers](#java), for example, have a `readCompacted` setting that must be set to `true`). If this configuration is not set, consumers will still be able to read from the non-compacted topic. + + +> Compaction only works on messages that have keys (as in the stock ticker example the stock symbol serves as the key for each message). Keys can thus be thought of as the axis along which compaction is applied. Messages that don't have keys are simply ignored by compaction. + +## When should I use compacted topics? + +The classic example of a topic that could benefit from compaction would be a stock ticker topic through which consumers can access up-to-date values for specific stocks. Imagine a scenario in which messages carrying stock value data use the stock symbol as the key (`GOOG`, `AAPL`, `TWTR`, etc.). Compacting this topic would give consumers on the topic two options: + +* They can read from the "original," non-compacted topic in case they need access to "historical" values, i.e. the entirety of the topic's messages. +* They can read from the compacted topic if they only want to see the most up-to-date messages. 
+ +Thus, if you're using a Pulsar topic called `stock-values`, some consumers could have access to all messages in the topic (perhaps because they're performing some kind of number crunching of all values in the last hour) while the consumers used to power the real-time stock ticker only see the compacted topic (and thus aren't forced to process outdated messages). Which variant of the topic any given consumer pulls messages from is determined by the consumer's [configuration](#consumer-configuration). + +> One of the benefits of compaction in Pulsar is that you aren't forced to choose between compacted and non-compacted topics, as the compaction process leaves the original topic as-is and essentially adds an alternate topic. In other words, you can run compaction on a topic and consumers that need access to the non-compacted version of the topic will not be adversely affected. + + +## Configuring compaction to run automatically + +Tenant administrators can configure a policy for compaction at the namespace level. The policy specifies how large the topic backlog can grow before compaction is triggered. + +For example, to trigger compaction when the backlog reaches 100MB: + +```bash + +$ bin/pulsar-admin namespaces set-compaction-threshold \ + --threshold 100M my-tenant/my-namespace + +``` + +Configuring the compaction threshold on a namespace will apply to all topics within that namespace. + +## Triggering compaction manually + +In order to run compaction on a topic, you need to use the [`topics compact`](reference-pulsar-admin.md#topics-compact) command for the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. Here's an example: + +```bash + +$ bin/pulsar-admin topics compact \ + persistent://my-tenant/my-namespace/my-topic + +``` + +The `pulsar-admin` tool runs compaction via the Pulsar {@inject: rest:REST:/} API. To run compaction in its own dedicated process, i.e. 
*not* through the REST API, you can use the [`pulsar compact-topic`](reference-cli-tools.md#pulsar-compact-topic) command. Here's an example: + +```bash + +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic + +``` + +> Running compaction in its own process is recommended when you want to avoid interfering with the broker's performance. Broker performance should only be affected, however, when running compaction on topics with a large keyspace (i.e. when there are many keys on the topic). The first phase of the compaction process keeps a copy of each key in the topic, which can create memory pressure as the number of keys grows. Using the `pulsar-admin topics compact` command to run compaction through the REST API should present no issues in the overwhelming majority of cases; using `pulsar compact-topic` should correspondingly be considered an edge case. + +The `pulsar compact-topic` command communicates with [ZooKeeper](https://zookeeper.apache.org) directly. In order to establish communication with ZooKeeper, though, the `pulsar` CLI tool will need to have a valid [broker configuration](reference-configuration.md#broker). You can either supply a proper configuration in `conf/broker.conf` or specify a non-default location for the configuration: + +```bash + +$ bin/pulsar compact-topic \ + --broker-conf /path/to/broker.conf \ + --topic persistent://my-tenant/my-namespace/my-topic + +# If the configuration is in conf/broker.conf +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic + +``` + +#### When should I trigger compaction? + +How often you [trigger compaction](#triggering-compaction-manually) will vary widely based on the use case. If you want a compacted topic to be extremely speedy on read, then you should run compaction fairly frequently. + +## Consumer configuration + +Pulsar consumers and readers need to be configured to read from compacted topics. 
The sections below show you how to enable compacted topic reads for Pulsar's language clients. + +### Java + +In order to read from a compacted topic using a Java consumer, the `readCompacted` parameter must be set to `true`. Here's an example consumer for a compacted topic: + +```java + +Consumer compactedTopicConsumer = client.newConsumer() + .topic("some-compacted-topic") + .readCompacted(true) + .subscribe(); + +``` + +As mentioned above, topic compaction in Pulsar works on a *per-key basis*. That means that messages that you produce on compacted topics need to have keys (the content of the key will depend on your use case). Messages that don't have keys will be ignored by the compaction process. Here's an example Pulsar message with a key: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); + +``` + +The example below shows a message with a key being produced on a compacted Pulsar topic: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer compactedTopicProducer = client.newProducer() + .topic("some-compacted-topic") + .create(); + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); + +compactedTopicProducer.send(msg); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-deduplication.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-deduplication.md new file mode 100644 index 0000000000000..f7f9e3d7bb425 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-deduplication.md @@ -0,0 +1,151 @@ +--- +id: cookbooks-deduplication +title: 
Message deduplication +sidebar_label: "Message deduplication" +original_id: cookbooks-deduplication +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +When **Message deduplication** is enabled, it ensures that each message produced on Pulsar topics is persisted to disk *only once*, even if the message is produced more than once. Message deduplication is handled automatically on the server side. + +To use message deduplication in Pulsar, you need to configure your Pulsar brokers and clients. + +## How it works + +You can enable or disable message deduplication at the namespace level or the topic level. By default, it is disabled on all namespaces or topics. You can enable it in the following ways: + +* Enable deduplication for all namespaces/topics at the broker-level. +* Enable deduplication for a specific namespace with the `pulsar-admin namespaces` interface. +* Enable deduplication for a specific topic with the `pulsar-admin topics` interface. + +## Configure message deduplication + +You can configure message deduplication in Pulsar using the [`broker.conf`](reference-configuration.md#broker) configuration file. The following deduplication-related parameters are available. + +Parameter | Description | Default +:---------|:------------|:------- +`brokerDeduplicationEnabled` | Sets the default behavior for message deduplication in the Pulsar broker. If it is set to `true`, message deduplication is enabled on all namespaces/topics. If it is set to `false`, you have to enable or disable deduplication at the namespace level or the topic level. | `false` +`brokerDeduplicationMaxNumberOfProducers` | The maximum number of producers for which information is stored for deduplication purposes. | `10000` +`brokerDeduplicationEntriesInterval` | The number of entries after which a deduplication informational snapshot is taken. 
A larger interval leads to fewer snapshots being taken, though this lengthens the topic recovery time (the time required for entries published after the snapshot to be replayed). | `1000` +`brokerDeduplicationSnapshotIntervalSeconds`| The time period after which a deduplication informational snapshot is taken. It runs simultaneously with `brokerDeduplicationEntriesInterval`. |`120` +`brokerDeduplicationProducerInactivityTimeoutMinutes` | The time of inactivity (in minutes) after which the broker discards deduplication information related to a disconnected producer. | `360` (6 hours) + +### Set default value at the broker-level + +By default, message deduplication is *disabled* on all Pulsar namespaces/topics. To enable it on all namespaces/topics, set the `brokerDeduplicationEnabled` parameter to `true` and re-start the broker. + +Even if you set the value for `brokerDeduplicationEnabled`, enabling or disabling via Pulsar admin CLI overrides the default settings at the broker-level. + +### Enable message deduplication + +Though message deduplication is disabled by default at the broker level, you can enable message deduplication for a specific namespace or topic using the [`pulsar-admin namespaces set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) or the [`pulsar-admin topics set-deduplication`](reference-pulsar-admin.md#topic-set-deduplication) command. You can use the `--enable`/`-e` flag and specify the namespace/topic. + +The following example shows how to enable message deduplication at the namespace level. 
+ +```bash + +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --enable # or just -e + +``` + +### Disable message deduplication + +Even if you enable message deduplication at the broker level, you can disable message deduplication for a specific namespace or topic using the [`pulsar-admin namespaces set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) or the [`pulsar-admin topics set-deduplication`](reference-pulsar-admin.md#topic-set-deduplication) command. Use the `--disable`/`-d` flag and specify the namespace/topic. + +The following example shows how to disable message deduplication at the namespace level. + +```bash + +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --disable # or just -d + +``` + +## Pulsar clients + +If you enable message deduplication in Pulsar brokers, you need to complete the following tasks for your client producers: + +1. Specify a name for the producer. +1. Set the message timeout to `0` (namely, no timeout). + +The instructions for Java, Python, and C++ clients are different. + +````mdx-code-block + + + +To enable message deduplication on a [Java producer](client-libraries-java.md#producers), set the producer name using the `producerName` setter, and set the timeout to `0` using the `sendTimeout` setter. + +```java + +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; +import java.util.concurrent.TimeUnit; + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +Producer producer = pulsarClient.newProducer() + .producerName("producer-1") + .topic("persistent://public/default/topic-1") + .sendTimeout(0, TimeUnit.SECONDS) + .create(); + +``` + + + + +To enable message deduplication on a [Python producer](client-libraries-python.md#producers), set the producer name using `producer_name`, and set the timeout to `0` using `send_timeout_millis`. 
+ +```python + +import pulsar + +client = pulsar.Client("pulsar://localhost:6650") +producer = client.create_producer( + "persistent://public/default/topic-1", + producer_name="producer-1", + send_timeout_millis=0) + +``` + + + + +To enable message deduplication on a [C++ producer](client-libraries-cpp.md#producer), set the producer name using `setProducerName`, and set the timeout to `0` using `setSendTimeout`. + +```cpp + +#include + +std::string serviceUrl = "pulsar://localhost:6650"; +std::string topic = "persistent://some-tenant/ns1/topic-1"; +std::string producerName = "producer-1"; + +Client client(serviceUrl); + +ProducerConfiguration producerConfig; +producerConfig.setSendTimeout(0); +producerConfig.setProducerName(producerName); + +Producer producer; + +Result result = client.createProducer(topic, producerConfig, producer); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-encryption.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-encryption.md new file mode 100644 index 0000000000000..f0d8fb8735eb6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-encryption.md @@ -0,0 +1,184 @@ +--- +id: cookbooks-encryption +title: Pulsar Encryption +sidebar_label: "Encryption" +original_id: cookbooks-encryption +--- + +Pulsar encryption allows applications to encrypt messages at the producer and decrypt at the consumer. Encryption is performed using the public/private key pair configured by the application. Encrypted messages can only be decrypted by consumers with a valid key. + +## Asymmetric and symmetric encryption + +Pulsar uses a dynamically generated symmetric AES key to encrypt messages (data). The AES key (data key) is encrypted using an application-provided ECDSA/RSA key pair; as a result, there is no need to share the secret with everyone. + +Key is a public/private key pair used for encryption/decryption. 
The producer key is the public key, and the consumer key is the private key of the key pair. + +The application configures the producer with the public key. This key is used to encrypt the AES data key. The encrypted data key is sent as part of the message header. Only entities with the private key (in this case the consumer) will be able to decrypt the data key which is used to decrypt the message. + +A message can be encrypted with more than one key. Any one of the keys used for encrypting the message is sufficient to decrypt the message. + +Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose/delete the private key, your message is irretrievably lost, and is unrecoverable. + +## Producer +![alt text](/assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer") + +## Consumer +![alt text](/assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer") + +## Here are the steps to get started: + +1. Create your ECDSA or RSA public/private key pair. + +```shell + +openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem +openssl ec -in test_ecdsa_privkey.pem -pubout -outform pkcs8 -out test_ecdsa_pubkey.pem + +``` + +2. Add the public and private key to the key management and configure your producers to retrieve public keys and consumer clients to retrieve private keys. +3. Implement CryptoKeyReader::getPublicKey() interface from producer and CryptoKeyReader::getPrivateKey() interface from consumer, which will be invoked by Pulsar client to load the key. +4. Add encryption key to producer configuration: conf.addEncryptionKey("myapp.key") +5. Add CryptoKeyReader implementation to producer/consumer config: conf.setCryptoKeyReader(keyReader) +6. 
Sample producer application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); + +ProducerConfiguration prodConf = new ProducerConfiguration(); +prodConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +prodConf.addEncryptionKey("myappkey"); + +Producer producer = pulsarClient.createProducer("persistent://my-tenant/my-ns/my-topic", prodConf); + +for (int i = 0; i < 10; i++) { + producer.send("my-message".getBytes()); +} + +pulsarClient.close(); + +``` + +7. 
Sample Consumer Application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +ConsumerConfiguration consConf = new ConsumerConfiguration(); +consConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); +Consumer consumer = pulsarClient.subscribe("persistent://my-tenant/my-ns/my-topic", "my-subscriber-name", consConf); +Message msg = null; + +for (int i = 0; i < 10; i++) { + msg = consumer.receive(); + // do something + System.out.println("Received: " + new String(msg.getData())); +} + +// Acknowledge the consumption of all messages at once +consumer.acknowledgeCumulative(msg); +pulsarClient.close(); + +``` + +## Key rotation +Pulsar generates a new AES data key every 4 hours or after a certain number of messages are published. The asymmetric public key is automatically fetched by the producer every 4 hours by calling CryptoKeyReader::getPublicKey() to retrieve the latest version.
+ +## Enabling encryption at the producer application: +If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. This can be done in two ways: +1. The consumer application provides you access to their public key, which you add to your producer keys +1. You grant access to one of the private keys from the pairs used by the producer + +In some cases, the producer may want to encrypt the messages with multiple keys. For this, add all such keys to the config. The consumer will be able to decrypt the message, as long as it has access to at least one of the keys. + +For example, if messages need to be encrypted using 2 keys, myapp.messagekey1 and myapp.messagekey2: + +```java + +conf.addEncryptionKey("myapp.messagekey1"); +conf.addEncryptionKey("myapp.messagekey2"); + +``` + +## Decrypting encrypted messages at the consumer application: +Consumers require access to one of the private keys to decrypt messages produced by the producer. If you would like to receive encrypted messages, create a public/private key pair and give your public key to the producer application to encrypt messages using your public key. + +## Handling Failures: +* Producer/ Consumer loses access to the key + * Producer action will fail indicating the cause of the failure. The application has the option to proceed with sending unencrypted messages in such cases. Call conf.setCryptoFailureAction(ProducerCryptoFailureAction) to control the producer behavior. The default behavior is to fail the request. + * If consumption fails due to a decryption failure or missing keys in the consumer, the application has the option to consume the encrypted message or discard it. Call conf.setCryptoFailureAction(ConsumerCryptoFailureAction) to control the consumer behavior. The default behavior is to fail the request. +The application will never be able to decrypt the messages if the private key is permanently lost.
+* Batch messaging + * If decryption fails and the message contains batch messages, the client will not be able to retrieve individual messages in the batch, hence message consumption fails even if conf.setCryptoFailureAction() is set to CONSUME. +* If decryption fails, the message consumption stops and the application will notice backlog growth in addition to decryption failure messages in the client log. If the application does not have access to the private key to decrypt the message, the only option is to skip/discard backlogged messages. + diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-message-queue.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-message-queue.md new file mode 100644 index 0000000000000..eb43cbde5fb81 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-message-queue.md @@ -0,0 +1,127 @@ +--- +id: cookbooks-message-queue +title: Using Pulsar as a message queue +sidebar_label: "Message queue" +original_id: cookbooks-message-queue +--- + +Message queues are essential components of many large-scale data architectures. If every single work object that passes through your system absolutely *must* be processed in spite of the slowness or downright failure of this or that system component, there's a good chance that you'll need a message queue to step in and ensure that unprocessed data is retained---with correct ordering---until the required actions are taken. + +Pulsar is a great choice for a message queue because: + +* it was built with [persistent message storage](concepts-architecture-overview.md#persistent-storage) in mind +* it offers automatic load balancing across [consumers](reference-terminology.md#consumer) for messages on a topic (or custom load balancing if you wish) + +> You can use the same Pulsar installation to act as a real-time message bus and as a message queue if you wish (or just one or the other).
You can set aside some topics for real-time purposes and other topics for message queue purposes (or use specific namespaces for either purpose if you wish). + + +# Client configuration changes + +To use a Pulsar [topic](reference-terminology.md#topic) as a message queue, you should distribute the receiver load on that topic across several consumers (the optimal number of consumers will depend on the load). Each consumer must: + +* Establish a [shared subscription](concepts-messaging.md#shared) and use the same subscription name as the other consumers (otherwise the subscription is not shared and the consumers can't act as a processing ensemble) +* If you'd like to have tight control over message dispatching across consumers, set the consumers' **receiver queue** size very low (potentially even to 0 if necessary). Each Pulsar [consumer](reference-terminology.md#consumer) has a receiver queue that determines how many messages the consumer will attempt to fetch at a time. A receiver queue of 1000 (the default), for example, means that the consumer will attempt to process 1000 messages from the topic's backlog upon connection. Setting the receiver queue to zero essentially means ensuring that each consumer is only doing one thing at a time. + + The downside to restricting the receiver queue size of consumers is that it limits the potential throughput of those consumers and cannot be used with [partitioned topics](reference-terminology.md#partitioned-topic). Whether the performance/control trade-off is worthwhile will depend on your use case.
+ +## Java clients + +Here's an example Java consumer configuration that uses a shared subscription: + +```java + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; +import org.apache.pulsar.client.api.SubscriptionType; + +String SERVICE_URL = "pulsar://localhost:6650"; +String TOPIC = "persistent://public/default/mq-topic-1"; +String subscription = "sub-1"; + +PulsarClient client = PulsarClient.builder() + .serviceUrl(SERVICE_URL) + .build(); + +Consumer consumer = client.newConsumer() + .topic(TOPIC) + .subscriptionName(subscription) + .subscriptionType(SubscriptionType.Shared) + // If you'd like to restrict the receiver queue size + .receiverQueueSize(10) + .subscribe(); + +``` + +## Python clients + +Here's an example Python consumer configuration that uses a shared subscription: + +```python + +from pulsar import Client, ConsumerType + +SERVICE_URL = "pulsar://localhost:6650" +TOPIC = "persistent://public/default/mq-topic-1" +SUBSCRIPTION = "sub-1" + +client = Client(SERVICE_URL) +consumer = client.subscribe( + TOPIC, + SUBSCRIPTION, + # If you'd like to restrict the receiver queue size + receiver_queue_size=10, + consumer_type=ConsumerType.Shared) + +``` + +## C++ clients + +Here's an example C++ consumer configuration that uses a shared subscription: + +```cpp + +#include + +std::string serviceUrl = "pulsar://localhost:6650"; +std::string topic = "persistent://public/default/mq-topic-1"; +std::string subscription = "sub-1"; + +Client client(serviceUrl); + +ConsumerConfiguration consumerConfig; +consumerConfig.setConsumerType(ConsumerShared); +// If you'd like to restrict the receiver queue size +consumerConfig.setReceiverQueueSize(10); + +Consumer consumer; + +Result result = client.subscribe(topic, subscription, consumerConfig, consumer); + +``` + +## Go clients + +Here is an example of a Go consumer configuration that uses a shared subscription: + +```go + +import
"github.com/apache/pulsar-client-go/pulsar" + +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "persistent://public/default/mq-topic-1", + SubscriptionName: "sub-1", + Type: pulsar.Shared, + ReceiverQueueSize: 10, // If you'd like to restrict the receiver queue size +}) +if err != nil { + log.Fatal(err) +} + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-non-persistent.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-non-persistent.md new file mode 100644 index 0000000000000..178301e86eb8d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-non-persistent.md @@ -0,0 +1,63 @@ +--- +id: cookbooks-non-persistent +title: Non-persistent messaging +sidebar_label: "Non-persistent messaging" +original_id: cookbooks-non-persistent +--- + +**Non-persistent topics** are Pulsar topics in which message data is *never* [persistently stored](concepts-architecture-overview.md#persistent-storage) and kept only in memory. This cookbook provides: + +* A basic [conceptual overview](#overview) of non-persistent topics +* Information about [configurable parameters](#configuration) related to non-persistent topics +* A guide to the [CLI interface](#cli) for managing non-persistent topics + +## Overview + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. 
When using non-persistent delivery, killing a Pulsar [broker](reference-terminology.md#broker) or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http + +non-persistent://tenant/namespace/topic + +``` + +> For more high-level information about non-persistent topics, see the [Concepts and Architecture](concepts-messaging.md#non-persistent-topics) documentation. + +## Using + +> In order to use non-persistent topics, they must be [enabled](#enabling) in your Pulsar broker configuration. + +In order to use non-persistent topics, you only need to differentiate them by name when interacting with them. This [`pulsar-client produce`](reference-cli-tools.md#pulsar-client-produce) command, for example, would produce one message on a non-persistent topic in a standalone cluster: + +```bash + +$ bin/pulsar-client produce non-persistent://public/default/example-np-topic \ + --num-produce 1 \ + --messages "This message will be stored only in memory" + +``` + +> For a more thorough guide to non-persistent topics from an administrative perspective, see the [Non-persistent topics](admin-api-topics.md) guide. + +## Enabling + +In order to enable non-persistent topics in a Pulsar broker, the [`enableNonPersistentTopics`](reference-configuration.md#broker-enableNonPersistentTopics) must be set to `true`. This is the default, and so you won't need to take any action to enable non-persistent messaging. + + +> #### Configuration for standalone mode +> If you're running Pulsar in standalone mode, the same configurable parameters are available but in the [`standalone.conf`](reference-configuration.md#standalone) configuration file. 
+ +If you'd like to enable *only* non-persistent topics in a broker, you can set the [`enablePersistentTopics`](reference-configuration.md#broker-enablePersistentTopics) parameter to `false` and the `enableNonPersistentTopics` parameter to `true`. + +## Managing with cli + +Non-persistent topics can be managed using the [`pulsar-admin non-persistent`](reference-pulsar-admin.md#non-persistent) command-line interface. With that interface you can perform actions like [create a partitioned non-persistent topic](reference-pulsar-admin.md#non-persistent-create-partitioned-topic), get [stats](reference-pulsar-admin.md#non-persistent-stats) for a non-persistent topic, [list](reference-pulsar-admin.md) non-persistent topics under a namespace, and more. + +## Using with Pulsar clients + +You shouldn't need to make any changes to your Pulsar clients to use non-persistent messaging beyond making sure that you use proper [topic names](#using) with `non-persistent` as the topic type. + diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-partitioned.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-partitioned.md new file mode 100644 index 0000000000000..fb9ac354cc6d6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-partitioned.md @@ -0,0 +1,7 @@ +--- +id: cookbooks-partitioned +title: Partitioned topics +sidebar_label: "Partitioned Topics" +original_id: cookbooks-partitioned +--- +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-retention-expiry.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-retention-expiry.md new file mode 100644 index 0000000000000..2f0993256a8ae --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-retention-expiry.md @@ -0,0 +1,422 @@ +--- +id: cookbooks-retention-expiry +title: Message retention and expiry +sidebar_label: "Message retention and expiry" +original_id: cookbooks-retention-expiry +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar brokers are responsible for handling messages that pass through Pulsar, including [persistent storage](concepts-architecture-overview.md#persistent-storage) of messages. By default, for each topic, brokers only retain messages that are in at least one backlog. A backlog is the set of unacknowledged messages for a particular subscription. As a topic can have multiple subscriptions, a topic can have multiple backlogs. + +As a consequence, no messages are retained (by default) on a topic that has not had any subscriptions created for it. + +(Note that messages that are no longer being stored are not necessarily immediately deleted, and may in fact still be accessible until the next ledger rollover. Because clients cannot predict when rollovers may happen, it is not wise to rely on a rollover not happening at an inconvenient point in time.) + +In Pulsar, you can modify this behavior, with namespace granularity, in two ways: + +* You can persistently store messages that are not within a backlog (because they've been acknowledged on every existing subscription, or because there are no subscriptions) by setting [retention policies](#retention-policies). +* Messages that are not acknowledged within a specified timeframe can be automatically acknowledged, by specifying the [time to live](#time-to-live-ttl) (TTL).
+ +Pulsar's [admin interface](admin-api-overview.md) enables you to manage both retention policies and TTL with namespace granularity (and thus within a specific tenant and either on a specific cluster or in the [`global`](concepts-architecture-overview.md#global-cluster) cluster). + + +> #### Retention and TTL solve two different problems +> * Message retention: Keep the data for at least X hours (even if acknowledged) +> * Time-to-live: Discard data after some time (by automatically acknowledging) +> +> Most applications will want to use at most one of these. + + +## Retention policies + +By default, when a Pulsar message arrives at a broker, the message is stored until it has been acknowledged on all subscriptions, at which point it is marked for deletion. You can override this behavior and retain messages that have already been acknowledged on all subscriptions by setting a *retention policy* for all topics in a given namespace. Retention is based on both a *size limit* and a *time limit*. + +Retention policies are useful when you use the Reader interface. The Reader interface does not use acknowledgements, and messages do not exist within backlogs. It is required to configure retention for Reader-only use cases. + +When you set a retention policy on topics in a namespace, you must set **both** a *size limit* and a *time limit*. You can refer to the following table to set retention policies in `pulsar-admin` and Java. + +|Time limit|Size limit| Message retention | +|----------|----------|------------------------| +| -1 | -1 | Infinite retention | +| -1 | >0 | Based on the size limit | +| >0 | -1 | Based on the time limit | +| 0 | 0 | Disable message retention (by default) | +| 0 | >0 | Invalid | +| >0 | 0 | Invalid | +| >0 | >0 | Acknowledged messages or messages with no active subscription will not be retained when either time or size reaches the limit. 
| + +The retention settings apply to all messages on topics that do not have any subscriptions, or to messages that have been acknowledged by all subscriptions. The retention policy settings do not affect unacknowledged messages on topics with subscriptions. The unacknowledged messages are controlled by the backlog quota. + +When a retention limit on a topic is exceeded, the oldest message is marked for deletion until the set of retained messages falls within the specified limits again. + +### Defaults + +You can set message retention at instance level with the following two parameters: `defaultRetentionTimeInMinutes` and `defaultRetentionSizeInMB`. Both parameters are set to `0` by default. + +For more information of the two parameters, refer to the [`broker.conf`](reference-configuration.md#broker) configuration file. + +### Set retention policy + +You can set a retention policy for a namespace by specifying the namespace, a size limit and a time limit in `pulsar-admin`, REST API and Java. + +````mdx-code-block + + + +You can use the [`set-retention`](reference-pulsar-admin.md#namespaces-set-retention) subcommand and specify a namespace, a size limit using the `-s`/`--size` flag, and a time limit using the `-t`/`--time` flag. + +In the following example, the size limit is set to 10 GB and the time limit is set to 3 hours for each topic within the `my-tenant/my-ns` namespace. +- When the size of messages reaches 10 GB on a topic within 3 hours, the acknowledged messages will not be retained. +- After 3 hours, even if the message size is less than 10 GB, the acknowledged messages will not be retained. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 10G \ + --time 3h + +``` + +In the following example, the time is not limited and the size limit is set to 1 TB. The size limit determines the retention. 
+ +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 1T \ + --time -1 + +``` + +In the following example, the size is not limited and the time limit is set to 3 hours. The time limit determines the retention. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size -1 \ + --time 3h + +``` + +To achieve infinite retention, set both values to `-1`. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size -1 \ + --time -1 + +``` + +To disable the retention policy, set both values to `0`. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 0 \ + --time 0 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/retention|operation/setRetention?version=@pulsar:version_number@} + +:::note + +To disable the retention policy, you need to set both the size and time limit to `0`. Set either size or time limit to `0` is invalid. + +::: + + + + +```java + +int retentionTime = 10; // 10 minutes +int retentionSize = 500; // 500 megabytes +RetentionPolicies policies = new RetentionPolicies(retentionTime, retentionSize); +admin.namespaces().setRetention(namespace, policies); + +``` + + + + +```` + +### Get retention policy + +You can fetch the retention policy for a namespace by specifying the namespace. The output will be a JSON object with two keys: `retentionTimeInMinutes` and `retentionSizeInMB`. + +#### pulsar-admin + +Use the [`get-retention`](reference-pulsar-admin.md#namespaces) subcommand and specify the namespace. 
+ +##### Example + +```shell + +$ pulsar-admin namespaces get-retention my-tenant/my-ns +{ + "retentionTimeInMinutes": 10, + "retentionSizeInMB": 500 +} + +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/retention|operation/getRetention?version=@pulsar:version_number@} + +#### Java + +```java + +admin.namespaces().getRetention(namespace); + +``` + +## Backlog quotas + +*Backlogs* are sets of unacknowledged messages for a topic that have been stored by bookies. Pulsar stores all unacknowledged messages in backlogs until they are processed and acknowledged. + +You can control the allowable size of backlogs, at the namespace level, using *backlog quotas*. Setting a backlog quota involves setting: + +TODO: Expand on is this per backlog or per topic? + +* an allowable *size threshold* for each topic in the namespace +* a *retention policy* that determines which action the [broker](reference-terminology.md#broker) takes if the threshold is exceeded. + +The following retention policies are available: + +Policy | Action +:------|:------ +`producer_request_hold` | The broker will hold and not persist produce request payload +`producer_exception` | The broker will disconnect from the client by throwing an exception +`consumer_backlog_eviction` | The broker will begin discarding backlog messages + + +> #### Beware the distinction between retention policy types +> As you may have noticed, there are two definitions of the term "retention policy" in Pulsar, one that applies to persistent storage of messages not in backlogs, and one that applies to messages within backlogs. + + +Backlog quotas are handled at the namespace level. 
They can be managed via: + +### Set size/time thresholds and backlog retention policies + +You can set a size and/or time threshold and backlog retention policy for all of the topics in a [namespace](reference-terminology.md#namespace) by specifying the namespace, a size limit and/or a time limit in second, and a policy by name. + +#### pulsar-admin + +Use the [`set-backlog-quota`](reference-pulsar-admin.md#namespaces) subcommand and specify a namespace, a size limit using the `-l`/`--limit` flag and the `-lt`/`--limitTime` flag (for 2.8.1 and later versions only) to limit backlog, a retention policy using the `-p`/`--policy` flag, and a policy type using `-t`/`--type` (for 2.8.1 and later versions only, defaults to `destination_storage`). + +##### Example + +```shell + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ + --limit 2G \ + --policy producer_request_hold + +``` + +```shell + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns/my-topic \ + --limitTime 3600 \ + --policy producer_request_hold \ + --type message_age + +``` + + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + +#### Java + +```java + +long sizeLimit = 2147483648L; +BacklogQuota.RetentionPolicy policy = BacklogQuota.RetentionPolicy.producer_request_hold; +BacklogQuota quota = new BacklogQuota(sizeLimit, policy); +admin.namespaces().setBacklogQuota(namespace, quota); + +``` + +### Get backlog threshold and backlog retention policy + +You can see which size threshold and backlog retention policy has been applied to a namespace. + +#### pulsar-admin + +Use the [`get-backlog-quotas`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-backlog-quotas) subcommand and specify a namespace. 
Here's an example: + +```shell + +$ pulsar-admin namespaces get-backlog-quotas my-tenant/my-ns +{ + "destination_storage": { + "limit" : 2147483648, + "policy" : "producer_request_hold" + } +} + +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + +#### Java + +```java + +Map quotas = + admin.namespaces().getBacklogQuotas(namespace); + +``` + +### Remove backlog quotas + +#### pulsar-admin + +Use the [`remove-backlog-quota`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-backlog-quota) subcommand and specify a namespace. Here's an example: + +```shell + +$ pulsar-admin namespaces remove-backlog-quota my-tenant/my-ns + +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/removeBacklogQuota?version=@pulsar:version_number@} + +#### Java + +```java + +admin.namespaces().removeBacklogQuota(namespace); + +``` + +### Clear backlog + +#### pulsar-admin + +Use the [`clear-backlog`](reference-pulsar-admin.md#pulsar-admin-namespaces-clear-backlog) subcommand. + +##### Example + +```shell + +$ pulsar-admin namespaces clear-backlog my-tenant/my-ns + +``` + +By default, you will be prompted to ensure that you really want to clear the backlog for the namespace. You can override the prompt using the `-f`/`--force` flag. + +## Time to live (TTL) + +By default, Pulsar stores all unacknowledged messages forever. This can lead to heavy disk space usage in cases where a lot of messages are going unacknowledged. If disk space is a concern, you can set a time to live (TTL) that determines how long unacknowledged messages will be retained. + +### Set the TTL for a namespace + +#### pulsar-admin + +Use the [`set-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-set-message-ttl) subcommand and specify a namespace and a TTL (in seconds) using the `-ttl`/`--messageTTL` flag. 
+ +##### Example + +```shell + +$ pulsar-admin namespaces set-message-ttl my-tenant/my-ns \ + --messageTTL 120 # TTL of 2 minutes + +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/setNamespaceMessageTTL?version=@pulsar:version_number@} + +#### Java + +```java + +admin.namespaces().setNamespaceMessageTTL(namespace, ttlInSeconds); + +``` + +### Get the TTL configuration for a namespace + +#### pulsar-admin + +Use the [`get-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-message-ttl) subcommand and specify a namespace. + +##### Example + +```shell + +$ pulsar-admin namespaces get-message-ttl my-tenant/my-ns +60 + +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/getNamespaceMessageTTL?version=@pulsar:version_number@} + +#### Java + +```java + +admin.namespaces().getNamespaceMessageTTL(namespace) + +``` + +### Remove the TTL configuration for a namespace + +#### pulsar-admin + +Use the [`remove-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-message-ttl) subcommand and specify a namespace. + +##### Example + +```shell + +$ pulsar-admin namespaces remove-message-ttl my-tenant/my-ns + +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/removeNamespaceMessageTTL?version=@pulsar:version_number@} + +#### Java + +```java + +admin.namespaces().removeNamespaceMessageTTL(namespace) + +``` + +## Delete messages from namespaces + +If you do not have any retention period and that you never have much of a backlog, the upper limit for retaining messages, which are acknowledged, equals to the Pulsar segment rollover period + entry log rollover period + (garbage collection interval * garbage collection ratios). + +- **Segment rollover period**: basically, the segment rollover period is how often a new segment is created. 
Once a new segment is created, the old segment will be deleted. By default, this happens either when you have written 50,000 entries (messages) or have waited 240 minutes. You can tune this in your broker. + +- **Entry log rollover period**: multiple ledgers in BookKeeper are interleaved into an [entry log](https://bookkeeper.apache.org/docs/4.11.1/getting-started/concepts/#entry-logs). For the data of a deleted ledger to be removed, every entry log that contains it must be rolled over. +The entry log rollover period is configurable, but is purely based on the entry log size. For details, see [here](https://bookkeeper.apache.org/docs/4.11.1/reference/config/#entry-log-settings). Once the entry log is rolled over, the entry log can be garbage collected. + +- **Garbage collection interval**: because entry logs have interleaved ledgers, to free up space, the entry logs need to be rewritten. The garbage collection interval is how often BookKeeper performs garbage collection, which is related to minor compaction and major compaction of entry logs. For details, see [here](https://bookkeeper.apache.org/docs/4.11.1/reference/config/#entry-log-compaction-settings). diff --git a/site2/website/versioned_docs/version-2.8.x/cookbooks-tiered-storage.md b/site2/website/versioned_docs/version-2.8.x/cookbooks-tiered-storage.md new file mode 100644 index 0000000000000..4c86166c7b1ce --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/cookbooks-tiered-storage.md @@ -0,0 +1,342 @@ +--- +id: cookbooks-tiered-storage +title: Tiered Storage +sidebar_label: "Tiered Storage" +original_id: cookbooks-tiered-storage +--- + +Pulsar's **Tiered Storage** feature allows older backlog data to be offloaded to long term storage, thereby freeing up space in BookKeeper and reducing storage costs. This cookbook walks you through using tiered storage in your Pulsar cluster.
+ +* Tiered storage uses [Apache jclouds](https://jclouds.apache.org) to support [Amazon S3](https://aws.amazon.com/s3/) and [Google Cloud Storage](https://cloud.google.com/storage/)(GCS for short) for long term storage. With Jclouds, it is easy to add support for more [cloud storage providers](https://jclouds.apache.org/reference/providers/#blobstore-providers) in the future. + +* Tiered storage uses [Apache Hadoop](http://hadoop.apache.org/) to support filesystem for long term storage. With Hadoop, it is easy to add support for more filesystems in the future. + +## When should I use Tiered Storage? + +Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm you can rerun it against your full user history. + +## The offloading mechanism + +A topic in Pulsar is backed by a log, known as a managed ledger. This log is composed of an ordered list of segments. Pulsar only ever writes to the final segment of the log. All previous segments are sealed. The data within the segment is immutable. This is known as a segment oriented architecture. + +![Tiered storage](/assets/pulsar-tiered-storage.png "Tiered Storage") + +The Tiered Storage offloading mechanism takes advantage of this segment oriented architecture. When offloading is requested, the segments of the log are copied, one-by-one, to tiered storage. All segments of the log, apart from the segment currently being written to, can be offloaded. + +On the broker, the administrator must configure the bucket and credentials for the cloud storage service. +The configured bucket must exist before attempting to offload. If it does not exist, the offload operation will fail. + +Pulsar uses multi-part objects to upload the segment data.
It is possible that a broker could crash while uploading the data. +We recommend you add a life cycle rule to your bucket to expire incomplete multi-part upload after a day or two to avoid +getting charged for incomplete uploads. + +When ledgers are offloaded to long term storage, you can still query data in the offloaded ledgers with Pulsar SQL. + +## Configuring the offload driver + +Offloading is configured in ```broker.conf```. + +At a minimum, the administrator must configure the driver, the bucket and the authenticating credentials. +There are also some other knobs to configure, like the bucket region, the max block size in backed storage, etc. + +Currently we support driver of types: + +- `aws-s3`: [Simple Cloud Storage Service](https://aws.amazon.com/s3/) +- `google-cloud-storage`: [Google Cloud Storage](https://cloud.google.com/storage/) +- `filesystem`: [Filesystem Storage](http://hadoop.apache.org/) + +> Driver names are case-insensitive. There is a third driver type, `s3`, which is identical to `aws-s3`, +> though it requires that you specify an endpoint url using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if +> using a S3 compatible data store, other than AWS. + +```conf + +managedLedgerOffloadDriver=aws-s3 + +``` + +### "aws-s3" Driver configuration + +#### Bucket and Region + +Buckets are the basic containers that hold your data. +Everything that you store in Cloud Storage must be contained in a bucket. +You can use buckets to organize your data and control access to your data, +but unlike directories and folders, you cannot nest buckets. + +```conf + +s3ManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +Bucket Region is the region where the bucket is located. Bucket Region is not a required +but a recommended configuration. If it is not configured, it will use the default region. + +With AWS S3, the default region is `US East (N. Virginia)`.
Page [AWS Regions and Endpoints](https://docs.aws.amazon.com/general/latest/gr/rande.html) contains more information. + +```conf + +s3ManagedLedgerOffloadRegion=eu-west-3 + +``` + +#### Authentication with AWS + +To be able to access AWS S3, you need to authenticate with AWS S3. +Pulsar does not provide any direct means of configuring authentication for AWS S3, +but relies on the mechanisms supported by the [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). + +Once you have created a set of credentials in the AWS IAM console, they can be configured in a number of ways. + +1. Using ec2 instance metadata credentials + +If you are on AWS instance with an instance profile that provides credentials, Pulsar will use these credentials +if no other mechanism is provided + +2. Set the environment variables **AWS_ACCESS_KEY_ID** and **AWS_SECRET_ACCESS_KEY** in ```conf/pulsar_env.sh```. + +```bash + +export AWS_ACCESS_KEY_ID=ABC123456789 +export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +> \"export\" is important so that the variables are made available in the environment of spawned processes. + + +3. Add the Java system properties *aws.accessKeyId* and *aws.secretKey* to **PULSAR_EXTRA_OPTS** in `conf/pulsar_env.sh`. + +```bash + +PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacityPerThread=4096" + +``` + +4. Set the access credentials in ```~/.aws/credentials```. + +```conf + +[default] +aws_access_key_id=ABC123456789 +aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +5. 
Assuming an IAM role + +If you want to assume an IAM role, this can be done via specifying the following: + +```conf + +s3ManagedLedgerOffloadRole= +s3ManagedLedgerOffloadRoleSessionName=pulsar-s3-offload + +``` + +This will use the `DefaultAWSCredentialsProviderChain` for assuming this role. + +> The broker must be rebooted for credentials specified in pulsar_env to take effect. + +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to AWS S3. + +- ```s3ManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of + a "part" sent during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```s3ManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for + each individual read when reading back data from AWS S3. Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. + +### "google-cloud-storage" Driver configuration + +Buckets are the basic containers that hold your data. Everything that you store in +Cloud Storage must be contained in a bucket. You can use buckets to organize your data and +control access to your data, but unlike directories and folders, you cannot nest buckets. + +```conf + +gcsManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +Bucket Region is the region where bucket located. Bucket Region is not a required but +a recommended configuration. If it is not configured, It will use the default region. + +Regarding GCS, buckets are default created in the `us multi-regional location`, +page [Bucket Locations](https://cloud.google.com/storage/docs/bucket-locations) contains more information. + +```conf + +gcsManagedLedgerOffloadRegion=europe-west3 + +``` + +#### Authentication with GCS + +The administrator needs to configure `gcsManagedLedgerOffloadServiceAccountKeyFile` in `broker.conf` +for the broker to be able to access the GCS service. 
`gcsManagedLedgerOffloadServiceAccountKeyFile` is +a Json file, containing the GCS credentials of a service account. +[Service Accounts section of this page](https://support.google.com/googleapi/answer/6158849) contains +more information of how to create this key file for authentication. More information about google cloud IAM +is available [here](https://cloud.google.com/storage/docs/access-control/iam). + +To generate service account credentials or view the public credentials that you've already generated, follow the following steps: + +1. Open the [Service accounts page](https://console.developers.google.com/iam-admin/serviceaccounts). +2. Select a project or create a new one. +3. Click **Create service account**. +4. In the **Create service account** window, type a name for the service account, and select **Furnish a new private key**. If you want to [grant G Suite domain-wide authority](https://developers.google.com/identity/protocols/OAuth2ServiceAccount#delegatingauthority) to the service account, also select **Enable G Suite Domain-wide Delegation**. +5. Click **Create**. + +> Notes: Make ensure that the service account you create has permission to operate GCS, you need to assign **Storage Admin** permission to your service account in [here](https://cloud.google.com/storage/docs/access-control/iam). + +```conf + +gcsManagedLedgerOffloadServiceAccountKeyFile="/Users/hello/Downloads/project-804d5e6a6f33.json" + +``` + +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to GCS. + +- ```gcsManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of a "part" sent + during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```gcsManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for each individual + read when reading back data from GCS. Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. 
+ +### "filesystem" Driver configuration + + +#### Configure connection address + +You can configure the connection address in the `broker.conf` file. + +```conf + +fileSystemURI="hdfs://127.0.0.1:9000" + +``` + +#### Configure Hadoop profile path + +The configuration file is stored in the Hadoop profile path. It contains various settings, such as base path, authentication, and so on. + +```conf + +fileSystemProfilePath="../conf/filesystem_offload_core_site.xml" + +``` + +The model for storing topic data uses `org.apache.hadoop.io.MapFile`. You can use all of the configurations in `org.apache.hadoop.io.MapFile` for Hadoop. + +**Example** + +```conf + + + fs.defaultFS + + + + + hadoop.tmp.dir + pulsar + + + + io.file.buffer.size + 4096 + + + + io.seqfile.compress.blocksize + 1000000 + + + + io.seqfile.compression.type + BLOCK + + + + io.map.index.interval + 128 + + +``` + +For more information about the configurations in `org.apache.hadoop.io.MapFile`, see [Filesystem Storage](http://hadoop.apache.org/). +## Configuring offload to run automatically + +Namespace policies can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that the topic has stored on the pulsar cluster. Once the topic reaches the threshold, an offload operation will be triggered. Setting a negative value to the threshold will disable automatic offloading. Setting the threshold to 0 will cause the broker to offload data as soon as it possibly can. + +```bash + +$ bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +> Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offload will not run until the current segment is full.
+ +## Configuring read priority for offloaded messages + +By default, once messages are offloaded to long term storage, brokers read them from long term storage, but the messages still exist in BookKeeper for a period that depends on the administrator's configuration. For +messages that exist in both BookKeeper and long term storage, if you prefer to read them from BookKeeper, you can use the following commands to change this configuration. + +```bash + +# default value for -orp is tiered-storage-first +$ bin/pulsar-admin namespaces set-offload-policies my-tenant/my-namespace -orp bookkeeper-first +$ bin/pulsar-admin topics set-offload-policies my-tenant/my-namespace/topic1 -orp bookkeeper-first + +``` + +## Triggering offload manually + +Offloading can be manually triggered through a REST endpoint on the Pulsar broker. We provide a CLI which will call this rest endpoint for you. + +When triggering offload, you must specify the maximum size, in bytes, of backlog which will be retained locally on the bookkeeper. The offload mechanism will offload segments from the start of the topic backlog until this condition is met. + +```bash + +$ bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 +Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + +``` + +The command that triggers an offload will not wait until the offload operation has completed. To check the status of the offload, use offload-status. + +```bash + +$ bin/pulsar-admin topics offload-status my-tenant/my-namespace/topic1 +Offload is currently running + +``` + +To wait for offload to complete, add the -w flag. + +```bash + +$ bin/pulsar-admin topics offload-status -w my-tenant/my-namespace/topic1 +Offload was a success + +``` + +If there is an error offloading, the error will be propagated to the offload-status command.
+ +```bash + +$ bin/pulsar-admin topics offload-status persistent://public/default/topic1 +Error in offload +null + +Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/deploy-aws.md b/site2/website/versioned_docs/version-2.8.x/deploy-aws.md new file mode 100644 index 0000000000000..93c389b56e2cf --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/deploy-aws.md @@ -0,0 +1,271 @@ +--- +id: deploy-aws +title: Deploying a Pulsar cluster on AWS using Terraform and Ansible +sidebar_label: "Amazon Web Services" +original_id: deploy-aws +--- + +> For instructions on deploying a single Pulsar cluster manually rather than using Terraform and Ansible, see [Deploying a Pulsar cluster on bare metal](deploy-bare-metal.md). For instructions on manually deploying a multi-cluster Pulsar instance, see [Deploying a Pulsar instance on bare metal](deploy-bare-metal-multi-cluster.md). + +One of the easiest ways to get a Pulsar [cluster](reference-terminology.md#cluster) running on [Amazon Web Services](https://aws.amazon.com/) (AWS) is to use the [Terraform](https://terraform.io) infrastructure provisioning tool and the [Ansible](https://www.ansible.com) server automation tool. Terraform can create the resources necessary for running the Pulsar cluster---[EC2](https://aws.amazon.com/ec2/) instances, networking and security infrastructure, etc.---While Ansible can install and run Pulsar on the provisioned resources. 
+ +## Requirements and setup + +In order to install a Pulsar cluster on AWS using Terraform and Ansible, you need to prepare the following things: + +* An [AWS account](https://aws.amazon.com/account/) and the [`aws`](https://aws.amazon.com/cli/) command-line tool +* Python and [pip](https://pip.pypa.io/en/stable/) +* The [`terraform-inventory`](https://github.com/adammck/terraform-inventory) tool, which enables Ansible to use Terraform artifacts + +You also need to make sure that you are currently logged into your AWS account via the `aws` tool: + +```bash + +$ aws configure + +``` + +## Installation + +You can install Ansible on Linux or macOS using pip. + +```bash + +$ pip install ansible + +``` + +You can install Terraform using the instructions [here](https://learn.hashicorp.com/tutorials/terraform/install-cli). + +You also need to have the Terraform and Ansible configuration for Pulsar locally on your machine. You can find them in the [GitHub repository](https://github.com/apache/pulsar) of Pulsar, which you can fetch using Git commands: + +```bash + +$ git clone https://github.com/apache/pulsar +$ cd pulsar/deployment/terraform-ansible/aws + +``` + +## SSH setup + +> If you already have an SSH key and want to use it, you can skip the step of generating an SSH key and update `private_key_file` setting +> in `ansible.cfg` file and `public_key_path` setting in `terraform.tfvars` file. +> +> For example, if you already have a private SSH key in `~/.ssh/pulsar_aws` and a public key in `~/.ssh/pulsar_aws.pub`, +> follow the steps below: +> +> 1. update `ansible.cfg` with following values: +> + +> ```shell +> +> private_key_file=~/.ssh/pulsar_aws +> +> +> ``` + +> +> 2. update `terraform.tfvars` with following values: +> + +> ```shell +> +> public_key_path=~/.ssh/pulsar_aws.pub +> +> +> ``` + + +In order to create the necessary AWS resources using Terraform, you need to create an SSH key. 
Enter the following commands to create a private SSH key in `~/.ssh/id_rsa` and a public key in `~/.ssh/id_rsa.pub`: + +```bash + +$ ssh-keygen -t rsa + +``` + +Do *not* enter a passphrase (hit **Enter** instead when the prompt comes out). Enter the following command to verify that a key has been created: + +```bash + +$ ls ~/.ssh +id_rsa id_rsa.pub + +``` + +## Create AWS resources using Terraform + +To start building AWS resources with Terraform, you need to install all Terraform dependencies. Enter the following command: + +```bash + +$ terraform init +# This will create a .terraform folder + +``` + +After that, you can apply the default Terraform configuration by entering this command: + +```bash + +$ terraform apply + +``` + +Then you see this prompt below: + +```bash + +Do you want to perform these actions? + Terraform will perform the actions described above. + Only 'yes' will be accepted to approve. + + Enter a value: + +``` + +Type `yes` and hit **Enter**. Applying the configuration could take several minutes. When the configuration applying finishes, you can see `Apply complete!` along with some other information, including the number of resources created. + +### Apply a non-default configuration + +You can apply a non-default Terraform configuration by changing the values in the `terraform.tfvars` file. The following variables are available: + +Variable name | Description | Default +:-------------|:------------|:------- +`public_key_path` | The path of the public key that you have generated. 
| `~/.ssh/id_rsa.pub` +`region` | The AWS region in which the Pulsar cluster runs | `us-west-2` +`availability_zone` | The AWS availability zone in which the Pulsar cluster runs | `us-west-2a` +`aws_ami` | The [Amazon Machine Image](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) (AMI) that the cluster uses | `ami-9fa343e7` +`num_zookeeper_nodes` | The number of [ZooKeeper](https://zookeeper.apache.org) nodes in the ZooKeeper cluster | 3 +`num_bookie_nodes` | The number of bookies that run in the cluster | 3 +`num_broker_nodes` | The number of Pulsar brokers that run in the cluster | 2 +`num_proxy_nodes` | The number of Pulsar proxies that run in the cluster | 1 +`base_cidr_block` | The root [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) that network assets use for the cluster | `10.0.0.0/16` +`instance_types` | The EC2 instance types to be used. This variable is a map with four keys: `zookeeper` for the ZooKeeper instances, `bookie` for the BookKeeper bookies and `broker` and `proxy` for Pulsar brokers and proxies | `t2.small` (ZooKeeper), `i3.xlarge` (BookKeeper) and `c5.2xlarge` (Brokers/Proxies) + +### What is installed + +When you run the Ansible playbook, the following AWS resources are used: + +* 9 total [Elastic Compute Cloud](https://aws.amazon.com/ec2) (EC2) instances running the [ami-9fa343e7](https://access.redhat.com/articles/3135091) Amazon Machine Image (AMI), which runs [Red Hat Enterprise Linux (RHEL) 7.4](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html-single/7.4_release_notes/index).
By default, that includes: + * 3 small VMs for ZooKeeper ([t2.small](https://www.ec2instances.info/?selected=t2.small) instances) + * 3 larger VMs for BookKeeper [bookies](reference-terminology.md#bookie) ([i3.xlarge](https://www.ec2instances.info/?selected=i3.xlarge) instances) + * 2 larger VMs for Pulsar [brokers](reference-terminology.md#broker) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instances) + * 1 larger VM for Pulsar [proxy](reference-terminology.md#proxy) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instance) +* An EC2 [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) +* A [virtual private cloud](https://aws.amazon.com/vpc/) (VPC) for security +* An [API Gateway](https://aws.amazon.com/api-gateway/) for connections from the outside world +* A [route table](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Route_Tables.html) for the Pulsar cluster's VPC +* A [subnet](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Subnets.html) for the VPC + +All EC2 instances for the cluster run in the [us-west-2](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html) region. + +### Fetch your Pulsar connection URL + +When you apply the Terraform configuration by entering the command `terraform apply`, Terraform outputs a value for the `pulsar_service_url`.
The value should look something like this: + +``` + +pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650 + +``` + +You can fetch that value at any time by entering the command `terraform output pulsar_service_url` or parsing the `terraform.tfstate` file (which is JSON, even though the filename does not reflect that): + +```bash + +$ cat terraform.tfstate | jq .modules[0].outputs.pulsar_service_url.value + +``` + +### Destroy your cluster + +At any point, you can destroy all AWS resources associated with your cluster using Terraform's `destroy` command: + +```bash + +$ terraform destroy + +``` + +## Setup Disks + +Before you run the Pulsar playbook, you need to mount the disks to the correct directories on those bookie nodes. Since different types of machines have different disk layouts, you need to update the task defined in `setup-disk.yaml` file after changing the `instance_types` in your Terraform config. + +To setup disks on bookie nodes, enter this command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + setup-disk.yaml + +``` + +After that, the disks are mounted under `/mnt/journal` as journal disk, and `/mnt/storage` as ledger disk. +Remember to enter this command only once. If you attempt to enter this command again after you have run Pulsar playbook, your disks might potentially be erased again, causing the bookies to fail to start up. + +## Run the Pulsar playbook + +Once you have created the necessary AWS resources using Terraform, you can install and run Pulsar on the Terraform-created EC2 instances using Ansible. + +(Optional) If you want to use any [built-in IO connectors](io-connectors.md), edit the `Download Pulsar IO packages` task in the `deploy-pulsar.yaml` file and uncomment the connectors you want to use.
+ +To run the playbook, enter this command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + ../deploy-pulsar.yaml + +``` + +If you have created a private SSH key at a location different from `~/.ssh/id_rsa`, you can specify the different location using the `--private-key` flag in the following command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + --private-key="~/.ssh/some-non-default-key" \ + ../deploy-pulsar.yaml + +``` + +## Access the cluster + +You can now access your running Pulsar using the unique Pulsar connection URL for your cluster, which you can obtain following the instructions [above](#fetch-your-pulsar-connection-url). + +For a quick demonstration of accessing the cluster, we can use the Python client for Pulsar and the Python shell. First, install the Pulsar Python module using pip: + +```bash + +$ pip install pulsar-client + +``` + +Now, open up the Python shell using the `python` command: + +```bash + +$ python + +``` + +Once you are in the shell, enter the following command: + +```python + +>>> import pulsar +>>> client = pulsar.Client('pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650') +# Make sure to use your connection URL +>>> producer = client.create_producer('persistent://public/default/test-topic') +>>> producer.send('Hello world') +>>> client.close() + +``` + +If all of these commands are successful, Pulsar clients can now use your cluster!
diff --git a/site2/website/versioned_docs/version-2.8.x/deploy-bare-metal-multi-cluster.md b/site2/website/versioned_docs/version-2.8.x/deploy-bare-metal-multi-cluster.md new file mode 100644 index 0000000000000..1b23eea07a20b --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/deploy-bare-metal-multi-cluster.md @@ -0,0 +1,486 @@ +--- +id: deploy-bare-metal-multi-cluster +title: Deploying a multi-cluster on bare metal +sidebar_label: "Bare metal multi-cluster" +original_id: deploy-bare-metal-multi-cluster +--- + +:::tip + +1. Single-cluster Pulsar installations should be sufficient for all but the most ambitious use cases. If you are interested in experimenting with +Pulsar or using it in a startup or on a single team, you had better opt for a single cluster. For instructions on deploying a single cluster, +see the guide [here](deploy-bare-metal.md). +2. If you want to use all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you need to download `apache-pulsar-io-connectors` +package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you +run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +3. If you want to use [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download `apache-pulsar-offloaders` +package and install `apache-pulsar-offloaders` under `offloaders` directory in the pulsar directory on every broker node. For more details of how to configure +this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +::: + +A Pulsar *instance* consists of multiple Pulsar clusters working in unison. You can distribute clusters across data centers or geographical regions and replicate the clusters amongst themselves using [geo-replication](administration-geo.md). 
Deploying a multi-cluster Pulsar instance involves the following basic steps: + +* Deploying two separate [ZooKeeper](#deploy-zookeeper) quorums: a [local](#deploy-local-zookeeper) quorum for each cluster in the instance and a [configuration store](#configuration-store) quorum for instance-wide tasks +* Initializing [cluster metadata](#cluster-metadata-initialization) for each cluster +* Deploying a [BookKeeper cluster](#deploy-bookkeeper) of bookies in each Pulsar cluster +* Deploying [brokers](#deploy-brokers) in each Pulsar cluster + +If you want to deploy a single Pulsar cluster, see [Clusters and Brokers](getting-started-standalone.md#start-the-cluster). + +> #### Run Pulsar locally or on Kubernetes? +> This guide shows you how to deploy Pulsar in production in a non-Kubernetes environment. If you want to run a standalone Pulsar cluster on a single machine for development purposes, see the [Setting up a local cluster](getting-started-standalone.md) guide. If you want to run Pulsar on [Kubernetes](https://kubernetes.io), see the [Pulsar on Kubernetes](deploy-kubernetes.md) guide, which includes sections on running Pulsar on Kubernetes on [Google Kubernetes Engine](deploy-kubernetes#pulsar-on-google-kubernetes-engine) and on [Amazon Web Services](deploy-kubernetes#pulsar-on-amazon-web-services). + +## System requirement + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions, JRE/JDK 11 is recommended. + +:::note + +Broker is only supported on 64-bit JVM. 
+ +::: + +## Install Pulsar + +To get started running Pulsar, download a binary tarball release in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar @pulsar:version@ binary release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget 'https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=pulsar/pulsar-@pulsar:version@/apache-pulsar-@pulsar:version@-bin.tar.gz' -O apache-pulsar-@pulsar:version@-bin.tar.gz + + ``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +## What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | [Command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`examples` | A Java JAR file containing example [Pulsar Functions](functions-overview.md) +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`licenses` | License files, in `.txt` form, for various components of the Pulsar codebase + +The following directories are created once you begin running Pulsar: + +Directory | Contains +:---------|:-------- +`data` | The data storage directory that ZooKeeper and BookKeeper use +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md) +`logs` | Logs that the installation creates + + +## Deploy 
ZooKeeper + +Each Pulsar instance relies on two separate ZooKeeper quorums. + +* [Local ZooKeeper](#deploy-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster. +* [Configuration Store](#deploy-the-configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +The configuration store quorum can be provided by an independent cluster of machines or by the same machines used by local ZooKeeper. + + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +You need to stand up one local ZooKeeper cluster *per Pulsar cluster* for deploying a Pulsar instance. + +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +On each host, you need to specify the ID of the node in the `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. 
+ +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this: + +```shell + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com` the command looks like `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start zookeeper + +``` + +### Deploy the configuration store + +The ZooKeeper cluster that is configured and started up in the section above is a *local* ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. + +If you deploy a [single-cluster](#single-cluster-pulsar-instance) instance, you do not need a separate cluster for the configuration store. If, however, you deploy a [multi-cluster](#multi-cluster-pulsar-instance) instance, you should stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but run on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorum uses to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method for [local ZooKeeper](#deploy-local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper).
The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 + +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions and that other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can +share the same hosts used for the local ZooKeeper quorum. + +For example, assume a Pulsar instance with the following clusters `us-west`, +`us-east`, `us-central`, `eu-central`, `ap-south`. Also assume that each cluster has its own local ZK servers named as follows: + +``` + +zk[1-3].${CLUSTER}.example.com + +``` + +In this scenario, you want to pick the quorum participants from a few clusters and +let all the others be ZK observers. For example, to form a 7-server quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This method guarantees that writes to the configuration store are possible even if one of these regions is unreachable.
+ +The ZK configuration in all the servers looks like: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer + +``` + +Additionally, ZK observers need to have the following parameters: + +```properties + +peerType=observer + +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon): + +```shell + +$ bin/pulsar-daemon start configuration-store + +``` + +## Cluster metadata initialization + +Once you set up the cluster-specific ZooKeeper and configuration store quorums for your instance, you need to write some metadata to ZooKeeper for each cluster in your instance. **You only need to write this metadata once**. + +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool.
The following is an example: + +```shell + +$ bin/pulsar initialize-cluster-metadata \ + --cluster us-west \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2184 \ + --web-service-url http://pulsar.us-west.example.com:8080/ \ + --web-service-url-tls https://pulsar.us-west.example.com:8443/ \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/ + +``` + +As you can see from the example above, you need to specify the following: + +* The name of the cluster +* The local ZooKeeper connection string for the cluster +* The configuration store connection string for the entire instance +* The web service URL for the cluster +* A broker service URL enabling interaction with the [brokers](reference-terminology.md#broker) in the cluster + +If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. + +Make sure to run `initialize-cluster-metadata` for each cluster in your instance. + +## Deploy BookKeeper + +BookKeeper provides [persistent message storage](concepts-architecture-overview.md#persistent-storage) for Pulsar. + +Each Pulsar broker needs to have its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster. + +### Configure bookies + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important aspect of configuring each bookie is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for the local ZooKeeper of Pulsar cluster. + +### Start bookies + +You can start a bookie in two ways: in the foreground or as a background daemon. 
+ +To start a bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +You can verify that the bookie works properly using the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell): + +```shell + +$ bin/bookkeeper shell bookiesanity + +``` + +This command creates a new ledger on the local bookie, writes a few entries, reads them back and finally deletes the ledger. + +After you have started all bookies, you can use the `simpletest` command for [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to verify that all bookies in the cluster are running. + +```bash + +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> + +``` + +Bookie hosts are responsible for storing message data on disk. In order for bookies to provide optimal performance, having a suitable hardware configuration is essential for the bookies. The following are key dimensions for bookie hardware capacity. + +* Disk I/O capacity read/write +* Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker. To ensure low write latency, BookKeeper is +designed to use multiple devices: + +* A **journal** to ensure durability. For sequential writes, having fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts is critical. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms. +* A **ledger storage device** is where data is stored until all consumers acknowledge the message. Writes happen in the background, so write I/O is not a big concern.
Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + + + +## Deploy brokers + +Once you set up ZooKeeper, initialize cluster metadata, and spin up BookKeeper bookies, you can deploy brokers. + +### Broker configuration + +You can configure brokers using the [`conf/broker.conf`](reference-configuration.md#broker) configuration file. + +The most important element of broker configuration is ensuring that each broker is aware of its local ZooKeeper quorum as well as the configuration store quorum. Make sure that you set the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) parameter to reflect the local quorum and the [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameter to reflect the configuration store quorum (although you need to specify only those ZooKeeper servers located in the same cluster). + +You also need to specify the name of the [cluster](reference-terminology.md#cluster) to which the broker belongs using the [`clusterName`](reference-configuration.md#broker-clusterName) parameter. In addition, you need to match the broker and web service ports provided when you initialize the metadata (especially when you use a different port from default) of the cluster. + +The following is an example configuration: + +```properties + +# Local ZooKeeper servers +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +# Configuration store quorum connection string. 
+configurationStoreServers=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184 + +clusterName=us-west + +# Broker data port +brokerServicePort=6650 + +# Broker data port for TLS +brokerServicePortTls=6651 + +# Port to use to server HTTP request +webServicePort=8080 + +# Port to use to server HTTPS request +webServicePortTls=8443 + +``` + +### Broker hardware + +Pulsar brokers do not require any special hardware since they do not use the local disk. You had better choose fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) so that the software can take full advantage of that. + +### Start the broker service + +You can start a broker in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start broker + +``` + +You can also start brokers in the foreground by using [`pulsar broker`](reference-cli-tools.md#broker): + +```shell + +$ bin/pulsar broker + +``` + +## Service discovery + +[Clients](getting-started-clients.md) connecting to Pulsar brokers need to be able to communicate with an entire Pulsar instance using a single URL. Pulsar provides a built-in service discovery mechanism that you can set up using the instructions [immediately below](#service-discovery-setup). + +You can also use your own service discovery system if you want. If you use your own system, you only need to satisfy just one requirement: when a client performs an HTTP request to an [endpoint](reference-configuration.md) for a Pulsar cluster, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to *some* active broker in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means. 
+ +> #### Service discovery already provided by many scheduling systems +> Many large-scale deployment systems, such as [Kubernetes](deploy-kubernetes.md), have service discovery systems built in. If you run Pulsar on such a system, you may not need to provide your own service discovery mechanism. + + +### Service discovery setup + +The service discovery mechanism that is included with Pulsar maintains a list of active brokers, which is stored in ZooKeeper, and supports lookup using HTTP and also the [binary protocol](developing-binary-protocol.md) of Pulsar. + +To get started setting up the built-in service discovery of Pulsar, you need to change a few parameters in the [`conf/discovery.conf`](reference-configuration.md#service-discovery) configuration file. Set the [`zookeeperServers`](reference-configuration.md#service-discovery-zookeeperServers) parameter to the ZooKeeper quorum connection string of the cluster and the [`configurationStoreServers`](reference-configuration.md#service-discovery-configurationStoreServers) setting to the [configuration +store](reference-terminology.md#configuration-store) quorum connection string. + +```properties + +# Zookeeper quorum connection string +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +# Global configuration store connection string +configurationStoreServers=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184 + +``` + +To start the discovery service: + +```shell + +$ bin/pulsar-daemon start discovery + +``` + +## Admin client and verification + +At this point your Pulsar instance should be ready to use. You can now configure client machines that can serve as [administrative clients](admin-api-overview.md) for each cluster. You can use the [`conf/client.conf`](reference-configuration.md#client) configuration file to configure admin clients.
+ +The most important thing is that you point the [`serviceUrl`](reference-configuration.md#client-serviceUrl) parameter to the correct service URL for the cluster: + +```properties + +serviceUrl=http://pulsar.us-west.example.com:8080/ + +``` + +## Provision new tenants + +Pulsar is built as a fundamentally multi-tenant system. + + +If a new tenant wants to use the system, you need to create a new one. You can create a new tenant by using the [`pulsar-admin`](reference-pulsar-admin.md#tenants) CLI tool: + +```shell + +$ bin/pulsar-admin tenants create test-tenant \ + --allowed-clusters us-west \ + --admin-roles test-admin-role + +``` + +In this command, users who identify with `test-admin-role` role can administer the configuration for the `test-tenant` tenant. The `test-tenant` tenant can only use the `us-west` cluster. From now on, this tenant can manage its resources. + +Once you create a tenant, you need to create [namespaces](reference-terminology.md#namespace) for topics within that tenant. + + +The first step is to create a namespace. A namespace is an administrative unit that can contain many topics. A common practice is to create a namespace for each different use case from a single tenant. + +```shell + +$ bin/pulsar-admin namespaces create test-tenant/ns1 + +``` + +##### Test producer and consumer + + +Everything is now ready to send and receive messages. The quickest way to test the system is through the [`pulsar-perf`](reference-cli-tools.md#pulsar-perf) client tool. + + +You can use a topic in the namespace that you have just created. Topics are automatically created the first time when a producer or a consumer tries to use them. 
+ +The topic name in this case could be: + +```http + +persistent://test-tenant/ns1/my-topic + +``` + +Start a consumer that creates a subscription on the topic and waits for messages: + +```shell + +$ bin/pulsar-perf consume persistent://test-tenant/ns1/my-topic + +``` + +Start a producer that publishes messages at a fixed rate and reports stats every 10 seconds: + +```shell + +$ bin/pulsar-perf produce persistent://test-tenant/ns1/my-topic + +``` + +To report the topic stats: + +```shell + +$ bin/pulsar-admin topics stats persistent://test-tenant/ns1/my-topic + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/deploy-bare-metal.md b/site2/website/versioned_docs/version-2.8.x/deploy-bare-metal.md new file mode 100644 index 0000000000000..bdd05f24f2566 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/deploy-bare-metal.md @@ -0,0 +1,541 @@ +--- +id: deploy-bare-metal +title: Deploy a cluster on bare metal +sidebar_label: "Bare metal" +original_id: deploy-bare-metal +--- + +:::tip + +1. Single-cluster Pulsar installations should be sufficient for all but the most ambitious use cases. If you are interested in experimenting with +Pulsar or using Pulsar in a startup or on a single team, it is simplest to opt for a single cluster. If you do need to run a multi-cluster Pulsar instance, +see the guide [here](deploy-bare-metal-multi-cluster.md). +2. If you want to use all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you need to download `apache-pulsar-io-connectors` +package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you +have run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +3. 
If you want to use [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download `apache-pulsar-offloaders` +package and install `apache-pulsar-offloaders` under `offloaders` directory in the pulsar directory on every broker node. For more details of how to configure +this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +::: + +Deploying a Pulsar cluster involves doing the following (in order): + +* Deploy a [ZooKeeper](#deploy-a-zookeeper-cluster) cluster (optional) +* Initialize [cluster metadata](#initialize-cluster-metadata) +* Deploy a [BookKeeper](#deploy-a-bookkeeper-cluster) cluster +* Deploy one or more Pulsar [brokers](#deploy-pulsar-brokers) + +## Preparation + +### Requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions, JRE/JDK 11 is recommended. + +> If you already have an existing ZooKeeper cluster and want to reuse it, you do not need to prepare the machines +> for running ZooKeeper. + +To run Pulsar on bare metal, the following configuration is recommended: + +* At least 6 Linux machines or VMs + * 3 for running [ZooKeeper](https://zookeeper.apache.org) + * 3 for running a Pulsar broker, and a [BookKeeper](https://bookkeeper.apache.org) bookie +* A single [DNS](https://en.wikipedia.org/wiki/Domain_Name_System) name covering all of the Pulsar broker hosts + +:::note + +Broker is only supported on 64-bit JVM. + +::: + +> If you do not have enough machines, or to try out Pulsar in cluster mode (and expand the cluster later), +> you can deploy a full Pulsar configuration on one node, where Zookeeper, the bookie and broker are run on the same machine. + +> If you do not have a DNS server, you can use the multi-host format in the service URL instead. 
+ +Each machine in your cluster needs to have [Java 8](https://adoptium.net/?variant=openjdk8) or [Java 11](https://adoptium.net/?variant=openjdk11) installed. + +The following is a diagram showing the basic setup: + +![alt-text](/assets/pulsar-basic-setup.png) + +In this diagram, connecting clients need to be able to communicate with the Pulsar cluster using a single URL. In this case, `pulsar-cluster.acme.com` abstracts over all of the message-handling brokers. Pulsar message brokers run on machines alongside BookKeeper bookies; brokers and bookies, in turn, rely on ZooKeeper. + +### Hardware considerations + +When you deploy a Pulsar cluster, keep in mind the following basic recommendations when you do the capacity planning. + +#### ZooKeeper + +For machines running ZooKeeper, it is recommended to use less powerful machines or VMs. Pulsar uses ZooKeeper only for periodic coordination-related and configuration-related tasks, *not* for basic operations. If you run Pulsar on [Amazon Web Services](https://aws.amazon.com/) (AWS), for example, a [t2.small](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/t2-instances.html) instance is likely to suffice. + +#### Bookies and Brokers + +For machines running a bookie and a Pulsar broker, more powerful machines are required. For an AWS deployment, for example, [i3.4xlarge](https://aws.amazon.com/blogs/aws/now-available-i3-instances-for-demanding-io-intensive-applications/) instances may be appropriate.
On those machines you can use the following: + +* Fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) (for Pulsar brokers) +* Small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache (for BookKeeper bookies) + +## Install the Pulsar binary package + +> You need to install the Pulsar binary package on *each machine in the cluster*, including machines running [ZooKeeper](#deploy-a-zookeeper-cluster) and [BookKeeper](#deploy-a-bookkeeper-cluster). + +To get started deploying a Pulsar cluster on bare metal, you need to download a binary tarball release in one of the following ways: + +* By clicking on the link below directly, which automatically triggers a download: + * Pulsar @pulsar:version@ binary release +* From the Pulsar [downloads page](pulsar:download_page_url) +* From the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) on [GitHub](https://github.com) +* Using [wget](https://www.gnu.org/software/wget): + +```bash + +$ wget pulsar:binary_release_url + +``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash + +$ tar xvzf apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +The extracted directory contains the following subdirectories: + +Directory | Contains +:---------|:-------- +`bin` |[command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`data` | The data storage directory that ZooKeeper and BookKeeper use +`lib` | The 
[JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`logs` | Logs that the installation creates + +## [Install Builtin Connectors (optional)](https://pulsar.apache.org/docs/en/next/standalone/#install-builtin-connectors-optional) + +> Since Pulsar release `2.1.0-incubating`, Pulsar provides a separate binary distribution, containing all the `builtin` connectors. +> If you want to enable those `builtin` connectors, you can follow the instructions as below; otherwise you can +> skip this section for now. + +To get started using builtin connectors, you need to download the connectors tarball release on every broker node in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar IO Connectors @pulsar:version@ release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +Once you download the .nar file, copy the file to directory `connectors` in the pulsar directory. +For example, if you download the connector file `pulsar-io-aerospike-@pulsar:version@.nar`: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +## [Install Tiered Storage Offloaders (optional)](https://pulsar.apache.org/docs/en/next/standalone/#install-tiered-storage-offloaders-optional) + +> Since Pulsar release `2.2.0`, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +> If you want to enable tiered storage feature, you can follow the instructions as below; otherwise you can +> skip this section for now.
+ +To get started using tiered storage offloaders, you need to download the offloaders tarball release on every broker node in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +Once you download the tarball, in the pulsar directory, untar the offloaders package and copy the offloaders as `offloaders` in the pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you can find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more details of how to configure tiered storage feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md) + + +## Deploy a ZooKeeper cluster + +> If you already have an existing zookeeper cluster and want to use it, you can skip this section. + +[ZooKeeper](https://zookeeper.apache.org) manages a variety of essential coordination- and configuration-related tasks for Pulsar. To deploy a Pulsar cluster, you need to deploy ZooKeeper first (before all other components). A 3-node ZooKeeper cluster is the recommended configuration. Pulsar does not make heavy use of ZooKeeper, so more lightweight machines or VMs should suffice for running ZooKeeper. + +To begin, add all ZooKeeper servers to the configuration specified in [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) (in the Pulsar directory that you create [above](#install-the-pulsar-binary-package)). 
The following is an example: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +> If you only have one machine on which to deploy Pulsar, you only need to add one server entry in the configuration file. + +On each host, you need to specify the ID of the node in the `myid` file, which is in the `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + +For example, on a ZooKeeper server like `zk1.us-west.example.com`, you can set the `myid` value as follows: + +```bash + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com`, the command is `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and have the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start zookeeper + +``` + +> If you plan to deploy Zookeeper with the Bookie on the same node, you need to start zookeeper by using different stats +> port by configuring the `metricsProvider.httpPort` in zookeeper.conf. + +## Initialize cluster metadata + +Once you deploy ZooKeeper for your cluster, you need to write some metadata to ZooKeeper. You only need to write this data **once**. + +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. 
This command can be run on any machine in your Pulsar cluster, so the metadata can be initialized from a ZooKeeper, broker, or bookie machine. The following is an example: + +```shell + +$ bin/pulsar initialize-cluster-metadata \ + --cluster pulsar-cluster-1 \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2181 \ + --web-service-url http://pulsar.us-west.example.com:8080 \ + --web-service-url-tls https://pulsar.us-west.example.com:8443 \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650 \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +As you can see from the example above, you will need to specify the following: + +Flag | Description +:----|:----------- +`--cluster` | A name for the cluster +`--zookeeper` | A "local" ZooKeeper connection string for the cluster. This connection string only needs to include *one* machine in the ZooKeeper cluster. +`--configuration-store` | The configuration store connection string for the entire instance. As with the `--zookeeper` flag, this connection string only needs to include *one* machine in the ZooKeeper cluster. +`--web-service-url` | The web service URL for the cluster, plus a port. This URL should be a standard DNS name. The default port is 8080 (using a different port is not recommended). +`--web-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster. The default port is 8443 (using a different port is not recommended). +`--broker-service-url` | A broker service URL enabling interaction with the brokers in the cluster. This URL should use the same DNS name as the web service URL but should use the `pulsar` scheme instead. The default port is 6650 (using a different port is not recommended).
+`--broker-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. The default port is 6651 (you had better not use a different port). + + +> If you do not have a DNS server, you can use multi-host format in the service URL with the following settings: +> + +> ```properties +> +> --web-service-url http://host1:8080,host2:8080,host3:8080 \ +> --web-service-url-tls https://host1:8443,host2:8443,host3:8443 \ +> --broker-service-url pulsar://host1:6650,host2:6650,host3:6650 \ +> --broker-service-url-tls pulsar+ssl://host1:6651,host2:6651,host3:6651 +> +> +> ``` + +> +> If you want to use an existing BookKeeper cluster, you can add the `--existing-bk-metadata-service-uri` flag as follows: +> + +> ```properties +> +> --existing-bk-metadata-service-uri "zk+null://zk1:2181;zk2:2181/ledgers" \ +> --web-service-url http://host1:8080,host2:8080,host3:8080 \ +> --web-service-url-tls https://host1:8443,host2:8443,host3:8443 \ +> --broker-service-url pulsar://host1:6650,host2:6650,host3:6650 \ +> --broker-service-url-tls pulsar+ssl://host1:6651,host2:6651,host3:6651 +> +> +> ``` + +> You can obtain the metadata service URI of the existing BookKeeper cluster by using the `bin/bookkeeper shell whatisinstanceid` command. You must enclose the value in double quotes since the multiple metadata service URIs are separated with semicolons. + +## Deploy a BookKeeper cluster + +[BookKeeper](https://bookkeeper.apache.org) handles all persistent data storage in Pulsar. You need to deploy a cluster of BookKeeper bookies to use Pulsar. You can choose to run a **3-bookie BookKeeper cluster**. + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. 
The most important step in configuring bookies for our purposes here is ensuring that [`zkServers`](reference-configuration.md#bookkeeper-zkServers) is set to the connection string for the ZooKeeper cluster. The following is an example: + +```properties + +zkServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +``` + +Once you appropriately modify the `zkServers` parameter, you can make any other configuration changes that you require. You can find a full listing of the available BookKeeper configuration parameters [here](reference-configuration.md#bookkeeper). However, consulting the [BookKeeper documentation](http://bookkeeper.apache.org/docs/latest/reference/config/) for a more in-depth guide might be a better choice. + +Once you apply the desired configuration in `conf/bookkeeper.conf`, you can start up a bookie on each of your BookKeeper hosts. You can start up each bookie either in the background, using [nohup](https://en.wikipedia.org/wiki/Nohup), or in the foreground. + +To start the bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +To start the bookie in the foreground: + +```bash + +$ bin/pulsar bookie + +``` + +You can verify that a bookie works properly by running the `bookiesanity` command on the [BookKeeper shell](reference-cli-tools.md#shell): + +```bash + +$ bin/bookkeeper shell bookiesanity + +``` + +This command creates an ephemeral BookKeeper ledger on the local bookie, writes a few entries, reads them back, and finally deletes the ledger. + +After you start all the bookies, you can use `simpletest` command for [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to verify all the bookies in the cluster are up running. 
+ +```bash + +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> + +``` + +This command creates a `num-bookies` sized ledger on the cluster, writes a few entries, and finally deletes the ledger. + + +## Deploy Pulsar brokers + +Pulsar brokers are the last thing you need to deploy in your Pulsar cluster. Brokers handle Pulsar messages and provide the administrative interface of Pulsar. A good choice is to run **3 brokers**, one for each machine that already runs a BookKeeper bookie. + +### Configure Brokers + +The most important element of broker configuration is ensuring that each broker is aware of the ZooKeeper cluster that you have deployed. Ensure that the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) and [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameters are correct. In this case, since you only have 1 cluster and no configuration store setup, the `configurationStoreServers` point to the same `zookeeperServers`. + +```properties + +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +configurationStoreServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +``` + +You also need to specify the cluster name (matching the name that you provided when you [initialize the metadata of the cluster](#initialize-cluster-metadata)): + +```properties + +clusterName=pulsar-cluster-1 + +``` + +In addition, you need to match the broker and web service ports provided when you initialize the metadata of the cluster (especially when you use a different port than the default): + +```properties + +brokerServicePort=6650 +brokerServicePortTls=6651 +webServicePort=8080 +webServicePortTls=8443 + +``` + +> If you deploy Pulsar in a one-node cluster, you should update the replication settings in `conf/broker.conf` to `1`. 
+> + +> ```properties +> +> # Number of bookies to use when creating a ledger +> managedLedgerDefaultEnsembleSize=1 +> +> # Number of copies to store for each message +> managedLedgerDefaultWriteQuorum=1 +> +> # Number of guaranteed copies (acks to wait before write is complete) +> managedLedgerDefaultAckQuorum=1 +> +> +> ``` + + +### Enable Pulsar Functions (optional) + +If you want to enable [Pulsar Functions](functions-overview.md), you can follow the instructions as below: + +1. Edit `conf/broker.conf` to enable functions worker, by setting `functionsWorkerEnabled` to `true`. + + ```conf + + functionsWorkerEnabled=true + + ``` + +2. Edit `conf/functions_worker.yml` and set `pulsarFunctionsCluster` to the cluster name that you provide when you [initialize the metadata of the cluster](#initialize-cluster-metadata). + + ```conf + + pulsarFunctionsCluster: pulsar-cluster-1 + + ``` + +If you want to learn more options about deploying the functions worker, check out [Deploy and manage functions worker](functions-worker.md). + +### Start Brokers + +You can then provide any other configuration changes that you want in the [`conf/broker.conf`](reference-configuration.md#broker) file. Once you decide on a configuration, you can start up the brokers for your Pulsar cluster. Like ZooKeeper and BookKeeper, you can start brokers either in the foreground or in the background, using nohup. + +You can start a broker in the foreground using the [`pulsar broker`](reference-cli-tools.md#pulsar-broker) command: + +```bash + +$ bin/pulsar broker + +``` + +You can start a broker in the background using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start broker + +``` + +Once you successfully start up all the brokers that you intend to use, your Pulsar cluster should be ready to go! + +## Connect to the running cluster + +Once your Pulsar cluster is up and running, you should be able to connect with it using Pulsar clients. 
One such client is the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool, which is included with the Pulsar binary package. The `pulsar-client` tool can publish messages to and consume messages from Pulsar topics and thus provides a simple way to make sure that your cluster runs properly. + +To use the `pulsar-client` tool, first modify the client configuration file in [`conf/client.conf`](reference-configuration.md#client) in your binary package. You need to change the values for `webServiceUrl` and `brokerServiceUrl`, substituting `localhost` (which is the default), with the DNS name that you assign to your broker/bookie hosts. The following is an example: + +```properties + +webServiceUrl=http://us-west.example.com:8080 +brokerServiceUrl=pulsar://us-west.example.com:6650 + +``` + +> If you do not have a DNS server, you can specify multiple hosts in the service URL as follows: +> + +> ```properties +> +> webServiceUrl=http://host1:8080,host2:8080,host3:8080 +> brokerServiceUrl=pulsar://host1:6650,host2:6650,host3:6650 +> +> +> ``` + + +Once that is complete, you can publish a message to the Pulsar topic: + +```bash + +$ bin/pulsar-client produce \ + persistent://public/default/test \ + -n 1 \ + -m "Hello Pulsar" + +``` + +> You may need to use a different cluster name in the topic if you specify a cluster name other than `pulsar-cluster-1`. + +This command publishes a single message to the Pulsar topic. In addition, you can subscribe to the Pulsar topic in a different terminal before publishing messages as below: + +```bash + +$ bin/pulsar-client consume \ + persistent://public/default/test \ + -n 100 \ + -s "consumer-test" \ + -t "Exclusive" + +``` + +Once you successfully publish the above message to the topic, you should see it in the standard output: + +```bash + +----- got message ----- +Hello Pulsar + +``` + +## Run Functions + +> If you have [enabled](#enable-pulsar-functions-optional) Pulsar Functions, you can try out the Pulsar Functions now. 
+ +Create an ExclamationFunction `exclamation`. + +```bash + +bin/pulsar-admin functions create \ + --jar examples/api-examples.jar \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --tenant public \ + --namespace default \ + --name exclamation + +``` + +Check whether the function runs as expected by [triggering](functions-deploying.md#triggering-pulsar-functions) the function. + +```bash + +bin/pulsar-admin functions trigger --name exclamation --trigger-value "hello world" + +``` + +You should see the following output: + +```shell + +hello world! + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/deploy-dcos.md b/site2/website/versioned_docs/version-2.8.x/deploy-dcos.md new file mode 100644 index 0000000000000..952d5f47e30fa --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/deploy-dcos.md @@ -0,0 +1,200 @@ +--- +id: deploy-dcos +title: Deploy Pulsar on DC/OS +sidebar_label: "DC/OS" +original_id: deploy-dcos +--- + +:::tip + +If you want to enable all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you can choose to use `apachepulsar/pulsar-all` image instead of +`apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +::: + +[DC/OS](https://dcos.io/) (the DataCenter Operating System) is a distributed operating system used for deploying and managing applications and systems on [Apache Mesos](http://mesos.apache.org/). DC/OS is an open-source tool that [Mesosphere](https://mesosphere.com/) creates and maintains . + +Apache Pulsar is available as a [Marathon Application Group](https://mesosphere.github.io/marathon/docs/application-groups.html), which runs multiple applications as manageable sets. 
+ +## Prerequisites + +In order to run Pulsar on DC/OS, you need the following: + +* DC/OS version [1.9](https://docs.mesosphere.com/1.9/) or higher +* A [DC/OS cluster](https://docs.mesosphere.com/1.9/installing/) with at least three agent nodes +* The [DC/OS CLI tool](https://docs.mesosphere.com/1.9/cli/install/) installed +* The [`PulsarGroups.json`](https://github.com/apache/pulsar/blob/master/deployment/dcos/PulsarGroups.json) configuration file from the Pulsar GitHub repo. + + ```bash + + $ curl -O https://raw.githubusercontent.com/apache/pulsar/master/deployment/dcos/PulsarGroups.json + + ``` + +Each node in the DC/OS-managed Mesos cluster must have at least: + +* 4 CPU +* 4 GB of memory +* 60 GB of total persistent disk + +Alternatively, you can change the configuration in `PulsarGroups.json` to match the resources of your DC/OS cluster. + +## Deploy Pulsar using the DC/OS command interface + +You can deploy Pulsar on DC/OS using this command: + +```bash + +$ dcos marathon group add PulsarGroups.json + +``` + +This command deploys Docker container instances in three groups, which together comprise a Pulsar cluster: + +* 3 bookies (1 [bookie](reference-terminology.md#bookie) on each agent node and 1 [bookie recovery](http://bookkeeper.apache.org/docs/latest/admin/autorecovery/) instance) +* 3 Pulsar [brokers](reference-terminology.md#broker) (1 broker on each node and 1 admin instance) +* 1 [Prometheus](http://prometheus.io/) instance and 1 [Grafana](https://grafana.com/) instance + + +> When you run DC/OS, a ZooKeeper cluster already runs at `master.mesos:2181`, thus you do not have to install or start up ZooKeeper separately. + +After executing the `dcos` command above, click on the **Services** tab in the DC/OS [GUI interface](https://docs.mesosphere.com/latest/gui/), which you can access at [http://m1.dcos](http://m1.dcos) in this example. You should see several applications in the process of deploying. 
+ +![DC/OS command executed](/assets/dcos_command_execute.png) + +![DC/OS command executed2](/assets/dcos_command_execute2.png) + +## The BookKeeper group + +To monitor the status of the BookKeeper cluster deployment, click on the **bookkeeper** group in the parent **pulsar** group. + +![DC/OS bookkeeper status](/assets/dcos_bookkeeper_status.png) + +At this point, 3 [bookies](reference-terminology.md#bookie) should be shown as green, which means that the bookies have been deployed successfully and are now running. + +![DC/OS bookkeeper running](/assets/dcos_bookkeeper_run.png) + +You can also click into each bookie instance to get more detailed information, such as the bookie running log. + +![DC/OS bookie log](/assets/dcos_bookie_log.png) + +To display information about the BookKeeper in ZooKeeper, you can visit [http://m1.dcos/exhibitor](http://m1.dcos/exhibitor). In this example, 3 bookies are under the `available` directory. + +![DC/OS bookkeeper in zk](/assets/dcos_bookkeeper_in_zookeeper.png) + +## The Pulsar broker Group + +Similar to the BookKeeper group above, click into the **brokers** to check the status of the Pulsar brokers. + +![DC/OS broker status](/assets/dcos_broker_status.png) + +![DC/OS broker running](/assets/dcos_broker_run.png) + +You can also click into each broker instance to get more detailed information, such as the broker running log. + +![DC/OS broker log](/assets/dcos_broker_log.png) + +Broker cluster information in ZooKeeper is also available through the web UI. In this example, you can see that the `loadbalance` and `managed-ledgers` directories have been created. + +![DC/OS broker in zk](/assets/dcos_broker_in_zookeeper.png) + +## Monitor Group + +The **monitor** group consists of Prometheus and Grafana. + +![DC/OS monitor status](/assets/dcos_monitor_status.png) + +### Prometheus + +Click into the instance of `prom` to get the endpoint of Prometheus, which is `192.168.65.121:9090` in this example. 
+ +![DC/OS prom endpoint](/assets/dcos_prom_endpoint.png) + +If you click that endpoint, you can see the Prometheus dashboard. The [http://192.168.65.121:9090/targets](http://192.168.65.121:9090/targets) URL displays all the bookies and brokers. + +![DC/OS prom targets](/assets/dcos_prom_targets.png) + +### Grafana + +Click into `grafana` to get the endpoint for Grafana, which is `192.168.65.121:3000` in this example. + +![DC/OS grafana endpoint](/assets/dcos_grafana_endpoint.png) + +If you click that endpoint, you can access the Grafana dashboard. + +![DC/OS grafana targets](/assets/dcos_grafana_dashboard.png) + +## Run a simple Pulsar consumer and producer on DC/OS + +Now that you have a fully deployed Pulsar cluster, you can run a simple consumer and producer to show Pulsar on DC/OS in action. + +### Download and prepare the Pulsar Java tutorial + +You can clone a [Pulsar Java tutorial](https://github.com/streamlio/pulsar-java-tutorial) repo. This repo contains a simple Pulsar consumer and producer (you can find more information in the `README` file of the repo). + +```bash + +$ git clone https://github.com/streamlio/pulsar-java-tutorial + +``` + +Change the `SERVICE_URL` from `pulsar://localhost:6650` to `pulsar://a1.dcos:6650` in both [`ConsumerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ConsumerTutorial.java) and [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java). +The `pulsar://a1.dcos:6650` endpoint is for the broker service. You can fetch the endpoint details for each broker instance from the DC/OS GUI. `a1.dcos` is a DC/OS client agent, which runs a broker. You can also use the IP address of the client agent in place of `a1.dcos`. 
+ +Now, change the message number from 10 to 10000000 in main method of [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java) so that it can produce more messages. + +Now compile the project code using the command below: + +```bash + +$ mvn clean package + +``` + +### Run the consumer and producer + +Execute this command to run the consumer: + +```bash + +$ mvn exec:java -Dexec.mainClass="tutorial.ConsumerTutorial" + +``` + +Execute this command to run the producer: + +```bash + +$ mvn exec:java -Dexec.mainClass="tutorial.ProducerTutorial" + +``` + +You can see the producer producing messages and the consumer consuming messages through the DC/OS GUI. + +![DC/OS pulsar producer](/assets/dcos_producer.png) + +![DC/OS pulsar consumer](/assets/dcos_consumer.png) + +### View Grafana metric output + +While the producer and consumer run, you can access running metrics information from Grafana. + +![DC/OS pulsar dashboard](/assets/dcos_metrics.png) + + +## Uninstall Pulsar + +You can shut down and uninstall the `pulsar` application from DC/OS at any time in the following two ways: + +1. Using the DC/OS GUI, you can choose **Delete** at the right end of Pulsar group. + + ![DC/OS pulsar uninstall](/assets/dcos_uninstall.png) + +2. You can use the following command: + + ```bash + + $ dcos marathon group remove /pulsar + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/deploy-docker.md b/site2/website/versioned_docs/version-2.8.x/deploy-docker.md new file mode 100644 index 0000000000000..8348d78deb237 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/deploy-docker.md @@ -0,0 +1,60 @@ +--- +id: deploy-docker +title: Deploy a cluster on Docker +sidebar_label: "Docker" +original_id: deploy-docker +--- + +To deploy a Pulsar cluster on Docker, complete the following steps: +1. Deploy a ZooKeeper cluster (optional) +2. Initialize cluster metadata +3. 
Deploy a BookKeeper cluster +4. Deploy one or more Pulsar brokers + +## Prepare + +To run Pulsar on Docker, you need to create a container for each Pulsar component: ZooKeeper, BookKeeper and broker. You can pull the images of ZooKeeper and BookKeeper separately on [Docker Hub](https://hub.docker.com/), and pull a [Pulsar image](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) for the broker. You can also pull only one [Pulsar image](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) and create three containers with this image. This tutorial takes the second option as an example. + +### Pull a Pulsar image +You can pull a Pulsar image from [Docker Hub](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) with the following command. + +``` + +docker pull apachepulsar/pulsar-all:latest + +``` + +### Create three containers +Create containers for ZooKeeper, BookKeeper and broker. In this example, they are named as `zookeeper`, `bookkeeper` and `broker` respectively. You can name them as you want with the `--name` flag. By default, the container names are created randomly. + +``` + +docker run -it --name bookkeeper apachepulsar/pulsar-all:latest /bin/bash +docker run -it --name zookeeper apachepulsar/pulsar-all:latest /bin/bash +docker run -it --name broker apachepulsar/pulsar-all:latest /bin/bash + +``` + +### Create a network +To deploy a Pulsar cluster on Docker, you need to create a `network` and connect the containers of ZooKeeper, BookKeeper and broker to this network. The following command creates the network `pulsar`: + +``` + +docker network create pulsar + +``` + +### Connect containers to network +Connect the containers of ZooKeeper, BookKeeper and broker to the `pulsar` network with the following commands. 
+ +``` + +docker network connect pulsar zookeeper +docker network connect pulsar bookkeeper +docker network connect pulsar broker + +``` + +To check whether the containers are successfully connected to the network, enter the `docker network inspect pulsar` command. + +For detailed information about how to deploy ZooKeeper cluster, BookKeeper cluster, brokers, see [deploy a cluster on bare metal](deploy-bare-metal.md). diff --git a/site2/website/versioned_docs/version-2.8.x/deploy-kubernetes.md b/site2/website/versioned_docs/version-2.8.x/deploy-kubernetes.md new file mode 100644 index 0000000000000..1aefc6ad79f71 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/deploy-kubernetes.md @@ -0,0 +1,11 @@ +--- +id: deploy-kubernetes +title: Deploy Pulsar on Kubernetes +sidebar_label: "Kubernetes" +original_id: deploy-kubernetes +--- + +To get up and running with these charts as fast as possible, in a **non-production** use case, we provide +a [quick start guide](getting-started-helm.md) for Proof of Concept (PoC) deployments. + +To configure and install a Pulsar cluster on Kubernetes for production usage, follow the complete [Installation Guide](helm-install.md). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/deploy-monitoring.md b/site2/website/versioned_docs/version-2.8.x/deploy-monitoring.md new file mode 100644 index 0000000000000..f9fe0e0bb97be --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/deploy-monitoring.md @@ -0,0 +1,148 @@ +--- +id: deploy-monitoring +title: Monitor +sidebar_label: "Monitor" +original_id: deploy-monitoring +--- + +You can use different ways to monitor a Pulsar cluster, exposing both metrics related to the usage of topics and the overall health of the individual components of the cluster. + +## Collect metrics + +You can collect broker stats, ZooKeeper stats, and BookKeeper stats. 
+ +### Broker stats + +You can collect Pulsar broker metrics from brokers and export the metrics in JSON format. The Pulsar broker metrics mainly have two types: + +* *Destination dumps*, which contain stats for each individual topic. You can fetch the destination dumps using the command below: + + ```shell + + bin/pulsar-admin broker-stats destinations + + ``` + +* Broker metrics, which contain the broker information and topics stats aggregated at namespace level. You can fetch the broker metrics by using the following command: + + ```shell + + bin/pulsar-admin broker-stats monitoring-metrics + + ``` + +All the message rates are updated every minute. + +The aggregated broker metrics are also exposed in the [Prometheus](https://prometheus.io) format at: + +```shell + +http://$BROKER_ADDRESS:8080/metrics/ + +``` + +### ZooKeeper stats + +The local ZooKeeper, configuration store server and clients that are shipped with Pulsar can expose detailed stats through Prometheus. + +```shell + +http://$LOCAL_ZK_SERVER:8000/metrics +http://$GLOBAL_ZK_SERVER:8001/metrics + +``` + +The default port of local ZooKeeper is `8000` and the default port of the configuration store is `8001`. You can use a different stats port by configuring `metricsProvider.httpPort` in the `conf/zookeeper.conf` file. + +### BookKeeper stats + +You can configure the stats frameworks for BookKeeper by modifying the `statsProviderClass` in the `conf/bookkeeper.conf` file. + +The default BookKeeper configuration enables the Prometheus exporter. The configuration is included with Pulsar distribution. + +```shell + +http://$BOOKIE_ADDRESS:8000/metrics + +``` + +The default port for bookie is `8000`. You can change the port by configuring `prometheusStatsHttpPort` in the `conf/bookkeeper.conf` file. + +### Managed cursor acknowledgment state +The acknowledgment state is persistent to the ledger first. When the acknowledgment state fails to be persistent to the ledger, they are persistent to ZooKeeper. 
To track the stats of acknowledgement, you can configure the metrics for the managed cursor. + +``` + +brk_ml_cursor_persistLedgerSucceed(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistLedgerErrors(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistZookeeperSucceed(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistZookeeperErrors(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_nonContiguousDeletedMessagesRange(namespace="", ledger_name="", cursor_name:"") + +``` + +Those metrics are added in the Prometheus interface. You can monitor and check the metrics stats in Grafana. + +### Function and connector stats + +You can collect functions worker stats from `functions-worker` and export the metrics in JSON formats, which contain functions worker JVM metrics. + +``` + +pulsar-admin functions-worker monitoring-metrics + +``` + +You can collect functions and connectors metrics from `functions-worker` and export the metrics in JSON formats. + +``` + +pulsar-admin functions-worker function-stats + +``` + +The aggregated functions and connectors metrics can be exposed in Prometheus formats as below. You can get [`FUNCTIONS_WORKER_ADDRESS`](http://pulsar.apache.org/docs/en/next/functions-worker/) and `WORKER_PORT` from the `functions_worker.yml` file. + +``` + +http://$FUNCTIONS_WORKER_ADDRESS:$WORKER_PORT/metrics + +``` + +## Configure Prometheus + +You can use Prometheus to collect all the metrics exposed for Pulsar components and set up [Grafana](https://grafana.com/) dashboards to display the metrics and monitor your Pulsar cluster. For details, refer to [Prometheus guide](https://prometheus.io/docs/introduction/getting_started/). + +When you run Pulsar on bare metal, you can provide the list of nodes to be probed. When you deploy Pulsar in a Kubernetes cluster, the monitoring is set up automatically. For details, refer to [Kubernetes instructions](helm-deploy.md). 
+ +## Dashboards + +When you collect time series statistics, the major problem is to make sure the number of dimensions attached to the data does not explode. Thus you only need to collect time series of metrics aggregated at the namespace level. + +### Pulsar per-topic dashboard + +The per-topic dashboard instructions are available at [Pulsar manager](administration-pulsar-manager.md). + +### Grafana + +You can use grafana to create dashboard driven by the data that is stored in Prometheus. + +When you deploy Pulsar on Kubernetes, a `pulsar-grafana` Docker image is enabled by default. You can use the docker image with the principal dashboards. + +Enter the command below to use the dashboard manually: + +```shell + +docker run -p3000:3000 \ + -e PROMETHEUS_URL=http://$PROMETHEUS_HOST:9090/ \ + apachepulsar/pulsar-grafana:latest + +``` + +The following are some Grafana dashboards examples: + +- [pulsar-grafana](http://pulsar.apache.org/docs/en/deploy-monitoring/#grafana): a Grafana dashboard that displays metrics collected in Prometheus for Pulsar clusters running on Kubernetes. +- [apache-pulsar-grafana-dashboard](https://github.com/streamnative/apache-pulsar-grafana-dashboard): a collection of Grafana dashboard templates for different Pulsar components running on both Kubernetes and on-premise machines. + +## Alerting rules +You can set alerting rules according to your Pulsar environment. To configure alerting rules for Apache Pulsar, refer to [alerting rules](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). 
diff --git a/site2/website/versioned_docs/version-2.8.x/develop-binary-protocol.md b/site2/website/versioned_docs/version-2.8.x/develop-binary-protocol.md new file mode 100644 index 0000000000000..74ef751e64324 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/develop-binary-protocol.md @@ -0,0 +1,581 @@ +--- +id: develop-binary-protocol +title: Pulsar binary protocol specification +sidebar_label: "Binary protocol" +original_id: develop-binary-protocol +--- + +Pulsar uses a custom binary protocol for communications between producers/consumers and brokers. This protocol is designed to support required features, such as acknowledgements and flow control, while ensuring maximum transport and implementation efficiency. + +Clients and brokers exchange *commands* with each other. Commands are formatted as binary [protocol buffer](https://developers.google.com/protocol-buffers/) (aka *protobuf*) messages. The format of protobuf commands is specified in the [`PulsarApi.proto`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto) file and also documented in the [Protobuf interface](#protobuf-interface) section below. + +> ### Connection sharing +> Commands for different producers and consumers can be interleaved and sent through the same connection without restriction. + +All commands associated with Pulsar's protocol are contained in a [`BaseCommand`](#pulsar.proto.BaseCommand) protobuf message that includes a [`Type`](#pulsar.proto.Type) [enum](https://developers.google.com/protocol-buffers/docs/proto#enum) with all possible subcommands as optional fields. `BaseCommand` messages can specify only one subcommand. + +## Framing + +Since protobuf doesn't provide any sort of message frame, all messages in the Pulsar protocol are prepended with a 4-byte field that specifies the size of the frame. The maximum allowable size of a single frame is 5 MB. + +The Pulsar protocol allows for two types of commands: + +1. 
**Simple commands** that do not carry a message payload. +2. **Payload commands** that bear a payload that is used when publishing or delivering messages. In payload commands, the protobuf command data is followed by protobuf [metadata](#message-metadata) and then the payload, which is passed in raw format outside of protobuf. All sizes are passed as 4-byte unsigned big endian integers. + +> Message payloads are passed in raw format rather than protobuf format for efficiency reasons. + +### Simple commands + +Simple (payload-free) commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:------------|:----------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | + +### Payload commands + +Payload commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:-------------|:--------------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | +| magicNumber | A 2-byte byte array (`0x0e01`) identifying the current format | 2 | +| checksum | A [CRC32-C checksum](http://www.evanjones.ca/crc32c.html) of everything that comes after it | 4 | +| metadataSize | The size of the message [metadata](#message-metadata) | 4 | +| metadata | The message [metadata](#message-metadata) stored as a binary protobuf message | | +| payload | Anything left in the frame is considered the payload and can include any sequence of 
bytes | | + +## Message metadata + +Message metadata is stored alongside the application-specified payload as a serialized protobuf message. Metadata is created by the producer and passed on unchanged to the consumer. + +| Field | Description | +|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `producer_name` | The name of the producer that published the message | +| `sequence_id` | The sequence ID of the message, assigned by producer | +| `publish_time` | The publish timestamp in Unix time (i.e. as the number of milliseconds since January 1st, 1970 in UTC) | +| `properties` | A sequence of key/value pairs (using the [`KeyValue`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto#L32) message). These are application-defined keys and values with no special meaning to Pulsar. | +| `replicated_from` *(optional)* | Indicates that the message has been replicated and specifies the name of the [cluster](reference-terminology.md#cluster) where the message was originally published | +| `partition_key` *(optional)* | While publishing on a partition topic, if the key is present, the hash of the key is used to determine which partition to choose. Partition key is used as the message key. 
| +| `compression` *(optional)* | Signals that payload has been compressed and with which compression library | +| `uncompressed_size` *(optional)* | If compression is used, the producer must fill the uncompressed size field with the original payload size | +| `num_messages_in_batch` *(optional)* | If this message is really a [batch](#batch-messages) of multiple entries, this field must be set to the number of messages in the batch | + +### Batch messages + +When using batch messages, the payload will be containing a list of entries, +each of them with its individual metadata, defined by the `SingleMessageMetadata` +object. + + +For a single batch, the payload format will look like this: + + +| Field | Description | +|:--------------|:------------------------------------------------------------| +| metadataSizeN | The size of the single message metadata serialized Protobuf | +| metadataN | Single message metadata | +| payloadN | Message payload passed by application | + +Each metadata field looks like this; + +| Field | Description | +|:---------------------------|:--------------------------------------------------------| +| properties | Application-defined properties | +| partition key *(optional)* | Key to indicate the hashing to a particular partition | +| payload_size | Size of the payload for the single message in the batch | + +When compression is enabled, the whole batch will be compressed at once. + +## Interactions + +### Connection establishment + +After opening a TCP connection to a broker, typically on port 6650, the client +is responsible to initiate the session. + +![Connect interaction](/assets/binary-protocol-connect.png) + +After receiving a `Connected` response from the broker, the client can +consider the connection ready to use. Alternatively, if the broker doesn't +validate the client authentication, it will reply with an `Error` command and +close the TCP connection. 
+ +Example: + +```protobuf + +message CommandConnect { + "client_version" : "Pulsar-Client-Java-v1.15.2", + "auth_method_name" : "my-authentication-plugin", + "auth_data" : "my-auth-data", + "protocol_version" : 6 +} + +``` + +Fields: + * `client_version` → String based identifier. Format is not enforced + * `auth_method_name` → *(optional)* Name of the authentication plugin if auth + enabled + * `auth_data` → *(optional)* Plugin specific authentication data + * `protocol_version` → Indicates the protocol version supported by the + client. Broker will not send commands introduced in newer revisions of the + protocol. Broker might be enforcing a minimum version + +```protobuf + +message CommandConnected { + "server_version" : "Pulsar-Broker-v1.15.2", + "protocol_version" : 6 +} + +``` + +Fields: + * `server_version` → String identifier of broker version + * `protocol_version` → Protocol version supported by the broker. Client + must not attempt to send commands introduced in newer revisions of the + protocol + +### Keep Alive + +To identify prolonged network partitions between clients and brokers or cases +in which a machine crashes without interrupting the TCP connection on the remote +end (eg: power outage, kernel panic, hard reboot...), we have introduced a +mechanism to probe for the availability status of the remote peer. + +Both clients and brokers are sending `Ping` commands periodically and they will +close the socket if a `Pong` response is not received within a timeout (default +used by broker is 60s). + +A valid implementation of a Pulsar client is not required to send the `Ping` +probe, though it is required to promptly reply after receiving one from the +broker in order to prevent the remote side from forcibly closing the TCP connection. + + +### Producer + +In order to send messages, a client needs to establish a producer. When creating +a producer, the broker will first verify that this particular client is +authorized to publish on the topic. 
+ +Once the client gets confirmation of the producer creation, it can publish +messages to the broker, referring to the producer id negotiated before. + +![Producer interaction](/assets/binary-protocol-producer.png) + +##### Command Producer + +```protobuf + +message CommandProducer { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "producer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete topic name to where you want to create the producer on + * `producer_id` → Client generated producer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `producer_name` → *(optional)* If a producer name is specified, the name will + be used, otherwise the broker will generate a unique name. Generated + producer name is guaranteed to be globally unique. Implementations are + expected to let the broker generate a new producer name when the producer + is initially created, then reuse it when recreating the producer after + reconnections. + +The broker will reply with either `ProducerSuccess` or `Error` commands. + +##### Command ProducerSuccess + +```protobuf + +message CommandProducerSuccess { + "request_id" : 1, + "producer_name" : "generated-unique-producer-name" +} + +``` + +Parameters: + * `request_id` → Original id of the `CreateProducer` request + * `producer_name` → Generated globally unique producer name or the name + specified by the client, if any. + +##### Command Send + +Command `Send` is used to publish a new message within the context of an +already existing producer. This command is used in a frame that includes command +as well as message payload, for which the complete format is specified in the [payload commands](#payload-commands) section. 
+ +```protobuf + +message CommandSend { + "producer_id" : 1, + "sequence_id" : 0, + "num_messages" : 1 +} + +``` + +Parameters: + * `producer_id` → id of an existing producer + * `sequence_id` → each message has an associated sequence id which is expected + to be implemented with a counter starting at 0. The `SendReceipt` that + acknowledges the effective publishing of a message will refer to it by + its sequence id. + * `num_messages` → *(optional)* Used when publishing a batch of messages at + once. + +##### Command SendReceipt + +After a message has been persisted on the configured number of replicas, the +broker will send the acknowledgment receipt to the producer. + +```protobuf + +message CommandSendReceipt { + "producer_id" : 1, + "sequence_id" : 0, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `producer_id` → id of producer originating the send request + * `sequence_id` → sequence id of the published message + * `message_id` → message id assigned by the system to the published message. + Unique within a single cluster. Message id is composed of 2 longs, `ledgerId` + and `entryId`, that reflect that this unique id is assigned when appending + to a BookKeeper ledger + + +##### Command CloseProducer + +**Note**: *This command can be sent by either producer or broker*. + +When receiving a `CloseProducer` command, the broker will stop accepting any +more messages for the producer, wait until all pending messages are persisted +and then reply `Success` to the client. + +The broker can send a `CloseProducer` command to the client when it's performing +a graceful failover (e.g., broker is being restarted, or the topic is being unloaded +by load balancer to be transferred to a different broker). + +When receiving the `CloseProducer`, the client is expected to go through the +service discovery lookup again and recreate the producer. The TCP +connection is not affected. 
+ +### Consumer + +A consumer is used to attach to a subscription and consume messages from it. +After every reconnection, a client needs to subscribe to the topic. If a +subscription is not already there, a new one will be created. + +![Consumer](/assets/binary-protocol-consumer.png) + +#### Flow control + +After the consumer is ready, the client needs to *give permission* to the +broker to push messages. This is done with the `Flow` command. + +A `Flow` command gives additional *permits* to send messages to the consumer. +A typical consumer implementation will use a queue to accumulate these messages +before the application is ready to consume them. + +After the application has dequeued half of the messages in the queue, the consumer +sends permits to the broker to ask for more messages (equals to half of the messages in the queue). + +For example, if the queue size is 1000 and the consumer consumes 500 messages in the queue. +Then the consumer sends permits to the broker to ask for 500 messages. + +##### Command Subscribe + +```protobuf + +message CommandSubscribe { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "subscription" : "my-subscription-name", + "subType" : "Exclusive", + "consumer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete topic name to where you want to create the consumer on + * `subscription` → Subscription name + * `subType` → Subscription type: Exclusive, Shared, Failover, Key_Shared + * `consumer_id` → Client generated consumer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `consumer_name` → *(optional)* Clients can specify a consumer name. This + name can be used to track a particular consumer in the stats. 
Also, in + Failover subscription type, the name is used to decide which consumer is + elected as *master* (the one receiving messages): consumers are sorted by + their consumer name and the first one is elected master. + +##### Command Flow + +```protobuf + +message CommandFlow { + "consumer_id" : 1, + "messagePermits" : 1000 +} + +``` + +Parameters: +* `consumer_id` → Id of an already established consumer +* `messagePermits` → Number of additional permits to grant to the broker for + pushing more messages + +##### Command Message + +Command `Message` is used by the broker to push messages to an existing consumer, +within the limits of the given permits. + + +This command is used in a frame that includes the message payload as well, for +which the complete format is specified in the [payload commands](#payload-commands) +section. + +```protobuf + +message CommandMessage { + "consumer_id" : 1, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +##### Command Ack + +An `Ack` is used to signal to the broker that a given message has been +successfully processed by the application and can be discarded by the broker. + +In addition, the broker will also maintain the consumer position based on the +acknowledged messages. + +```protobuf + +message CommandAck { + "consumer_id" : 1, + "ack_type" : "Individual", + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `consumer_id` → Id of an already established consumer + * `ack_type` → Type of acknowledgment: `Individual` or `Cumulative` + * `message_id` → Id of the message to acknowledge + * `validation_error` → *(optional)* Indicates that the consumer has discarded + the messages due to: `UncompressedSizeCorruption`, + `DecompressionError`, `ChecksumMismatch`, `BatchDeSerializeError` + +##### Command CloseConsumer + +**Note**: *This command can be sent by either consumer or broker*. 
+ +This command behaves the same as [`CloseProducer`](#command-closeproducer) + +##### Command RedeliverUnacknowledgedMessages + +A consumer can ask the broker to redeliver some or all of the pending messages +that were pushed to that particular consumer and not yet acknowledged. + +The protobuf object accepts a list of message ids that the consumer wants to +be redelivered. If the list is empty, the broker will redeliver all the +pending messages. + +On redelivery, messages can be sent to the same consumer or, in the case of a +shared subscription, spread across all available consumers. + + +##### Command ReachedEndOfTopic + +This is sent by a broker to a particular consumer, whenever the topic +has been "terminated" and all the messages on the subscription were +acknowledged. + +The client should use this command to notify the application that no more +messages are coming from the consumer. + +##### Command ConsumerStats + +This command is sent by the client to retrieve Subscriber and Consumer level +stats from the broker. +Parameters: + * `request_id` → Id of the request, used to correlate the request + and the response. + * `consumer_id` → Id of an already established consumer. + +##### Command ConsumerStatsResponse + +This is the broker's response to ConsumerStats request by the client. +It contains the Subscriber and Consumer level stats of the `consumer_id` sent in the request. +If the `error_code` or the `error_message` field is set it indicates that the request has failed. + +##### Command Unsubscribe + +This command is sent by the client to unsubscribe the `consumer_id` from the associated topic. +Parameters: + * `request_id` → Id of the request. + * `consumer_id` → Id of an already established consumer which needs to unsubscribe. + + +## Service discovery + +### Topic lookup + +Topic lookup needs to be performed each time a client needs to create or +reconnect a producer or a consumer. 
Lookup is used to discover which particular +broker is serving the topic we are about to use. + +Lookup can be done with a REST call as described in the [admin API](admin-api-topics.md#lookup-of-topic) +docs. + +Since Pulsar-1.16 it is also possible to perform the lookup within the binary +protocol. + +For the sake of example, let's assume we have a service discovery component +running at `pulsar://broker.example.com:6650`. + +Individual brokers will be running at `pulsar://broker-1.example.com:6650`, +`pulsar://broker-2.example.com:6650`, ... + +A client can use a connection to the discovery service host to issue a +`LookupTopic` command. The response can either be a broker hostname to +connect to, or a broker hostname against which to retry the lookup. + +The `LookupTopic` command has to be used in a connection that has already +gone through the `Connect` / `Connected` initial handshake. + +![Topic lookup](/assets/binary-protocol-topic-lookup.png) + +```protobuf + +message CommandLookupTopic { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1, + "authoritative" : false +} + +``` + +Fields: + * `topic` → Topic name to look up + * `request_id` → Id of the request that will be passed with its response + * `authoritative` → Initial lookup request should use false. 
When following a + redirect response, the client should pass the same value contained in the + response. + +##### LookupTopicResponse + +Example of response with successful lookup: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Connect", + "brokerServiceUrl" : "pulsar://broker-1.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-1.example.com:6651", + "authoritative" : true +} + +``` + +Example of lookup response with redirection: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Redirect", + "brokerServiceUrl" : "pulsar://broker-2.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-2.example.com:6651", + "authoritative" : true +} + +``` + +In this second case, we need to reissue the `LookupTopic` command request +to `broker-2.example.com` and this broker will be able to give a definitive +answer to the lookup request. + +### Partitioned topics discovery + +Partitioned topics metadata discovery is used to find out if a topic is a +"partitioned topic" and how many partitions were set up. + +If the topic is marked as "partitioned", the client is expected to create +multiple producers or consumers, one for each partition, using the `partition-X` +suffix. + +This information only needs to be retrieved the first time a producer or +consumer is created. There is no need to do this after reconnections. + +The discovery of partitioned topics metadata works very similarly to the topic +lookup. The client sends a request to the service discovery address and the +response will contain actual metadata. 
+ +##### Command PartitionedTopicMetadata + +```protobuf + +message CommandPartitionedTopicMetadata { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1 +} + +``` + +Fields: + * `topic` → the topic for which to check the partitions metadata + * `request_id` → Id of the request that will be passed with its response + + +##### Command PartitionedTopicMetadataResponse + +Example of response with metadata: + +```protobuf + +message CommandPartitionedTopicMetadataResponse { + "request_id" : 1, + "response" : "Success", + "partitions" : 32 +} + +``` + +## Protobuf interface + +All Pulsar's Protobuf definitions can be found {@inject: github:here:/pulsar-common/src/main/proto/PulsarApi.proto}. diff --git a/site2/website/versioned_docs/version-2.8.x/develop-load-manager.md b/site2/website/versioned_docs/version-2.8.x/develop-load-manager.md new file mode 100644 index 0000000000000..509209b6a852d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/develop-load-manager.md @@ -0,0 +1,227 @@ +--- +id: develop-load-manager +title: Modular load manager +sidebar_label: "Modular load manager" +original_id: develop-load-manager +--- + +The *modular load manager*, implemented in [`ModularLoadManagerImpl`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/ModularLoadManagerImpl.java), is a flexible alternative to the previously implemented load manager, [`SimpleLoadManagerImpl`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/SimpleLoadManagerImpl.java), which attempts to simplify how load is managed while also providing abstractions so that complex load management strategies may be implemented. + +## Usage + +There are two ways that you can enable the modular load manager: + +1. 
Change the value of the `loadManagerClassName` parameter in `conf/broker.conf` from `org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl` to `org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl`. +2. Use the `pulsar-admin` tool. Here's an example: + + ```shell + + $ pulsar-admin update-dynamic-config \ + --config loadManagerClassName \ + --value org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl + + ``` + + You can use the same method to change back to the original value. In either case, any mistake in specifying the load manager will cause Pulsar to default to `SimpleLoadManagerImpl`. + +## Verification + +There are a few different ways to determine which load manager is being used: + +1. Use `pulsar-admin` to examine the `loadManagerClassName` element: + + ```shell + + $ bin/pulsar-admin brokers get-all-dynamic-config + { + "loadManagerClassName" : "org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl" + } + + ``` + + If there is no `loadManagerClassName` element, then the default load manager is used. + +2. Consult a ZooKeeper load report. With the modular load manager, the load report in `/loadbalance/brokers/...` will have many differences; for example, the `systemResourceUsage` sub-elements (`bandwidthIn`, `bandwidthOut`, etc.) are now all at the top level. 
Here is an example load report from the module load manager: + + ```json + + { + "bandwidthIn": { + "limit": 10240000.0, + "usage": 4.256510416666667 + }, + "bandwidthOut": { + "limit": 10240000.0, + "usage": 5.287239583333333 + }, + "bundles": [], + "cpu": { + "limit": 2400.0, + "usage": 5.7353247655435915 + }, + "directMemory": { + "limit": 16384.0, + "usage": 1.0 + } + } + + ``` + + With the simple load manager, the load report in `/loadbalance/brokers/...` will look like this: + + ```json + + { + "systemResourceUsage": { + "bandwidthIn": { + "limit": 10240000.0, + "usage": 0.0 + }, + "bandwidthOut": { + "limit": 10240000.0, + "usage": 0.0 + }, + "cpu": { + "limit": 2400.0, + "usage": 0.0 + }, + "directMemory": { + "limit": 16384.0, + "usage": 1.0 + }, + "memory": { + "limit": 8192.0, + "usage": 3903.0 + } + } + } + + ``` + +3. The command-line [broker monitor](reference-cli-tools.md#monitor-brokers) will have a different output format depending on which load manager implementation is being used. 
+ + Here is an example from the modular load manager: + + ``` + + =================================================================================================================== + ||SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.00 |48.33 |0.01 |0.00 |0.00 |48.33 || + ||COUNT |TOPIC |BUNDLE |PRODUCER |CONSUMER |BUNDLE + |BUNDLE - || + || |4 |4 |0 |2 |4 |0 || + ||LATEST |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + ||SHORT |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + ||LONG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + =================================================================================================================== + + ``` + + Here is an example from the simple load manager: + + ``` + + =================================================================================================================== + ||COUNT |TOPIC |BUNDLE |PRODUCER |CONSUMER |BUNDLE + |BUNDLE - || + || |4 |4 |0 |2 |0 |0 || + ||RAW SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.25 |47.94 |0.01 |0.00 |0.00 |47.94 || + ||ALLOC SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.20 |1.89 | |1.27 |3.21 |3.21 || + ||RAW MSG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.01 |0.01 |0.01 || + ||ALLOC MSG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |54.84 |134.48 |189.31 |126.54 |320.96 |447.50 || + =================================================================================================================== + + ``` + +It is important to note that the module load manager is _centralized_, meaning that all requests to assign a bundle---whether it's been seen before or whether this is the first time---only get handled by the _lead_ broker (which can change over time). 
To determine the current lead broker, examine the `/loadbalance/leader` node in ZooKeeper. + +## Implementation + +### Data + +The data monitored by the modular load manager is contained in the [`LoadData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/LoadData.java) class. +Here, the available data is subdivided into the bundle data and the broker data. + +#### Broker + +The broker data is contained in the [`BrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BrokerData.java) class. It is further subdivided into two parts, +one being the local data which every broker individually writes to ZooKeeper, and the other being the historical broker +data which is written to ZooKeeper by the leader broker. + +##### Local Broker Data +The local broker data is contained in the class [`LocalBrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/java/org/apache/pulsar/policies/data/loadbalancer/LocalBrokerData.java) and provides information about the following resources: + +* CPU usage +* JVM heap memory usage +* Direct memory usage +* Bandwidth in/out usage +* Most recent total message rate in/out across all bundles +* Total number of topics, bundles, producers, and consumers +* Names of all bundles assigned to this broker +* Most recent changes in bundle assignments for this broker + +The local broker data is updated periodically according to the service configuration +"loadBalancerReportUpdateMaxIntervalMinutes". 
After any broker updates their local broker data, the leader broker will +receive the update immediately via a ZooKeeper watch, where the local data is read from the ZooKeeper node +`/loadbalance/brokers/` + +##### Historical Broker Data + +The historical broker data is contained in the [`TimeAverageBrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/TimeAverageBrokerData.java) class. + +In order to reconcile the need to make good decisions in a steady-state scenario and make reactive decisions in a critical scenario, the historical data is split into two parts: the short-term data for reactive decisions, and the long-term data for steady-state decisions. Both time frames maintain the following information: + +* Message rate in/out for the entire broker +* Message throughput in/out for the entire broker + +Unlike the bundle data, the broker data does not maintain samples for the global broker message rates and throughputs, which is not expected to remain steady as new bundles are removed or added. Instead, this data is aggregated over the short-term and long-term data for the bundles. See the section on bundle data to understand how that data is collected and maintained. + +The historical broker data is updated for each broker in memory by the leader broker whenever any broker writes their local data to ZooKeeper. Then, the historical data is written to ZooKeeper by the leader broker periodically according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`. + +##### Bundle Data + +The bundle data is contained in the [`BundleData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BundleData.java). Like the historical broker data, the bundle data is split into a short-term and a long-term time frame. 
The information maintained in each time frame: + +* Message rate in/out for this bundle +* Message Throughput In/Out for this bundle +* Current number of samples for this bundle + +The time frames are implemented by maintaining the average of these values over a set, limited number of samples, where +the samples are obtained through the message rate and throughput values in the local data. Thus, if the update interval +for the local data is 2 minutes, the number of short samples is 10 and the number of long samples is 1000, the +short-term data is maintained over a period of `10 samples * 2 minutes / sample = 20 minutes`, while the long-term +data is similarly over a period of 2000 minutes. Whenever there are not enough samples to satisfy a given time frame, +the average is taken only over the existing samples. When no samples are available, default values are assumed until +they are overwritten by the first sample. Currently, the default values are + +* Message rate in/out: 50 messages per second both ways +* Message throughput in/out: 50KB per second both ways + +The bundle data is updated in memory on the leader broker whenever any broker writes their local data to ZooKeeper. +Then, the bundle data is written to ZooKeeper by the leader broker periodically at the same time as the historical +broker data, according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`. + +### Traffic Distribution + +The modular load manager uses the abstraction provided by [`ModularLoadManagerStrategy`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/ModularLoadManagerStrategy.java) to make decisions about bundle assignment. The strategy makes a decision by considering the service configuration, the entire load data, and the bundle data for the bundle to be assigned. 
Currently, the only supported strategy is [`LeastLongTermMessageRate`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LeastLongTermMessageRate.java), though soon users will have the ability to inject their own strategies if desired. + +#### Least Long Term Message Rate Strategy + +As its name suggests, the least long term message rate strategy attempts to distribute bundles across brokers so that +the message rate in the long-term time window for each broker is roughly the same. However, simply balancing load based +on message rate does not handle the issue of asymmetric resource burden per message on each broker. Thus, the system +resource usages, which are CPU, memory, direct memory, bandwidth in, and bandwidth out, are also considered in the +assignment process. This is done by weighting the final message rate according to +`1 / (overload_threshold - max_usage)`, where `overload_threshold` corresponds to the configuration +`loadBalancerBrokerOverloadedThresholdPercentage` and `max_usage` is the maximum proportion among the system resources +that is being utilized by the candidate broker. This multiplier ensures that machines that are being more heavily taxed +by the same message rates will receive less load. In particular, it tries to ensure that if one machine is overloaded, +then all machines are approximately overloaded. In the case in which a broker's max usage exceeds the overload +threshold, that broker is not considered for bundle assignment. If all brokers are overloaded, the bundle is randomly +assigned. 
+ diff --git a/site2/website/versioned_docs/version-2.8.x/develop-schema.md b/site2/website/versioned_docs/version-2.8.x/develop-schema.md new file mode 100644 index 0000000000000..2d4461a5ea2b5 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/develop-schema.md @@ -0,0 +1,62 @@ +--- +id: develop-schema +title: Custom schema storage +sidebar_label: "Custom schema storage" +original_id: develop-schema +--- + +By default, Pulsar stores data type [schemas](concepts-schema-registry.md) in [Apache BookKeeper](https://bookkeeper.apache.org) (which is deployed alongside Pulsar). You can, however, use another storage system if you wish. This doc walks you through creating your own schema storage implementation. + +In order to use a non-default (i.e. non-BookKeeper) storage system for Pulsar schemas, you need to implement two Java interfaces: [`SchemaStorage`](#schemastorage-interface) and [`SchemaStorageFactory`](#schemastoragefactory-interface). + +## SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java + +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} + +``` + +> For a full-fledged example schema storage implementation, see the [`BookKeeperSchemaStorage`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. 
+## SchemaStorageFactory interface + +```java + +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} + +``` + +> For a full-fledged example schema storage factory implementation, see the [`BookKeeperSchemaStorageFactory`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +## Deployment + +In order to use your custom schema storage implementation, you'll need to: + +1. Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. +1. Add that jar to the `lib` folder in your Pulsar [binary or source distribution](getting-started-standalone.md#installing-pulsar). +1. Change the `schemaRegistryStorageClassName` configuration in [`broker.conf`](reference-configuration.md#broker) to your custom factory class (i.e. the `SchemaStorageFactory` implementation, not the `SchemaStorage` implementation). +1. Start up Pulsar. diff --git a/site2/website/versioned_docs/version-2.8.x/develop-tools.md b/site2/website/versioned_docs/version-2.8.x/develop-tools.md new file mode 100644 index 0000000000000..b5457790b8081 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/develop-tools.md @@ -0,0 +1,111 @@ +--- +id: develop-tools +title: Simulation tools +sidebar_label: "Simulation tools" +original_id: develop-tools +--- + +It is sometimes necessary to create a test environment and incur artificial load to observe how well load managers +handle the load. The load simulation controller, the load simulation client, and the broker monitor were created in an +effort to make it easier to create this load and observe its effects on the managers. + +## Simulation Client +The simulation client is a machine which will create and subscribe to topics with configurable message rates and sizes. 
+Because it is sometimes necessary in simulating large load to use multiple client machines, the user does not interact +with the simulation client directly, but instead delegates their requests to the simulation controller, which will then +send signals to clients to start incurring load. The client implementation is in the class +`org.apache.pulsar.testclient.LoadSimulationClient`. + +### Usage +To start a simulation client, use the `pulsar-perf` script with the command `simulation-client` as follows: + +``` + +pulsar-perf simulation-client --port --service-url + +``` + +The client will then be ready to receive controller commands. +## Simulation Controller +The simulation controller sends signals to the simulation clients, requesting them to create new topics, stop old +topics, change the load incurred by topics, as well as several other tasks. It is implemented in the class +`org.apache.pulsar.testclient.LoadSimulationController` and presents a shell to the user as an interface to send +commands with. + +### Usage +To start a simulation controller, use the `pulsar-perf` script with the command `simulation-controller` as follows: + +``` + +pulsar-perf simulation-controller --cluster --client-port +--clients + +``` + +The clients should already be started before the controller is started. You will then be presented with a simple prompt, +where you can issue commands to simulation clients. Arguments often refer to tenant names, namespace names, and topic +names. In all cases, the BASE name of the tenants, namespaces, and topics are used. For example, for the topic +`persistent://my_tenant/my_cluster/my_namespace/my_topic`, the tenant name is `my_tenant`, the namespace name is +`my_namespace`, and the topic name is `my_topic`.
The controller can perform the following actions: + +* Create a topic with a producer and a consumer + * `trade [--rate ] + [--rand-rate ,] + [--size ]` +* Create a group of topics with a producer and a consumer + * `trade_group [--rate ] + [--rand-rate ,] + [--separation ] [--size ] + [--topics-per-namespace ]` +* Change the configuration of an existing topic + * `change [--rate ] + [--rand-rate ,] + [--size ]` +* Change the configuration of a group of topics + * `change_group [--rate ] [--rand-rate ,] + [--size ] [--topics-per-namespace ]` +* Shutdown a previously created topic + * `stop ` +* Shutdown a previously created group of topics + * `stop_group ` +* Copy the historical data from one ZooKeeper to another and simulate based on the message rates and sizes in that history + * `copy [--rate-multiplier value]` +* Simulate the load of the historical data on the current ZooKeeper (should be same ZooKeeper being simulated on) + * `simulate [--rate-multiplier value]` +* Stream the latest data from the given active ZooKeeper to simulate the real-time load of that ZooKeeper. + * `stream [--rate-multiplier value]` + +The "group" arguments in these commands allow the user to create or affect multiple topics at once. Groups are created +when calling the `trade_group` command, and all topics from these groups may be subsequently modified or stopped +with the `change_group` and `stop_group` commands respectively. All ZooKeeper arguments are of the form +`zookeeper_host:port`. + +### Difference Between Copy, Simulate, and Stream +The commands `copy`, `simulate`, and `stream` are very similar but have significant differences. `copy` is used when +you want to simulate the load of a static, external ZooKeeper on the ZooKeeper you are simulating on. 
Thus, +`source zookeeper` should be the ZooKeeper you want to copy and `target zookeeper` should be the ZooKeeper you are +simulating on, and then it will get the full benefit of the historical data of the source in both load manager +implementations. `simulate` on the other hand takes in only one ZooKeeper, the one you are simulating on. It assumes +that you are simulating on a ZooKeeper that has historical data for `SimpleLoadManagerImpl` and creates equivalent +historical data for `ModularLoadManagerImpl`. Then, the load according to the historical data is simulated by the +clients. Finally, `stream` takes in an active ZooKeeper different than the ZooKeeper being simulated on and streams +load data from it and simulates the real-time load. In all cases, the optional `rate-multiplier` argument allows the +user to simulate some proportion of the load. For instance, using `--rate-multiplier 0.05` will cause messages to +be sent at only `5%` of the rate of the load that is being simulated. + +## Broker Monitor +To observe the behavior of the load manager in these simulations, one may utilize the broker monitor, which is +implemented in `org.apache.pulsar.testclient.BrokerMonitor`. The broker monitor will print tabular load data to the +console as it is updated using watchers. + +### Usage +To start a broker monitor, use the `monitor-brokers` command in the `pulsar-perf` script: + +``` + +pulsar-perf monitor-brokers --connect-string + +``` + +The console will then continuously print load data until it is interrupted. 
+ diff --git a/site2/website/versioned_docs/version-2.8.x/developing-binary-protocol.md b/site2/website/versioned_docs/version-2.8.x/developing-binary-protocol.md new file mode 100644 index 0000000000000..9084b006b9bed --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/developing-binary-protocol.md @@ -0,0 +1,616 @@ +--- +id: developing-binary-protocol +title: Pulsar binary protocol specification +sidebar_label: "Binary protocol" +original_id: developing-binary-protocol +--- + +Pulsar uses a custom binary protocol for communications between producers/consumers and brokers. This protocol is designed to support required features, such as acknowledgements and flow control, while ensuring maximum transport and implementation efficiency. + +Clients and brokers exchange *commands* with each other. Commands are formatted as binary [protocol buffer](https://developers.google.com/protocol-buffers/) (aka *protobuf*) messages. The format of protobuf commands is specified in the [`PulsarApi.proto`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto) file and also documented in the [Protobuf interface](#protobuf-interface) section below. + +> ### Connection sharing +> Commands for different producers and consumers can be interleaved and sent through the same connection without restriction. + +All commands associated with Pulsar's protocol are contained in a [`BaseCommand`](#pulsar.proto.BaseCommand) protobuf message that includes a [`Type`](#pulsar.proto.Type) [enum](https://developers.google.com/protocol-buffers/docs/proto#enum) with all possible subcommands as optional fields. `BaseCommand` messages can specify only one subcommand. + +## Framing + +Since protobuf doesn't provide any sort of message frame, all messages in the Pulsar protocol are prepended with a 4-byte field that specifies the size of the frame. The maximum allowable size of a single frame is 5 MB. + +The Pulsar protocol allows for two types of commands: + +1. 
**Simple commands** that do not carry a message payload. +2. **Payload commands** that bear a payload that is used when publishing or delivering messages. In payload commands, the protobuf command data is followed by protobuf [metadata](#message-metadata) and then the payload, which is passed in raw format outside of protobuf. All sizes are passed as 4-byte unsigned big endian integers. + +> Message payloads are passed in raw format rather than protobuf format for efficiency reasons. + +### Simple commands + +Simple (payload-free) commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:------------|:----------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | + +### Payload commands + +Payload commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:-------------|:--------------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | +| magicNumber | A 2-byte byte array (`0x0e01`) identifying the current format | 2 | +| checksum | A [CRC32-C checksum](http://www.evanjones.ca/crc32c.html) of everything that comes after it | 4 | +| metadataSize | The size of the message [metadata](#message-metadata) | 4 | +| metadata | The message [metadata](#message-metadata) stored as a binary protobuf message | | +| payload | Anything left in the frame is considered the payload and can include any sequence of 
bytes | | + +## Message metadata + +Message metadata is stored alongside the application-specified payload as a serialized protobuf message. Metadata is created by the producer and passed on unchanged to the consumer. + +| Field | Description | +|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `producer_name` | The name of the producer that published the message | +| `sequence_id` | The sequence ID of the message, assigned by producer | +| `publish_time` | The publish timestamp in Unix time (i.e. as the number of milliseconds since January 1st, 1970 in UTC) | +| `properties` | A sequence of key/value pairs (using the [`KeyValue`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto#L32) message). These are application-defined keys and values with no special meaning to Pulsar. 
| +| `replicated_from` *(optional)* | Indicates that the message has been replicated and specifies the name of the [cluster](reference-terminology.md#cluster) where the message was originally published | +| `partition_key` *(optional)* | While publishing on a partition topic, if the key is present, the hash of the key is used to determine which partition to choose | +| `compression` *(optional)* | Signals that payload has been compressed and with which compression library | +| `uncompressed_size` *(optional)* | If compression is used, the producer must fill the uncompressed size field with the original payload size | +| `num_messages_in_batch` *(optional)* | If this message is really a [batch](#batch-messages) of multiple entries, this field must be set to the number of messages in the batch | + +### Batch messages + +When using batch messages, the payload contains a list of entries, +each of them with its individual metadata, defined by the `SingleMessageMetadata` +object. + + +For a single batch, the payload format will look like this: + + +| Field | Description | +|:--------------|:------------------------------------------------------------| +| metadataSizeN | The size of the single message metadata serialized Protobuf | +| metadataN | Single message metadata | +| payloadN | Message payload passed by application | + +Each metadata field looks like this: + +| Field | Description | +|:---------------------------|:--------------------------------------------------------| +| properties | Application-defined properties | +| partition key *(optional)* | Key to indicate the hashing to a particular partition | +| payload_size | Size of the payload for the single message in the batch | + +When compression is enabled, the whole batch will be compressed at once. + +## Interactions + +### Connection establishment + +After opening a TCP connection to a broker, typically on port 6650, the client +is responsible for initiating the session.
+ +![Connect interaction](/assets/binary-protocol-connect.png) + +After receiving a `Connected` response from the broker, the client can +consider the connection ready to use. Alternatively, if the broker doesn't +validate the client authentication, it will reply with an `Error` command and +close the TCP connection. + +Example: + +```protobuf + +message CommandConnect { + "client_version" : "Pulsar-Client-Java-v1.15.2", + "auth_method_name" : "my-authentication-plugin", + "auth_data" : "my-auth-data", + "protocol_version" : 6 +} + +``` + +Fields: + * `client_version` → String based identifier. Format is not enforced + * `auth_method_name` → *(optional)* Name of the authentication plugin if auth + enabled + * `auth_data` → *(optional)* Plugin specific authentication data + * `protocol_version` → Indicates the protocol version supported by the + client. Broker will not send commands introduced in newer revisions of the + protocol. Broker might be enforcing a minimum version + +```protobuf + +message CommandConnected { + "server_version" : "Pulsar-Broker-v1.15.2", + "protocol_version" : 6 +} + +``` + +Fields: + * `server_version` → String identifier of broker version + * `protocol_version` → Protocol version supported by the broker. Client + must not attempt to send commands introduced in newer revisions of the + protocol + +### Keep Alive + +To identify prolonged network partitions between clients and brokers or cases +in which a machine crashes without interrupting the TCP connection on the remote +end (eg: power outage, kernel panic, hard reboot...), we have introduced a +mechanism to probe for the availability status of the remote peer. + +Both clients and brokers are sending `Ping` commands periodically and they will +close the socket if a `Pong` response is not received within a timeout (default +used by broker is 60s). 
+ +A valid implementation of a Pulsar client is not required to send the `Ping` +probe, though it is required to promptly reply after receiving one from the +broker in order to prevent the remote side from forcibly closing the TCP connection. + + +### Producer + +In order to send messages, a client needs to establish a producer. When creating +a producer, the broker will first verify that this particular client is +authorized to publish on the topic. + +Once the client gets confirmation of the producer creation, it can publish +messages to the broker, referring to the producer id negotiated before. + +![Producer interaction](/assets/binary-protocol-producer.png) + +##### Command Producer + +```protobuf + +message CommandProducer { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "producer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete topic name to where you want to create the producer on + * `producer_id` → Client generated producer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `producer_name` → *(optional)* If a producer name is specified, the name will + be used, otherwise the broker will generate a unique name. Generated + producer name is guaranteed to be globally unique. Implementations are + expected to let the broker generate a new producer name when the producer + is initially created, then reuse it when recreating the producer after + reconnections. + +The broker will reply with either `ProducerSuccess` or `Error` commands. 
+ +##### Command ProducerSuccess + +```protobuf + +message CommandProducerSuccess { + "request_id" : 1, + "producer_name" : "generated-unique-producer-name" +} + +``` + +Parameters: + * `request_id` → Original id of the `CreateProducer` request + * `producer_name` → Generated globally unique producer name or the name + specified by the client, if any. + +##### Command Send + +Command `Send` is used to publish a new message within the context of an +already existing producer. This command is used in a frame that includes command +as well as message payload, for which the complete format is specified in the [payload commands](#payload-commands) section. + +```protobuf + +message CommandSend { + "producer_id" : 1, + "sequence_id" : 0, + "num_messages" : 1 +} + +``` + +Parameters: + * `producer_id` → id of an existing producer + * `sequence_id` → each message has an associated sequence id which is expected + to be implemented with a counter starting at 0. The `SendReceipt` that + acknowledges the effective publishing of messages will refer to it by + its sequence id. + * `num_messages` → *(optional)* Used when publishing a batch of messages at + once. + +##### Command SendReceipt + +After a message has been persisted on the configured number of replicas, the +broker will send the acknowledgment receipt to the producer. + +```protobuf + +message CommandSendReceipt { + "producer_id" : 1, + "sequence_id" : 0, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `producer_id` → id of producer originating the send request + * `sequence_id` → sequence id of the published message + * `message_id` → message id assigned by the system to the published message + Unique within a single cluster. Message id is composed of 2 longs, `ledgerId` + and `entryId`, that reflect that this unique id is assigned when appending + to a BookKeeper ledger + + +##### Command CloseProducer + +**Note**: *This command can be sent by either producer or broker*. 
+ +When receiving a `CloseProducer` command, the broker will stop accepting any +more messages for the producer, wait until all pending messages are persisted +and then reply `Success` to the client. + +The broker can send a `CloseProducer` command to client when it's performing +a graceful failover (eg: broker is being restarted, or the topic is being unloaded +by load balancer to be transferred to a different broker). + +When receiving the `CloseProducer`, the client is expected to go through the +service discovery lookup again and recreate the producer again. The TCP +connection is not affected. + +### Consumer + +A consumer is used to attach to a subscription and consume messages from it. +After every reconnection, a client needs to subscribe to the topic. If a +subscription is not already there, a new one will be created. + +![Consumer](/assets/binary-protocol-consumer.png) + +:::note + +In 2.8.4 and later versions, if the client does not receive a response indicating the success or failure of consumer creation, it first sends a command to close the original consumer before sending a command to re-attempt consumer creation. + +::: + +#### Flow control + +After the consumer is ready, the client needs to *give permission* to the +broker to push messages. This is done with the `Flow` command. + +A `Flow` command gives additional *permits* to send messages to the consumer. +A typical consumer implementation will use a queue to accumulate these messages +before the application is ready to consume them. + +After the application has dequeued half of the messages in the queue, the consumer +sends permits to the broker to ask for more messages (equals to half of the messages in the queue). + +For example, if the queue size is 1000 and the consumer consumes 500 messages in the queue. +Then the consumer sends permits to the broker to ask for 500 messages. 
+ +##### Command Subscribe + +```protobuf + +message CommandSubscribe { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "subscription" : "my-subscription-name", + "subType" : "Exclusive", + "consumer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete topic name to where you want to create the consumer on + * `subscription` → Subscription name + * `subType` → Subscription type: Exclusive, Shared, Failover, Key_Shared + * `consumer_id` → Client generated consumer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `consumer_name` → *(optional)* Clients can specify a consumer name. This + name can be used to track a particular consumer in the stats. Also, in + Failover subscription type, the name is used to decide which consumer is + elected as *master* (the one receiving messages): consumers are sorted by + their consumer name and the first one is elected master. + +##### Command Flow + +```protobuf + +message CommandFlow { + "consumer_id" : 1, + "messagePermits" : 1000 +} + +``` + +Parameters: +* `consumer_id` → Id of an already established consumer +* `messagePermits` → Number of additional permits to grant to the broker for + pushing more messages + +##### Command Message + +Command `Message` is used by the broker to push messages to an existing consumer, +within the limits of the given permits. + + +This command is used in a frame that includes the message payload as well, for +which the complete format is specified in the [payload commands](#payload-commands) +section. 
+ +```protobuf + +message CommandMessage { + "consumer_id" : 1, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +##### Command Ack + +An `Ack` is used to signal to the broker that a given message has been +successfully processed by the application and can be discarded by the broker. + +In addition, the broker will also maintain the consumer position based on the +acknowledged messages. + +```protobuf + +message CommandAck { + "consumer_id" : 1, + "ack_type" : "Individual", + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `consumer_id` → Id of an already established consumer + * `ack_type` → Type of acknowledgment: `Individual` or `Cumulative` + * `message_id` → Id of the message to acknowledge + * `validation_error` → *(optional)* Indicates that the consumer has discarded + the messages due to: `UncompressedSizeCorruption`, + `DecompressionError`, `ChecksumMismatch`, `BatchDeSerializeError` + * `properties` → *(optional)* Reserved configuration items + * `txnid_most_bits` → *(optional)* Same as Transaction Coordinator ID, `txnid_most_bits` and `txnid_least_bits` + uniquely identify a transaction. + * `txnid_least_bits` → *(optional)* The ID of the transaction opened in a transaction coordinator, + `txnid_most_bits` and `txnid_least_bits` uniquely identify a transaction. + * `request_id` → *(optional)* ID for handling response and timeout. + + + ##### Command AckResponse + +An `AckResponse` is the broker’s response to an acknowledge request sent by the client. It contains the `consumer_id` sent in the request. +If a transaction is used, it contains both the Transaction ID and the Request ID that are sent in the request. The client finishes the specific request according to the Request ID. If the `error` field is set, it indicates that the request has failed.
+ +An example of `AckResponse` with a transaction: + +```protobuf + +message CommandAckResponse { + "consumer_id" : 1, + "txnid_least_bits" : 0, + "txnid_most_bits" : 1, + "request_id" : 5 +} + +``` + +##### Command CloseConsumer + +This command behaves the same as [`CloseProducer`](#command-closeproducer) and can be sent by either consumer or broker. + +:::note + +In 2.8.4 and later versions, if the client does not receive a response to a `Subscribe` command within a timeout, the client must first send a `CloseConsumer` command before sending another `Subscribe` command. The client does not need to await a response to the `CloseConsumer` command before sending the next `Subscribe` command. + +::: + +##### Command RedeliverUnacknowledgedMessages + +A consumer can ask the broker to redeliver some or all of the pending messages +that were pushed to that particular consumer and not yet acknowledged. + +The protobuf object accepts a list of message ids that the consumer wants to +be redelivered. If the list is empty, the broker will redeliver all the +pending messages. + +On redelivery, messages can be sent to the same consumer or, in the case of a +shared subscription, spread across all available consumers. + + +##### Command ReachedEndOfTopic + +This is sent by a broker to a particular consumer, whenever the topic +has been "terminated" and all the messages on the subscription were +acknowledged. + +The client should use this command to notify the application that no more +messages are coming from the consumer. + +##### Command ConsumerStats + +This command is sent by the client to retrieve Subscriber and Consumer level +stats from the broker. +Parameters: + * `request_id` → Id of the request, used to correlate the request + and the response. + * `consumer_id` → Id of an already established consumer. + +##### Command ConsumerStatsResponse + +This is the broker's response to the `ConsumerStats` request sent by the client.
+It contains the Subscriber and Consumer level stats of the `consumer_id` sent in the request. +If the `error_code` or the `error_message` field is set, it indicates that the request has failed. + +##### Command Unsubscribe + +This command is sent by the client to unsubscribe the `consumer_id` from the associated topic. +Parameters: + * `request_id` → Id of the request. + * `consumer_id` → Id of an already established consumer which needs to unsubscribe. + + +## Service discovery + +### Topic lookup + +Topic lookup needs to be performed each time a client needs to create or +reconnect a producer or a consumer. Lookup is used to discover which particular +broker is serving the topic we are about to use. + +Lookup can be done with a REST call as described in the [admin API](admin-api-topics.md#look-up-topics-owner-broker) +docs. + +Since Pulsar-1.16 it is also possible to perform the lookup within the binary +protocol. + +For the sake of example, let's assume we have a service discovery component +running at `pulsar://broker.example.com:6650` + +Individual brokers will be running at `pulsar://broker-1.example.com:6650`, +`pulsar://broker-2.example.com:6650`, ... + +A client can use a connection to the discovery service host to issue a +`LookupTopic` command. The response can either be a broker hostname to +connect to, or a broker hostname to which to retry the lookup. + +The `LookupTopic` command has to be used in a connection that has already +gone through the `Connect` / `Connected` initial handshake. + +![Topic lookup](/assets/binary-protocol-topic-lookup.png) + +```protobuf + +message CommandLookupTopic { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1, + "authoritative" : false +} + +``` + +Fields: + * `topic` → Topic name to look up + * `request_id` → Id of the request that will be passed with its response + * `authoritative` → Initial lookup request should use false.
When following a + redirect response, client should pass the same value contained in the + response + +##### LookupTopicResponse + +Example of response with successful lookup: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Connect", + "brokerServiceUrl" : "pulsar://broker-1.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-1.example.com:6651", + "authoritative" : true +} + +``` + +Example of lookup response with redirection: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Redirect", + "brokerServiceUrl" : "pulsar://broker-2.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-2.example.com:6651", + "authoritative" : true +} + +``` + +In this second case, we need to reissue the `LookupTopic` command request +to `broker-2.example.com` and this broker will be able to give a definitive +answer to the lookup request. + +### Partitioned topics discovery + +Partitioned topics metadata discovery is used to find out if a topic is a +"partitioned topic" and how many partitions were set up. + +If the topic is marked as "partitioned", the client is expected to create +multiple producers or consumers, one for each partition, using the `partition-X` +suffix. + +This information only needs to be retrieved the first time a producer or +consumer is created. There is no need to do this after reconnections. + +The discovery of partitioned topics metadata works very similarly to the topic +lookup. The client sends a request to the service discovery address and the +response will contain actual metadata.
+ +##### Command PartitionedTopicMetadata + +```protobuf + +message CommandPartitionedTopicMetadata { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1 +} + +``` + +Fields: + * `topic` → the topic for which to check the partitions metadata + * `request_id` → Id of the request that will be passed with its response + + +##### Command PartitionedTopicMetadataResponse + +Example of response with metadata: + +```protobuf + +message CommandPartitionedTopicMetadataResponse { + "request_id" : 1, + "response" : "Success", + "partitions" : 32 +} + +``` + +## Protobuf interface + +All Pulsar's Protobuf definitions can be found {@inject: github:here:/pulsar-common/src/main/proto/PulsarApi.proto}. diff --git a/site2/website/versioned_docs/version-2.8.x/functions-cli.md b/site2/website/versioned_docs/version-2.8.x/functions-cli.md new file mode 100644 index 0000000000000..dda6590c2d348 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-cli.md @@ -0,0 +1,198 @@ +--- +id: functions-cli +title: Pulsar Functions command line tool +sidebar_label: "Reference: CLI" +original_id: functions-cli +--- + +The following tables list Pulsar Functions command-line tools. You can learn Pulsar Functions modes, commands, and parameters. + +## localrun + +Run Pulsar Functions locally, rather than deploying it to the Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +broker-service-url | The URL for the Pulsar broker. | | +classname | The class name of a Pulsar Function.| | +client-auth-params | Client authentication parameter. | | +client-auth-plugin | Client authentication plugin using which function-process can connect to broker. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). 
| | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). Since 2.8.1 and later versions, it also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] where worker can download the package from.| | +hostname-verification-enabled | Enable hostname verification. | false +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service, only available in 2.8.1 and later versions)] where worker can download the package from. | | +instance-id-offset | Start the instanceIds from this offset. | 0 +log-topic | The topic to which the logs a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. 
| | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). Since 2.8.1 and later versions, it also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] where worker can download the package from. | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +tls-allow-insecure | Allow insecure tls connection. | false +tls-trust-cert-path | tls trust cert file path. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +use-tls | Use tls connection. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. 
| | + + +## create + +Create and deploy a Pulsar Function in cluster mode. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). Since 2.8.1 and later versions, it also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] where worker can download the package from.| | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service, only available in 2.8.1 and later versions)] where worker can download the package from. 
| | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). Since 2.8.1 and later versions, it also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] where worker can download the package from.| | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. 
Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## delete + +Delete a Pulsar Function that is running on a Pulsar cluster. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## update + +Update a Pulsar Function that has been deployed to a Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime). | | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). 
Since 2.8.1 and later versions, it also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] where worker can download the package from. | | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service, only available in 2.8.1 and later versions)] where worker can download the package from. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). Since 2.8.1 and later versions, it also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] where worker can download the package from.| | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). 
| | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +update-auth-data | Whether or not to update the auth data. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## get + +Fetch information about a Pulsar Function. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## restart + +Restart function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (restart all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## stop + +Stops function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function.
| | +instance-id | The function instanceId (stop all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## start + +Starts a stopped function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (start all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | diff --git a/site2/website/versioned_docs/version-2.8.x/functions-debug.md b/site2/website/versioned_docs/version-2.8.x/functions-debug.md new file mode 100644 index 0000000000000..e1d55ae0897aa --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-debug.md @@ -0,0 +1,533 @@ +--- +id: functions-debug +title: Debug Pulsar Functions +sidebar_label: "How-to: Debug" +original_id: functions-debug +--- + +You can use the following methods to debug Pulsar Functions: + +* [Captured stderr](functions-debug.md#captured-stderr) +* [Use unit test](functions-debug.md#use-unit-test) +* [Debug with localrun mode](functions-debug.md#debug-with-localrun-mode) +* [Use log topic](functions-debug.md#use-log-topic) +* [Use Functions CLI](functions-debug.md#use-functions-cli) + +## Captured stderr + +Function startup information and captured stderr output is written to `logs/functions/<tenant>/<namespace>/<function>/<function>-<instance>.log` + +This is useful for debugging why a function fails to start. + +## Use unit test + +A Pulsar Function is a function with inputs and outputs, so you can test a Pulsar Function in a similar way as you test any function.
+ +For example, if you have the following Pulsar Function: + +```java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +You can write a simple unit test to test Pulsar Function. + +:::tip + +Pulsar uses testng for testing. + +::: + +```java + +@Test +public void testJavaNativeExclamationFunction() { + JavaNativeExclamationFunction exclamation = new JavaNativeExclamationFunction(); + String output = exclamation.apply("foo"); + Assert.assertEquals(output, "foo!"); +} + +``` + +The following Pulsar Function implements the `org.apache.pulsar.functions.api.Function` interface. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +In this situation, you can write a unit test for this function as well. Remember to mock the `Context` parameter. The following is an example. + +:::tip + +Pulsar uses testng for testing. + +::: + +```java + +@Test +public void testExclamationFunction() { + ExclamationFunction exclamation = new ExclamationFunction(); + String output = exclamation.process("foo", mock(Context.class)); + Assert.assertEquals(output, "foo!"); +} + +``` + +## Debug with localrun mode +When you run a Pulsar Function in localrun mode, it launches an instance of the Function on your local machine as a thread. + +In this mode, a Pulsar Function consumes and produces actual data to a Pulsar cluster, and mirrors how the function actually runs in a Pulsar cluster. + +:::note + +Currently, debugging with localrun mode is only supported by Pulsar Functions written in Java. You need Pulsar version 2.4.0 or later to do the following. 
Even though localrun is available in versions earlier than Pulsar 2.4.0, you cannot debug with localrun mode programmatically or run Functions as threads. + +::: + +You can launch your function in the following manner. + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setName(functionName); +functionConfig.setInputs(Collections.singleton(sourceTopic)); +functionConfig.setClassName(ExclamationFunction.class.getName()); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setOutput(sinkTopic); + +LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); +localRunner.start(true); + +``` + +So you can debug functions using an IDE easily. Set breakpoints and manually step through a function to debug with real data. + +The following example illustrates how to programmatically launch a function in localrun mode. + +```java + +public class ExclamationFunction implements Function { + + @Override + public String process(String s, Context context) throws Exception { + return s + "!"; + } + +public static void main(String[] args) throws Exception { + FunctionConfig functionConfig = new FunctionConfig(); + functionConfig.setName("exclamation"); + functionConfig.setInputs(Collections.singleton("input")); + functionConfig.setClassName(ExclamationFunction.class.getName()); + functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); + functionConfig.setOutput("output"); + + LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); + localRunner.start(false); +} + +``` + +To use localrun mode programmatically, add the following dependency. + +```xml + + + org.apache.pulsar + pulsar-functions-local-runner + ${pulsar.version} + + +``` + +For complete code samples, see [here](https://github.com/jerrypeng/pulsar-functions-demos/tree/master/debugging). + +:::note + +Debugging with localrun mode for Pulsar Functions written in other languages will be supported soon. 
+ +::: + +## Use log topic + +In Pulsar Functions, you can generate log information defined in functions to a specified log topic. You can configure consumers to consume messages from a specified log topic to check the log information. + +![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) + +**Example** + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +As shown in the example above, you can get the logger via `context.getLogger()` and assign the logger to the `LOG` variable of `slf4j`, so you can define your desired log information in a function using the `LOG` variable. Meanwhile, you need to specify the topic to which the log information is produced. + +**Example** + +```bash + +$ bin/pulsar-admin functions create \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +## Use Functions CLI + +With [Pulsar Functions CLI](reference-pulsar-admin.md#functions), you can debug Pulsar Functions with the following subcommands: + +* `get` +* `status` +* `stats` +* `list` +* `trigger` + +:::tip + +For complete commands of **Pulsar Functions CLI**, see [here](reference-pulsar-admin.md#functions). + +::: + +### `get` + +Get information about a Pulsar Function. + +**Usage** + +```bash + +$ pulsar-admin functions get options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function.
+|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +:::tip + +`--fqfn` consists of `--name`, `--namespace` and `--tenant`, so you can specify either `--fqfn` or `--name`, `--namespace` and `--tenant`. + +::: + +**Example** + +You can specify `--fqfn` to get information about a Pulsar Function. + +```bash + +$ ./bin/pulsar-admin functions get public/default/ExclamationFunctio6 + +``` + +Optionally, you can specify `--name`, `--namespace` and `--tenant` to get information about a Pulsar Function. + +```bash + +$ ./bin/pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 + +``` + +As shown below, the `get` command shows input, output, runtime, and other information about the _ExclamationFunctio6_ function. + +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "ExclamationFunctio6", + "className": "org.example.test.ExclamationFunction", + "inputSpecs": { + "persistent://public/default/my-topic-1": { + "isRegexPattern": false + } + }, + "output": "persistent://public/default/test-1", + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "userConfig": {}, + "runtime": "JAVA", + "autoAck": true, + "parallelism": 1 +} + +``` + +### `status` + +Check the current status of a Pulsar Function. + +**Usage** + +```bash + +$ pulsar-admin functions status options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function
    If the `--instance-id` is not specified, it gets the IDs of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + +``` + +As shown below, the `status` command shows the number of instances, running instances, the instance running under the _ExclamationFunctio6_ function, received messages, successfully processed messages, system exceptions, the average latency and so on. + +```json + +{ + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 1, + "numSuccessfullyProcessed" : 1, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.8385, + "lastInvocationTime" : 1557734137987, + "workerId" : "c-standalone-fw-23ccc88ef29b-8080" + } + } ] +} + +``` + +### `stats` + +Get the current stats of a Pulsar Function. + +**Usage** + +```bash + +$ pulsar-admin functions stats options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function.
    If the `--instance-id` is not specified, it gets the IDs of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + +``` + +The output is shown as follows: + +```json + +{ + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "instances" : [ { + "instanceId" : 0, + "metrics" : { + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "userMetrics" : { } + } + } ] +} + +``` + +### `list` + +List all Pulsar Functions running under a specific tenant and namespace. + +**Usage** + +```bash + +$ pulsar-admin functions list options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions list \ + --tenant public \ + --namespace default + +``` + +As shown below, the `list` command returns three functions running under the _public_ tenant and the _default_ namespace. + +```text + +ExclamationFunctio1 +ExclamationFunctio2 +ExclamationFunctio3 + +``` + +### `trigger` + +Trigger a specified Pulsar Function with a supplied value. This command simulates the execution process of a Pulsar Function and verifies it. 
+ +**Usage** + +```bash + +$ pulsar-admin functions trigger options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. +|`--topic`|The topic name that a Pulsar Function consumes from. +|`--trigger-file`|The path to a file that contains the data to trigger a Pulsar Function. +|`--trigger-value`|The value to trigger a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + --topic persistent://public/default/my-topic-1 \ + --trigger-value "hello pulsar functions" + +``` + +As shown below, the `trigger` command returns the following result: + +```text + +This is my function! + +``` + +:::note + +You must specify the [entire topic name](getting-started-pulsar.md#topic-names) when using the `--topic` option. Otherwise, the following error occurs. + +```text + +Function in trigger function has unidentified topic +Reason: Function in trigger function has unidentified topic + +``` + +::: + diff --git a/site2/website/versioned_docs/version-2.8.x/functions-deploy.md b/site2/website/versioned_docs/version-2.8.x/functions-deploy.md new file mode 100644 index 0000000000000..d9496b3ed5b48 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-deploy.md @@ -0,0 +1,241 @@ +--- +id: functions-deploy +title: Deploy Pulsar Functions +sidebar_label: "How-to: Deploy" +original_id: functions-deploy +--- + +## Requirements + +To deploy and manage Pulsar Functions, you need to have a Pulsar cluster running. There are several options for this: + +* You can run a [standalone cluster](getting-started-standalone.md) locally on your own machine. 
+* You can deploy a Pulsar cluster on [Kubernetes](deploy-kubernetes.md), [Amazon Web Services](deploy-aws.md), [bare metal](deploy-bare-metal.md), [DC/OS](https://dcos.io/), and more. + +If you run a non-[standalone](reference-terminology.md#standalone) cluster, you need to obtain the service URL for the cluster. How you obtain the service URL depends on how you deploy your Pulsar cluster. + +If you want to deploy and trigger Python user-defined functions, you need to install [the pulsar python client](http://pulsar.apache.org/docs/en/client-libraries-python/) on all the machines running [functions workers](functions-worker.md). + +## Command-line interface + +Pulsar Functions are deployed and managed using the [`pulsar-admin functions`](reference-pulsar-admin.md#functions) interface, which contains commands such as [`create`](reference-pulsar-admin.md#functions-create) for deploying functions in [cluster mode](#cluster-mode), [`trigger`](reference-pulsar-admin.md#trigger) for [triggering](#triggering-pulsar-functions) functions, [`list`](reference-pulsar-admin.md#list-2) for listing deployed functions. + +To learn more commands, refer to [`pulsar-admin functions`](reference-pulsar-admin.md#functions). + +### Default arguments + +When managing Pulsar Functions, you need to specify a variety of information about functions, including tenant, namespace, input and output topics, and so on. However, some parameters have default values if you do not specify values for them. The following table lists the default values. + +Parameter | Default +:---------|:------- +Function name | You can specify any value for the class name (except org, library, or similar class names). For example, when you specify the flag `--classname org.example.MyFunction`, the function name is `MyFunction`. +Tenant | Derived from names of the input topics. 
If the input topics are under the `marketing` tenant, which means the topic names have the form `persistent://marketing/{namespace}/{topicName}`, the tenant is `marketing`. +Namespace | Derived from names of the input topics. If the input topics are under the `asia` namespace under the `marketing` tenant, which means the topic names have the form `persistent://marketing/asia/{topicName}`, then the namespace is `asia`. +Output topic | `{input topic}-{function name}-output`. For example, if an input topic name of a function is `incoming`, and the function name is `exclamation`, then the name of the output topic is `incoming-exclamation-output`. +Subscription type | For `at-least-once` and `at-most-once` [processing guarantees](functions-overview.md#processing-guarantees), the [`SHARED`](concepts-messaging.md#shared) mode is applied by default; for `effectively-once` guarantees, the [`FAILOVER`](concepts-messaging.md#failover) mode is applied. +Processing guarantees | [`ATLEAST_ONCE`](functions-overview.md#processing-guarantees) +Pulsar service URL | `pulsar://localhost:6650` + +### Example of default arguments + +Take the `create` command as an example. + +```bash + +$ bin/pulsar-admin functions create \ + --jar my-pulsar-functions.jar \ + --classname org.example.MyFunction \ + --inputs my-function-input-topic1,my-function-input-topic2 + +``` + +The function has default values for the function name (`MyFunction`), tenant (`public`), namespace (`default`), subscription type (`SHARED`), processing guarantees (`ATLEAST_ONCE`), and Pulsar service URL (`pulsar://localhost:6650`). + +## Local run mode + +If you run a Pulsar Function in **local run** mode, it runs on the machine from which you enter the commands (on your laptop, an [AWS EC2](https://aws.amazon.com/ec2/) instance, and so on). The following is a [`localrun`](reference-pulsar-admin.md#localrun) command example. 
+ +```bash + +$ bin/pulsar-admin functions localrun \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +By default, the function connects to a Pulsar cluster running on the same machine, via a local [broker](reference-terminology.md#broker) service URL of `pulsar://localhost:6650`. If you use local run mode to run a function but connect it to a non-local Pulsar cluster, you can specify a different broker URL using the `--broker-service-url` flag. The following is an example. + +```bash + +$ bin/pulsar-admin functions localrun \ + --broker-service-url pulsar://my-cluster-host:6650 \ + # Other function parameters + +``` + +## Cluster mode + +When you run a Pulsar Function in **cluster** mode, the function code is uploaded to a Pulsar broker and runs *alongside the broker* rather than in your [local environment](#local-run-mode). You can run a function in cluster mode using the [`create`](reference-pulsar-admin.md#create-1) command. + +```bash + +$ bin/pulsar-admin functions create \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +### Update functions in cluster mode + +You can use the [`update`](reference-pulsar-admin.md#update-1) command to update a Pulsar Function running in cluster mode. The following command updates the function created in the [cluster mode](#cluster-mode) section. + +```bash + +$ bin/pulsar-admin functions update \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/new-input-topic \ + --output persistent://public/default/new-output-topic + +``` + +### Parallelism + +Pulsar Functions run as processes or threads, which are called **instances**. When you run a Pulsar Function, it runs as a single instance by default.
With one localrun command, you can only run a single instance of a function. If you want to run multiple instances, you can run the localrun command multiple times. + +When you create a function, you can specify the *parallelism* of a function (the number of instances to run). You can set the parallelism factor using the `--parallelism` flag of the [`create`](reference-pulsar-admin.md#functions-create) command. + +```bash + +$ bin/pulsar-admin functions create \ + --parallelism 3 \ + # Other function info + +``` + +You can adjust the parallelism of an already created function using the [`update`](reference-pulsar-admin.md#update-1) interface. + +```bash + +$ bin/pulsar-admin functions update \ + --parallelism 5 \ + # Other function + +``` + +If you specify a function configuration via YAML, use the `parallelism` parameter. The following is a config file example. + +```yaml + +# function-config.yaml +parallelism: 3 +inputs: +- persistent://public/default/input-1 +output: persistent://public/default/output-1 +# other parameters + +``` + +The following is the corresponding update command. + +```bash + +$ bin/pulsar-admin functions update \ + --function-config-file function-config.yaml + +``` + +### Function instance resources + +When you run Pulsar Functions in [cluster mode](#cluster-mode), you can specify the resources that are assigned to each function [instance](#parallelism). + +Resource | Specified as | Runtimes +:--------|:----------------|:-------- +CPU | The number of cores | Kubernetes +RAM | The number of bytes | Process, Docker +Disk space | The number of bytes | Docker + +The following function creation command allocates 8 cores, 8 GB of RAM, and 10 GB of disk space to a function.
+ +```bash + +$ bin/pulsar-admin functions create \ + --jar target/my-functions.jar \ + --classname org.example.functions.MyFunction \ + --cpu 8 \ + --ram 8589934592 \ + --disk 10737418240 + +``` + +> #### Resources are *per instance* +> The resources that you apply to a given Pulsar Function are applied to each instance of the function. For example, if you apply 8 GB of RAM to a function with a parallelism of 5, you are applying 40 GB of RAM for the function in total. Make sure that you take the parallelism (the number of instances) factor into your resource calculations. + +## Trigger Pulsar Functions + +If a Pulsar Function is running in [cluster mode](#cluster-mode), you can **trigger** it at any time using the command line. Triggering a function means that you send a message with a specific value to the function and get the function output (if any) via the command line. + +> Triggering a function means invoking it by producing a message on one of its input topics. With the [`pulsar-admin functions trigger`](reference-pulsar-admin.md#trigger) command, you can send messages to functions without using the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library. + +To learn how to trigger a function, you can start with a Python function that returns a simple string based on the input. + +```python + +# myfunc.py +def process(input): + return "This function has been triggered with a value of {0}".format(input) + +``` + +You can run the function in [cluster mode](functions-deploy.md#cluster-mode). + +```bash + +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name myfunc \ + --py myfunc.py \ + --classname myfunc \ + --inputs persistent://public/default/in \ + --output persistent://public/default/out + +``` + +Then assign a consumer to listen on the output topic for messages from the `myfunc` function with the [`pulsar-client consume`](reference-cli-tools.md#consume) command.
+ +```bash + +$ bin/pulsar-client consume persistent://public/default/out \ + --subscription-name my-subscription \ + --num-messages 0 # Listen indefinitely + +``` + +And then you can trigger the function. + +```bash + +$ bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name myfunc \ + --trigger-value "hello world" + +``` + +The consumer listening on the output topic prints output similar to the following in the log. + +``` + +----- got message ----- +This function has been triggered with a value of hello world + +``` + +> #### Topic info is not required +> In the `trigger` command, you only need to specify basic information about the function (tenant, namespace, and name). To trigger the function, you do not need to know the function input topics. diff --git a/site2/website/versioned_docs/version-2.8.x/functions-develop.md b/site2/website/versioned_docs/version-2.8.x/functions-develop.md new file mode 100644 index 0000000000000..2e29aa1c47400 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-develop.md @@ -0,0 +1,1600 @@ +--- +id: functions-develop +title: Develop Pulsar Functions +sidebar_label: "How-to: Develop" +original_id: functions-develop +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +You learn how to develop Pulsar Functions with different APIs for Java, Python and Go. + +## Available APIs +In Java and Python, you have two options to write Pulsar Functions. In Go, you can use Pulsar Functions SDK for Go. + +Interface | Description | Use cases +:---------|:------------|:--------- +Language-native interface | No Pulsar-specific libraries or special dependencies required (only core libraries from Java/Python). | Functions that do not require access to the function [context](#context). +Pulsar Function SDK for Java/Python/Go | Pulsar-specific libraries that provide a range of functionality not provided by "native" interfaces.
| Functions that require access to the function [context](#context). + +The language-native function, which adds an exclamation point to all incoming strings and publishes the resulting string to a topic, has no external dependencies. The following example is language-native function. + +````mdx-code-block + + + +```Java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclamationFunction.java). + + + + +```python + +def process(input): + return "{}!".format(input) + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py). + +:::note + +You can write Pulsar Functions in python2 or python3. However, Pulsar only looks for `python` as the interpreter. +If you're running Pulsar Functions on an Ubuntu system that only supports python3, you might fail to +start the functions. In this case, you can create a symlink. Your system will fail if +you subsequently install any other package that depends on Python 2.x. A solution is under development in [Issue 5518](https://github.com/apache/pulsar/issues/5518). + +```bash + +sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 + +``` + +::: + + + + +```` + +The following example uses Pulsar Functions SDK. 
+````mdx-code-block + + + +```Java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java). + + + + +```python + +from pulsar import Function + +class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py). + + + + +```Go + +package main + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func HandleRequest(ctx context.Context, in []byte) error{ + fmt.Println(string(in) + "!") + return nil +} + +func main() { + pf.Start(HandleRequest) +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/inputFunc/inputFunc.go#L20-L36). + + + + +```` + +## Schema registry +Pulsar has a built-in schema registry and is bundled with popular schema types, such as Avro, JSON and Protobuf. Pulsar Functions can leverage the existing schema information from input topics and derive the input type. The schema registry applies for output topic as well. + +## SerDe +SerDe stands for **Ser**ialization and **De**serialization. Pulsar Functions uses SerDe when publishing data to and consuming data from Pulsar topics. How SerDe works by default depends on the language you use for a particular function. 
+ +````mdx-code-block + + + +When you write Pulsar Functions in Java, the following basic Java types are built in and supported by default: `String`, `Double`, `Integer`, `Float`, `Long`, `Short`, and `Byte`. + +To customize Java types, you need to implement the following interface. + +```java + +public interface SerDe { + T deserialize(byte[] input); + byte[] serialize(T input); +} + +``` + +SerDe works in the following ways in Java Functions. +- If the input and output topics have schema, Pulsar Functions use schema for SerDe. +- If the input or output topics do not exist, Pulsar Functions adopt the following rules to determine SerDe: + - If the schema type is specified, Pulsar Functions use the specified schema type. + - If SerDe is specified, Pulsar Functions use the specified SerDe, and the schema type for input and output topics is `Byte`. + - If neither the schema type nor SerDe is specified, Pulsar Functions use the built-in SerDe. For non-primitive schema type, the built-in SerDe serializes and deserializes objects in the `JSON` format. + + + + +In Python, the default SerDe is identity, meaning that the type is serialized as whatever type the producer function returns. + +You can specify the SerDe when [creating](functions-deploy.md#cluster-mode) or [running](functions-deploy.md#local-run-mode) functions. + +```bash + +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name my_function \ + --py my_function.py \ + --classname my_function.MyFunction \ + --custom-serde-inputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \ + --output-serde-classname Serde3 \ + --output output-topic-1 + +``` + +This case contains two input topics: `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the map must be specified as a JSON string). The output topic, `output-topic-1`, uses the `Serde3` class for SerDe. 
At the moment, all Pulsar Functions logic, including processing functions and SerDe classes, must be contained within a single Python file. + +When using Pulsar Functions for Python, you have three SerDe options: + +1. You can use the [`IdentitySerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. The `IdentitySerDe` is the **default**. Creating or running a function without explicitly specifying SerDe means that this option is used. +2. You can use the [`PickleSerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe. +3. You can create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods: [`serialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes, and [`deserialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type. + +The table below shows when you should use each SerDe. + +SerDe option | When to use +:------------|:----------- +`IdentitySerDe` | When you work with simple types like strings, Booleans, integers. +`PickleSerDe` | When you work with complex, application-specific types and are comfortable with the "best effort" approach of `pickle`. +Custom SerDe | When you require explicit control over SerDe, potentially for performance or data compatibility purposes. + + + + +Currently, the feature is not available in Go. + + + + +```` + +### Example +Imagine that you're writing Pulsar Functions that process tweet objects. You can refer to the following example of the `Tweet` class.
+ +````mdx-code-block + + + +```java + +public class Tweet { + private String username; + private String tweetContent; + + public Tweet(String username, String tweetContent) { + this.username = username; + this.tweetContent = tweetContent; + } + + // Standard setters and getters +} + +``` + +To pass `Tweet` objects directly between Pulsar Functions, you need to provide a custom SerDe class. In the example below, `Tweet` objects are basically strings in which the username and tweet content are separated by a `|`. + +```java + +package com.example.serde; + +import org.apache.pulsar.functions.api.SerDe; + +import java.util.regex.Pattern; + +public class TweetSerde implements SerDe<Tweet> { + public Tweet deserialize(byte[] input) { + String s = new String(input); + String[] fields = s.split(Pattern.quote("|")); + return new Tweet(fields[0], fields[1]); + } + + public byte[] serialize(Tweet input) { + return String.format("%s|%s", input.getUsername(), input.getTweetContent()).getBytes(); + } +} + +``` + +To apply this customized SerDe to a particular Pulsar Function, you need to: + +* Package the `Tweet` and `TweetSerde` classes into a JAR. +* Specify a path to the JAR and SerDe class name when deploying the function. + +The following is an example of the [`create`](reference-pulsar-admin.md#create-1) operation. + +```bash + +$ bin/pulsar-admin functions create \ + --jar /path/to/your.jar \ + --output-serde-classname com.example.serde.TweetSerde \ + # Other function attributes + +``` + +> #### Custom SerDe classes must be packaged with your function JARs +> Pulsar does not store your custom SerDe classes separately from your Pulsar Functions. So you need to include your SerDe classes in your function JARs. If not, Pulsar returns an error. + + + + +```python + +class Tweet(object): + def __init__(self, username, tweet_content): + self.username = username + self.tweet_content = tweet_content + +``` + +In order to use this class in Pulsar Functions, you have two options: + +1.
You can specify `PickleSerDe`, which applies the [`pickle`](https://docs.python.org/3/library/pickle.html) library SerDe. +2. You can create your own SerDe class. The following is an example. + + ```python + + from pulsar import SerDe + + class TweetSerDe(SerDe): + + def serialize(self, input): + return bytes("{0}|{1}".format(input.username, input.tweet_content)) + + def deserialize(self, input_bytes): + tweet_components = str(input_bytes).split('|') + return Tweet(tweet_components[0], tweet_components[1]) + + ``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/custom_object_function.py). + + + + +```` + +In both languages, however, you can write custom SerDe logic for more complex, application-specific types. + +## Context +Java, Python and Go SDKs provide access to a **context object** that can be used by a function. This context object provides a wide variety of information and functionality to the function. + +* The name and ID of a Pulsar Function. +* The message ID of each message. Each Pulsar message is automatically assigned an ID. +* The key, event time, properties and partition key of each message. +* The name of the topic to which the message is sent. +* The names of all input topics as well as the output topic associated with the function. +* The name of the class used for [SerDe](#serde). +* The [tenant](reference-terminology.md#tenant) and namespace associated with the function. +* The ID of the Pulsar Functions instance running the function. +* The version of the function. +* The [logger object](functions-develop.md#logger) used by the function, which can be used to create function log messages. +* Access to arbitrary [user configuration](#user-config) values supplied via the CLI. +* An interface for recording [metrics](#metrics). +* An interface for storing and retrieving state in [state storage](#state-storage). +* A function to publish new messages onto arbitrary topics.
+* A function to ack the message being processed (if auto-ack is disabled). +* (Java) get Pulsar admin client. + +````mdx-code-block + + + +The [Context](https://github.com/apache/pulsar/blob/master/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Context.java) interface provides a number of methods that you can use to access the function [context](#context). The various method signatures for the `Context` interface are listed as follows. + +```java + +public interface Context { + Record getCurrentRecord(); + Collection getInputTopics(); + String getOutputTopic(); + String getOutputSchemaType(); + String getTenant(); + String getNamespace(); + String getFunctionName(); + String getFunctionId(); + String getInstanceId(); + String getFunctionVersion(); + Logger getLogger(); + void incrCounter(String key, long amount); + void incrCounterAsync(String key, long amount); + long getCounter(String key); + long getCounterAsync(String key); + void putState(String key, ByteBuffer value); + void putStateAsync(String key, ByteBuffer value); + void deleteState(String key); + ByteBuffer getState(String key); + ByteBuffer getStateAsync(String key); + Map getUserConfigMap(); + Optional getUserConfigValue(String key); + Object getUserConfigValueOrDefault(String key, Object defaultValue); + void recordMetric(String metricName, double value); + CompletableFuture publish(String topicName, O object, String schemaOrSerdeClassName); + CompletableFuture publish(String topicName, O object); + TypedMessageBuilder newOutputMessage(String topicName, Schema schema) throws PulsarClientException; + ConsumerBuilder newConsumerBuilder(Schema schema) throws PulsarClientException; + PulsarAdmin getPulsarAdmin(); + PulsarAdmin getPulsarAdmin(String clusterName); +} + +``` + +The following example uses several methods available via the `Context` object. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.stream.Collectors; + +public class ContextFunction implements Function { + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String inputTopics = context.getInputTopics().stream().collect(Collectors.joining(", ")); + String functionName = context.getFunctionName(); + + String logMessage = String.format("A message with a value of \"%s\" has arrived on one of the following topics: %s\n", + input, + inputTopics); + + LOG.info(logMessage); + + String metricName = String.format("function-%s-messages-received", functionName); + context.recordMetric(metricName, 1); + + return null; + } +} + +``` + + + + +``` + +class ContextImpl(pulsar.Context): + def get_message_id(self): + ... + def get_message_key(self): + ... + def get_message_eventtime(self): + ... + def get_message_properties(self): + ... + def get_current_message_topic_name(self): + ... + def get_partition_key(self): + ... + def get_function_name(self): + ... + def get_function_tenant(self): + ... + def get_function_namespace(self): + ... + def get_function_id(self): + ... + def get_instance_id(self): + ... + def get_function_version(self): + ... + def get_logger(self): + ... + def get_user_config_value(self, key): + ... + def get_user_config_map(self): + ... + def record_metric(self, metric_name, metric_value): + ... + def get_input_topics(self): + ... + def get_output_topic(self): + ... + def get_output_serde_class_name(self): + ... + def publish(self, topic_name, message, serde_class_name="serde.IdentitySerDe", + properties=None, compression_type=None, callback=None, message_conf=None): + ... + def ack(self, msgid, topic): + ... + def get_and_reset_metrics(self): + ... + def reset_metrics(self): + ... + def get_metrics(self): + ... + def incr_counter(self, key, amount): + ... + def get_counter(self, key): + ... 
+ def del_counter(self, key): + ... + def put_state(self, key, value): + ... + def get_state(self, key): + ... + +``` + + + + +``` + +func (c *FunctionContext) GetInstanceID() int { + return c.instanceConf.instanceID +} + +func (c *FunctionContext) GetInputTopics() []string { + return c.inputTopics +} + +func (c *FunctionContext) GetOutputTopic() string { + return c.instanceConf.funcDetails.GetSink().Topic +} + +func (c *FunctionContext) GetFuncTenant() string { + return c.instanceConf.funcDetails.Tenant +} + +func (c *FunctionContext) GetFuncName() string { + return c.instanceConf.funcDetails.Name +} + +func (c *FunctionContext) GetFuncNamespace() string { + return c.instanceConf.funcDetails.Namespace +} + +func (c *FunctionContext) GetFuncID() string { + return c.instanceConf.funcID +} + +func (c *FunctionContext) GetFuncVersion() string { + return c.instanceConf.funcVersion +} + +func (c *FunctionContext) GetUserConfValue(key string) interface{} { + return c.userConfigs[key] +} + +func (c *FunctionContext) GetUserConfMap() map[string]interface{} { + return c.userConfigs +} + +func (c *FunctionContext) SetCurrentRecord(record pulsar.Message) { + c.record = record +} + +func (c *FunctionContext) GetCurrentRecord() pulsar.Message { + return c.record +} + +func (c *FunctionContext) NewOutputMessage(topic string) pulsar.Producer { + return c.outputMessage(topic) +} + +``` + +The following example uses several methods available via the `Context` object. + +``` + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func contextFunc(ctx context.Context) { + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/contextFunc/contextFunc.go#L29-L34). 
+ + + + +```` + +### User config +When you run or update Pulsar Functions created using SDK, you can pass arbitrary key/values to them with the command line with the `--user-config` flag. Key/values must be specified as JSON. The following function creation command passes a user configured key/value to a function. + +```bash + +$ bin/pulsar-admin functions create \ + --name word-filter \ + # Other function configs + --user-config '{"forbidden-word":"rosebud"}' + +``` + +````mdx-code-block + + + +The Java SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash + +$ bin/pulsar-admin functions create \ + # Other function configs + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Java function: + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + Optional wotd = context.getUserConfigValue("word-of-the-day"); + if (wotd.isPresent()) { + LOG.info("The word of the day is {}", wotd); + } else { + LOG.warn("No word of the day provided"); + } + return null; + } +} + +``` + +The `UserConfigFunction` function will log the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The `word-of-the-day` user config will be changed only when the function is updated with a new config value via the command line. 
+ +You can also access the entire user config map or set a default value in case no value is present: + +```java + +// Get the whole config map +Map<String, Object> allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = (String) context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); + +``` + +> For all key/value pairs passed to Java functions, both the key *and* the value are `String`. To set the value to be a different type, you need to deserialize from the `String` type. + + + + +In a Python function, you can access the configuration value like this. + +```python + +from pulsar import Function + +class WordFilter(Function): + def process(self, input, context): + forbidden_word = context.get_user_config_value("forbidden-word") + + # Don't publish the message if it contains the user-supplied + # forbidden word + if forbidden_word in input: + pass + # Otherwise publish the message + else: + return input + +``` + +The Python SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash + +$ bin/pulsar-admin functions create \ + # Other function configs \ + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Python function: + +```python + +from pulsar import Function + +class UserConfigFunction(Function): + def process(self, input, context): + logger = context.get_logger() + wotd = context.get_user_config_value('word-of-the-day') + if wotd is None: + logger.warn('No word of the day provided') + else: + logger.info("The word of the day is {0}".format(wotd)) + +``` + + + + +The Go SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair.
+ +```bash + +$ bin/pulsar-admin functions create \ + --go path/to/go/binary \ + --user-config '{"word-of-the-day":"lackadaisical"}' + +``` + +To access that value in a Go function: + +```go + +func contextFunc(ctx context.Context) { + fc, ok := pf.FromContext(ctx) + if !ok { + logutil.Fatal("Function context is not defined") + } + + wotd := fc.GetUserConfValue("word-of-the-day") + + if wotd == nil { + logutil.Warn("The word of the day is empty") + } else { + logutil.Infof("The word of the day is %s", wotd.(string)) + } +} + +``` + + + + +```` + +### Logger + +````mdx-code-block + + + +Pulsar Functions that use the Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/slf4j/Logger.html) object that can be used to produce logs at the chosen log level. The following example logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +If you want your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example.
+ +```bash + +$ bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +All logs produced by `LoggingFunction` above can be accessed via the `persistent://public/default/logging-function-logs` topic. + +#### Customize Function log level +Additionally, you can use the XML file, `functions_log4j2.xml`, to customize the function log level. +To customize the function log level, create or update `functions_log4j2.xml` in your Pulsar conf directory (for example, `/etc/pulsar/` on bare-metal, or `/pulsar/conf` on Kubernetes) to contain contents such as: + +```xml + + + pulsar-functions-instance + 30 + + + pulsar.log.appender + RollingFile + + + pulsar.log.level + debug + + + bk.log.level + debug + + + + + Console + SYSTEM_OUT + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + RollingFile + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.log + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}-%d{MM-dd-yyyy}-%i.log.gz + true + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + 1 + true + + + 1 GB + + + 0 0 0 * * ? + + + + + ${sys:pulsar.function.log.dir} + 2 + + */${sys:pulsar.function.log.file}*log.gz + + + 30d + + + + + + BkRollingFile + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk-%d{MM-dd-yyyy}-%i.log.gz + true + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + 1 + true + + + 1 GB + + + 0 0 0 * * ? 
+ + + + + ${sys:pulsar.function.log.dir} + 2 + + */${sys:pulsar.function.log.file}.bk*log.gz + + + 30d + + + + + + + + org.apache.pulsar.functions.runtime.shaded.org.apache.bookkeeper + ${sys:bk.log.level} + false + + BkRollingFile + + + + ${sys:pulsar.log.level} + + ${sys:pulsar.log.appender} + ${sys:pulsar.log.level} + + + + + +``` + +The properties set like: + +```xml + + + pulsar.log.level + debug + + +``` + +propagate to places where they are referenced, such as: + +```xml + + + ${sys:pulsar.log.level} + + ${sys:pulsar.log.appender} + ${sys:pulsar.log.level} + + + +``` + +In the above example, debug level logging would be applied to ALL function logs. +This may be more verbose than you desire. To be more selective, you can apply different log levels to different classes or modules. For example: + +```xml + + + com.example.module + info + false + + ${sys:pulsar.log.appender} + + + +``` + +You can be more specific as well, such as applying a more verbose log level to a class in the module, such as: + +```xml + + + com.example.module.className + debug + false + + Console + + + +``` + +Each `` entry allows you to output the log to a target specified in the definition of the Appender. + +Additivity pertains to whether log messages will be duplicated if multiple Logger entries overlap. +To disable additivity, specify + +```xml + +false + +``` + +as shown in examples above. Disabling additivity prevents duplication of log messages when one or more `` entries contain classes or modules that overlap. + +The `` is defined in the `` section, such as: + +```xml + + + Console + SYSTEM_OUT + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + +``` + + + + +Pulsar Functions that use the Python SDK have access to a logging object that can be used to produce logs at the chosen log level. The following example function that logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. 
+ +```python + +from pulsar import Function + +class LoggingFunction(Function): + def process(self, input, context): + logger = context.get_logger() + msg_id = context.get_message_id() + if 'danger' in input: + logger.warn("A warning was received in message {0}".format(context.get_message_id())) + else: + logger.info("Message {0} received\nContent: {1}".format(msg_id, input)) + +``` + +If you want your function to produce logs on a Pulsar topic, you need to specify a **log topic** when creating or running the function. The following is an example. + +```bash + +$ bin/pulsar-admin functions create \ + --py logging_function.py \ + --classname logging_function.LoggingFunction \ + --log-topic logging-function-logs \ + # Other function configs + +``` + +All logs produced by `LoggingFunction` above can be accessed via the `logging-function-logs` topic. +Additionally, you can specify the function log level through the broker XML file as described in [Customize Function log level](#customize-function-log-level). + + + + +The following Go Function example shows different log levels based on the function input. + +``` + +import ( + "context" + + "github.com/apache/pulsar/pulsar-function-go/pf" + + log "github.com/apache/pulsar/pulsar-function-go/logutil" +) + +func loggerFunc(ctx context.Context, input []byte) { + if len(input) <= 100 { + log.Infof("This input has a length of: %d", len(input)) + } else { + log.Warnf("This input is getting too long! It has {%d} characters", len(input)) + } +} + +func main() { + pf.Start(loggerFunc) +} + +``` + +When you use `logTopic` related functionalities in Go Function, import `github.com/apache/pulsar/pulsar-function-go/logutil`, and you do not have to use the `getLogger()` context object. 
+ +Additionally, you can specify the function log level through the broker XML file, as described here: [Customize Function log level](#customize-function-log-level). + + + + +```` + +### Pulsar admin + +Pulsar Functions that use the Java SDK have access to the Pulsar admin client, which allows them to make admin API calls to the current Pulsar cluster or to external clusters (if `external-pulsars` is provided). + +````mdx-code-block + + + +Below is an example of how to use the Pulsar admin client exposed from the Function `context`. + +```java + +import org.apache.pulsar.client.admin.PulsarAdmin; +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +/** + * In this particular example, for every input message, + * the function resets the cursor of the current function's subscription to a + * specified timestamp. + */ +public class CursorManagementFunction implements Function<String, String> { + + @Override + public String process(String input, Context context) throws Exception { + PulsarAdmin adminClient = context.getPulsarAdmin(); + if (adminClient != null) { + String topic = context.getCurrentRecord().getTopicName().isPresent() ? + context.getCurrentRecord().getTopicName().get() : null; + String subName = context.getTenant() + "/" + context.getNamespace() + "/" + context.getFunctionName(); + if (topic != null) { + // 1578188166 below is a random-pick timestamp + adminClient.topics().resetCursor(topic, subName, 1578188166); + return "reset cursor successfully"; + } + } + return null; + } +} + +``` + +If you want your function to get access to the Pulsar admin client, you need to enable this feature by setting `exposeAdminClientEnabled=true` in the `functions_worker.yml` file. You can test whether this feature is enabled or not using the command `pulsar-admin functions localrun` with the flag `--web-service-url`.
+ +``` + +$ bin/pulsar-admin functions localrun \ + --jar my-functions.jar \ + --classname my.package.CursorManagementFunction \ + --web-service-url http://pulsar-web-service:8080 \ + # Other function configs + +``` + + + + +```` + +## Metrics + +Pulsar Functions allows you to deploy and manage processing functions that consume messages from and publish messages to Pulsar topics easily. It is important to ensure that the running functions are healthy at any time. Pulsar Functions can publish arbitrary metrics to the metrics interface which can be queried. + +:::note + +If a Pulsar Function uses the language-native interface for Java or Python, that function is not able to publish metrics and stats to Pulsar. + +::: + +You can monitor Pulsar Functions that have been deployed with the following methods: + +- Check the metrics provided by Pulsar. + + Pulsar Functions expose the metrics that can be collected and used for monitoring the health of **Java, Python, and Go** functions. You can check the metrics by following the [monitoring](deploy-monitoring.md) guide. + + For the complete list of the function metrics, see [here](reference-metrics.md#pulsar-functions). + +- Set and check your customized metrics. + + In addition to the metrics provided by Pulsar, Pulsar allows you to customize metrics for **Java and Python** functions. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. + +Here are examples of how to customize metrics for Java and Python functions. + +````mdx-code-block + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class MetricRecorderFunction implements Function { + @Override + public Void process(Integer input, Context context) { + // Records the metric 1 every time a message arrives + context.recordMetric("hit-count", 1); + + // Records the metric only if the arriving number equals 11 + if (input == 11) { + context.recordMetric("elevens-count", 1); + } + + return null; + } +} + +``` + + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `hit-count` key every time the function processes a message and a different metric for the `elevens-count` key only when the input equals 11. The following is an example. + +```python + +from pulsar import Function + +class MetricRecorderFunction(Function): + def process(self, input, context): + context.record_metric('hit-count', 1) + + if input == 11: + context.record_metric('elevens-count', 1) + +``` + + + + +Currently, the feature is not available in Go. + + + + +```` + +## Security + +If you want to enable security on Pulsar Functions, first you should enable security on [Functions Workers](functions-worker.md). For more details, refer to [Security settings](functions-worker.md#security-settings). + +Pulsar Functions can support the following providers: + +- ClearTextSecretsProvider +- EnvironmentBasedSecretsProvider + +> Pulsar Function supports ClearTextSecretsProvider by default. + +At the same time, Pulsar Functions provides two interfaces, **SecretsProvider** and **SecretsProviderConfigurator**, allowing users to customize the secrets provider. + +````mdx-code-block + + + +You can get the secret provider using the [`Context`](#context) object.
The following is an example: + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class GetSecretProviderFunction implements Function { + + @Override + public Void process(String input, Context context) throws Exception { + Logger LOG = context.getLogger(); + String secretProvider = context.getSecret(input); + + if (!secretProvider.isEmpty()) { + LOG.info("The secret provider is {}", secretProvider); + } else { + LOG.warn("No secret provider"); + } + + return null; + } +} + +``` + + + + +You can get secret provider using the [`Context`](#context) object. The following is an example: + +```python + +from pulsar import Function + +class GetSecretProviderFunction(Function): + def process(self, input, context): + logger = context.get_logger() + secret_provider = context.get_secret(input) + if secret_provider is None: + logger.warn('No secret provider') + else: + logger.info("The secret provider is {0}".format(secret_provider)) + +``` + + + + +Currently, the feature is not available in Go. + + + + +```` + +## State storage +Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Pulsar installation, including the local standalone installation, includes deployment of BookKeeper bookies. + +Since Pulsar 2.1.0 release, Pulsar integrates with Apache BookKeeper [table service](https://docs.google.com/document/d/155xAwWv5IdOitHh1NVMEwCMGgB28M3FyMiQSxEpjE-Y/edit#heading=h.56rbh52koe3f) to store the `State` for functions. For example, a `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions State API. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data - counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function, and shared between instances of that function. 
+ +You can access states within Pulsar Java Functions using the `putState`, `putStateAsync`, `getState`, `getStateAsync`, `incrCounter`, `incrCounterAsync`, `getCounter`, `getCounterAsync` and `deleteState` calls on the context object. You can access states within Pulsar Python Functions using the `put_state`, `get_state`, `incr_counter`, `get_counter` and `del_counter` calls on the context object. You can also manage states using the [querystate](#query-state) and [putstate](#putstate) options to `pulsar-admin functions`. + +:::note + +State storage is not available in Go. + +::: + +### API + +````mdx-code-block + + + +Currently, Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](functions-develop.md#context) object when you are using Java SDK functions. + +#### incrCounter + +```java + + /** + * Increment the builtin distributed counter referred by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); + +``` + +The application can use `incrCounter` to change the counter of a given `key` by the given `amount`. + +#### incrCounterAsync + +```java + + /** + * Increment the builtin distributed counter referred by key + * but don't wait for the completion of the increment operation + * + * @param key The name of the key + * @param amount The amount to be incremented + */ + CompletableFuture incrCounterAsync(String key, long amount); + +``` + +The application can use `incrCounterAsync` to asynchronously change the counter of a given `key` by the given `amount`. + +#### getCounter + +```java + + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); + +``` + +The application can use `getCounter` to retrieve the counter of a given `key` mutated by `incrCounter`.
+ +Except the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### getCounterAsync + +```java + + /** + * Retrieve the counter value for the key, but don't wait + * for the operation to be completed + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + CompletableFuture getCounterAsync(String key); + +``` + +The application can use `getCounterAsync` to asynchronously retrieve the counter of a given `key` mutated by `incrCounterAsync`. + +#### putState + +```java + + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); + +``` + +#### putStateAsync + +```java + + /** + * Update the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @param value state value of the key + */ + CompletableFuture putStateAsync(String key, ByteBuffer value); + +``` + +The application can use `putStateAsync` to asynchronously update the state of a given `key`. + +#### getState + +```java + + /** + * Retrieve the state value for the key. + * + * @param key name of the key + * @return the state value for the key. + */ + ByteBuffer getState(String key); + +``` + +#### getStateAsync + +```java + + /** + * Retrieve the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @return the state value for the key. + */ + CompletableFuture getStateAsync(String key); + +``` + +The application can use `getStateAsync` to asynchronously retrieve the state of a given `key`. + +#### deleteState + +```java + + /** + * Delete the state value for the key. + * + * @param key name of the key + */ + +``` + +Counters and binary values share the same keyspace, so this deletes either type. 
+ + + + +Currently, Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](#context) object when you are using Python SDK functions. + +#### incr_counter + +```python + + def incr_counter(self, key, amount): + """incr the counter of a given key in the managed state""" + +``` + +The application can use `incr_counter` to change the counter of a given `key` by the given `amount`. +If the `key` does not exist, a new key is created. + +#### get_counter + +```python + + def get_counter(self, key): + """get the counter of a given key in the managed state""" + +``` + +The application can use `get_counter` to retrieve the counter of a given `key` mutated by `incr_counter`. + +In addition to the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### put_state + +```python + + def put_state(self, key, value): + """update the value of a given key in the managed state""" + +``` + +The key is a string, and the value is arbitrary binary data. + +#### get_state + +```python + + def get_state(self, key): + """get the value of a given key in the managed state""" + +``` + +#### del_counter + +```python + + def del_counter(self, key): + """delete the counter of a given key in the managed state""" + +``` + +Counters and binary values share the same keyspace, so this deletes either type. + + + + +```` + +### Query State + +A Pulsar Function can use the [State API](#api) for storing state into Pulsar's state storage +and retrieving state back from Pulsar's state storage. Additionally, Pulsar provides +CLI commands for querying its state. + +```shell + +$ bin/pulsar-admin functions querystate \ + --tenant \ + --namespace \ + --name \ + --state-storage-url \ + --key \ + [--watch] + +``` + +If `--watch` is specified, the CLI will watch the value of the provided `state-key`.
+ +### Example + +````mdx-code-block + + + +{@inject: github:WordCountFunction:/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/WordCountFunction.java} is a very good example +demonstrating on how Application can easily store `state` in Pulsar Functions. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function { + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split("\\.")).forEach(word -> context.incrCounter(word, 1)); + return null; + } +} + +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received `String` into multiple words using regex `\\.`. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incrCounter(key, amount)`). + + + + +```python + +from pulsar import Function + +class WordCount(Function): + def process(self, item, context): + for word in item.split(): + context.incr_counter(word, 1) + +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received string into multiple words on space. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incr_counter(key, amount)`). 
+ + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/functions-metrics.md b/site2/website/versioned_docs/version-2.8.x/functions-metrics.md new file mode 100644 index 0000000000000..8add669316092 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-metrics.md @@ -0,0 +1,7 @@ +--- +id: functions-metrics +title: Metrics for Pulsar Functions +sidebar_label: "Metrics" +original_id: functions-metrics +--- + diff --git a/site2/website/versioned_docs/version-2.8.x/functions-overview.md b/site2/website/versioned_docs/version-2.8.x/functions-overview.md new file mode 100644 index 0000000000000..816d301e0fd0e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-overview.md @@ -0,0 +1,209 @@ +--- +id: functions-overview +title: Pulsar Functions overview +sidebar_label: "Overview" +original_id: functions-overview +--- + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics, +* apply a user-supplied processing logic to each message, +* publish the results of the computation to another topic. + + +## Goals +With Pulsar Functions, you can create complex processing logic without deploying a separate neighboring system (such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://heron.incubator.apache.org/), [Apache Flink](https://flink.apache.org/)). Pulsar Functions are computing infrastructure of Pulsar messaging system. 
The core goal is tied to a series of other goals: + +* Developer productivity (language-native vs Pulsar Functions SDK functions) +* Easy troubleshooting +* Operational simplicity (no need for an external processing system) + +## Inspirations +Pulsar Functions are inspired by (and take cues from) several systems and paradigms: + +* Stream processing engines such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), and [Apache Flink](https://flink.apache.org) +* "Serverless" and "Function as a Service" (FaaS) cloud platforms like [Amazon Web Services Lambda](https://aws.amazon.com/lambda/), [Google Cloud Functions](https://cloud.google.com/functions/), and [Azure Cloud Functions](https://azure.microsoft.com/en-us/services/functions/) + +Pulsar Functions can be described as + +* [Lambda](https://aws.amazon.com/lambda/)-style functions that are +* specifically designed to use Pulsar as a message bus. + +## Programming model +Pulsar Functions provide a wide range of functionality, and the core programming model is simple. Functions receive messages from one or more **input [topics](reference-terminology.md#topic)**. Each time a message is received, the function will complete the following tasks. + + * Apply some processing logic to the input and write output to: + * An **output topic** in Pulsar + * [Apache BookKeeper](functions-develop.md#state-storage) + * Write logs to a **log topic** (potentially for debugging purposes) + * Increment a [counter](#word-count-example) + +![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) + +You can use Pulsar Functions to set up the following processing chain: + +* A Python function listens for the `raw-sentences` topic and "sanitizes" incoming strings (removing extraneous whitespace and converting all characters to lowercase) and then publishes the results to a `sanitized-sentences` topic. 
+* A Java function listens for the `sanitized-sentences` topic, counts the number of times each word appears within a specified time window, and publishes the results to a `results` topic +* Finally, a Python function listens for the `results` topic and writes the results to a MySQL table. + + +### Word count example + +If you implement the classic word count example using Pulsar Functions, it looks something like this: + +![Pulsar Functions word count example](/assets/pulsar-functions-word-count.png) + +To write the function in Java with [Pulsar Functions SDK for Java](functions-develop.md#available-apis), you can write the function as follows. + +```java + +package org.example.functions; + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } +} + +``` + +Bundle and build the JAR file to be deployed, and then deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash + +$ bin/pulsar-admin functions create \ + --jar target/my-jar-with-dependencies.jar \ + --classname org.example.functions.WordCountFunction \ + --tenant public \ + --namespace default \ + --name word-count \ + --inputs persistent://public/default/sentences \ + --output persistent://public/default/count + +``` + +### Content-based routing example + +Pulsar Functions are used in many cases. The following is a sophisticated example that involves content-based routing. 
+ +For example, a function takes items (strings) as input and publishes them to either a `fruits` or `vegetables` topic, depending on the item. Or, if an item is neither fruit nor vegetable, a warning is logged to a [log topic](functions-develop.md#logger). The following is a visual representation. + +![Pulsar Functions routing example](/assets/pulsar-functions-routing-example.png) + +If you implement this routing functionality in Python, it looks something like this: + +```python + +from pulsar import Function + +class RoutingFunction(Function): + def __init__(self): + self.fruits_topic = "persistent://public/default/fruits" + self.vegetables_topic = "persistent://public/default/vegetables" + + @staticmethod + def is_fruit(item): + return item in [b"apple", b"orange", b"pear", b"other fruits..."] + + @staticmethod + def is_vegetable(item): + return item in [b"carrot", b"lettuce", b"radish", b"other vegetables..."] + + def process(self, item, context): + if self.is_fruit(item): + context.publish(self.fruits_topic, item) + elif self.is_vegetable(item): + context.publish(self.vegetables_topic, item) + else: + warning = "The item {0} is neither a fruit nor a vegetable".format(item) + context.get_logger().warn(warning) + +``` + +If this code is stored in `~/router.py`, then you can deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash + +$ bin/pulsar-admin functions create \ + --py ~/router.py \ + --classname router.RoutingFunction \ + --tenant public \ + --namespace default \ + --name route-fruit-veg \ + --inputs persistent://public/default/basket-items + +``` + +### Functions, messages and message types +Pulsar Functions take byte arrays as inputs and spit out byte arrays as output. However in languages that support typed interfaces(Java), you can write typed Functions, and bind messages to types in the following ways. 
+* [Schema Registry](functions-develop.md#schema-registry) +* [SerDe](functions-develop.md#serde) + + +## Fully Qualified Function Name (FQFN) +Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function tenant, namespace, and function name. FQFN looks like this: + +```http + +tenant/namespace/name + +``` + +FQFNs enable you to create multiple functions with the same name provided that they are in different namespaces. + +## Supported languages +Currently, you can write Pulsar Functions in Java, Python, and Go. For details, refer to [Develop Pulsar Functions](functions-develop.md). + +## Processing guarantees +Pulsar Functions provide three different messaging semantics that you can apply to any function. + +Delivery semantics | Description +:------------------|:------- +**At-most-once** delivery | Each message sent to the function is processed at most once: it is either processed once or not processed at all (hence "at most"). +**At-least-once** delivery | Each message sent to the function is processed one or more times (hence "at least"). +**Effectively-once** delivery | Each message sent to the function will have one output associated with it. + + +### Apply processing guarantees to a function +You can set the processing guarantees for a Pulsar Function when you create the Function. The following [`pulsar-admin functions create`](reference-pulsar-admin.md#create-1) command creates a function with effectively-once guarantees applied. + +```bash + +$ bin/pulsar-admin functions create \ + --name my-effectively-once-function \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs + +``` + +The available options for `--processing-guarantees` are: + +* `ATMOST_ONCE` +* `ATLEAST_ONCE` +* `EFFECTIVELY_ONCE` + +> By default, Pulsar Functions provide at-least-once delivery guarantees. So if you create a function without supplying a value for the `--processing-guarantees` flag, the function provides at-least-once guarantees.
+ +### Update the processing guarantees of a function +You can change the processing guarantees applied to a function using the [`update`](reference-pulsar-admin.md#update-1) command. The following is an example. + +```bash + +$ bin/pulsar-admin functions update \ + --processing-guarantees ATMOST_ONCE \ + # Other function configs + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/functions-package.md b/site2/website/versioned_docs/version-2.8.x/functions-package.md new file mode 100644 index 0000000000000..a995d5c158877 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-package.md @@ -0,0 +1,493 @@ +--- +id: functions-package +title: Package Pulsar Functions +sidebar_label: "How-to: Package" +original_id: functions-package +--- + +You can package Pulsar functions in Java, Python, and Go. Packaging the window function in Java is the same as [packaging a function in Java](#java). + +:::note + +Currently, the window function is not available in Python and Go. + +::: + +## Prerequisite + +Before running a Pulsar function, you need to start Pulsar. You can [run a standalone Pulsar in Docker](getting-started-docker.md), or [run Pulsar in Kubernetes](getting-started-helm.md). + +To check whether the Docker image starts, you can use the `docker ps` command. + +## Java + +To package a function in Java, complete the following steps. + +1. Create a new maven project with a pom file. In the following code sample, the value of `mainClass` is your package name. + + ```Java + + + + 4.0.0 + + java-function + java-function + 1.0-SNAPSHOT + + + + org.apache.pulsar + pulsar-functions-api + 2.6.0 + + + + + + + maven-assembly-plugin + + false + + jar-with-dependencies + + + + org.example.test.ExclamationFunction + + + + + + make-assembly + package + + assembly + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + + + ``` + +2. Write a Java function. 
+ + ``` + + package org.example.test; + + import java.util.function.Function; + + public class ExclamationFunction implements Function { + @Override + public String apply(String s) { + return "This is my function!"; + } + } + + ``` + + For the imported package, you can use one of the following interfaces: + - Function interface provided by Java 8: `java.util.function.Function` + - Pulsar Function interface: `org.apache.pulsar.functions.api.Function` + + The main difference between the two interfaces is that the `org.apache.pulsar.functions.api.Function` interface provides the context interface. When you write a function and want to interact with it, you can use context to obtain a wide variety of information and functionality for Pulsar Functions. + + The following example uses `org.apache.pulsar.functions.api.Function` interface with context. + + ``` + + package org.example.functions; + import org.apache.pulsar.functions.api.Context; + import org.apache.pulsar.functions.api.Function; + + import java.util.Arrays; + public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } + } + + ``` + +3. Package the Java function. + + ```bash + + mvn package + + ``` + + After the Java function is packaged, a `target` directory is created automatically. Open the `target` directory to check if there is a JAR package similar to `java-function-1.0-SNAPSHOT.jar`. + + +4. Run the Java function. + + (1) Copy the packaged jar file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Java function using the following command. 
+ + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname org.example.test.ExclamationFunction \ + --jar java-function-1.0-SNAPSHOT.jar \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name JavaFunction + + ``` + + The following log indicates that the Java function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## Python + +Python Function supports the following three formats: + +- One python file +- ZIP file +- PIP + +### One python file + +To package a function with **one python file** in Python, complete the following steps. + +1. Write a Python function. + + ``` + + from pulsar import Function // import the Function module from Pulsar + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + + ``` + + In this example, when you write a Python function, you need to inherit the Function class and implement the `process()` method. + + `process()` mainly has two parameters: + + - `input` represents your input. + + - `context` represents an interface exposed by the Pulsar Function. You can get the attributes in the Python function based on the provided context object. + +2. Install a Python client. + + The implementation of a Python function depends on the Python client, so before deploying a Python function, you need to install the corresponding version of the Python client. + + ```bash + + pip install pulsar-client==2.6.0 + + ``` + +3. Run the Python Function. + + (1) Copy the Python function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Python function using the following command. 
+ + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname . \ + --py \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +### ZIP file + +To package a function with the **ZIP file** in Python, complete the following steps. + +1. Prepare the ZIP file. + + The following is required when packaging the ZIP file of the Python Function. + + ```text + + Assuming the zip file is named as `func.zip`, unzip the `func.zip` folder: + "func/src" + "func/requirements.txt" + "func/deps" + + ``` + + Take [exclamation.zip](https://github.com/apache/pulsar/tree/master/tests/docker-images/latest-version-image/python-examples) as an example. The internal structure of the example is as follows. + + ```text + + . + ├── deps + │   └── sh-1.12.14-py2.py3-none-any.whl + └── src + └── exclamation.py + + ``` + +2. Run the Python Function. + + (1) Copy the ZIP file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Python function using the following command. + + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname exclamation \ + --py \ + --inputs persistent://public/default/in-topic \ + --output persistent://public/default/out-topic \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +### PIP + +The PIP method is only supported in Kubernetes runtime. 
To package a function with **PIP** in Python, complete the following steps. + +1. Configure the `functions_worker.yml` file. + + ```text + + #### Kubernetes Runtime #### + installUserCodeDependencies: true + + ``` + +2. Write your Python Function. + + ``` + + from pulsar import Function + import js2xml + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + // add your logic + return input + '!' + + ``` + + You can introduce additional dependencies. When Python Function detects that the file currently used is `whl` and the `installUserCodeDependencies` parameter is specified, the system uses the `pip install` command to install the dependencies required in Python Function. + +3. Generate the `whl` file. + + ```shell script + + $ cd $PULSAR_HOME/pulsar-functions/scripts/python + $ chmod +x generate.sh + $ ./generate.sh + # e.g: ./generate.sh /path/to/python /path/to/python/output 1.0.0 + + ``` + + The output is written in `/path/to/python/output`: + + ```text + + -rw-r--r-- 1 root staff 1.8K 8 27 14:29 pulsarfunction-1.0.0-py2-none-any.whl + -rw-r--r-- 1 root staff 1.4K 8 27 14:29 pulsarfunction-1.0.0.tar.gz + -rw-r--r-- 1 root staff 0B 8 27 14:29 pulsarfunction.whl + + ``` + +## Go + +To package a function in Go, complete the following steps. + +1. Write a Go function. + + Currently, Go function can be **only** implemented using SDK and the interface of the function is exposed in the form of SDK. Before using the Go function, you need to import "github.com/apache/pulsar/pulsar-function-go/pf". + + ``` + + import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" + ) + + func HandleRequest(ctx context.Context, input []byte) error { + fmt.Println(string(input) + "!") + return nil + } + + func main() { + pf.Start(HandleRequest) + } + + ``` + + You can use context to connect to the Go function. 
+ + ``` + + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } + + ``` + + When writing a Go function, remember that: + - In `main()`, you **only** need to register the function name to `Start()`. **Only** one function name is received in `Start()`. + - Go function uses Go reflection, which is based on the received function name, to verify whether the parameter list and returned value list are correct. The parameter list and returned value list **must** match one of the following sample function signatures: + + ``` + + func () + func () error + func (input) error + func () (output, error) + func (input) (output, error) + func (context.Context) error + func (context.Context, input) error + func (context.Context) (output, error) + func (context.Context, input) (output, error) + + ``` + +2. Build the Go function. + + ``` + + go build .go + + ``` + +3. Run the Go Function. + + (1) Copy the Go function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Go function with the following command. + + ``` + + ./bin/pulsar-admin functions localrun \ + --go [your go function path] \ + --inputs [input topics] \ + --output [output topic] \ + --tenant [default:public] \ + --namespace [default:default] \ + --name [custom unique go function name] + + ``` + + The following log indicates that the Go function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## Start Functions in cluster mode +If you want to start a function in cluster mode, replace `localrun` with `create` in the commands above. The following log indicates that your function starts successfully.
+ + ```text + + "Created successfully" + + ``` + +For information about parameters on `--classname`, `--jar`, `--py`, `--go`, `--inputs`, run the command `./bin/pulsar-admin functions` or see [here](reference-pulsar-admin.md#functions). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/functions-runtime.md b/site2/website/versioned_docs/version-2.8.x/functions-runtime.md new file mode 100644 index 0000000000000..8fc2abff1e0e8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-runtime.md @@ -0,0 +1,409 @@ +--- +id: functions-runtime +title: Configure Functions runtime +sidebar_label: "Setup: Configure Functions runtime" +original_id: functions-runtime +--- + +You can use the following methods to run functions. + +- *Thread*: Invoke functions threads in functions worker. +- *Process*: Invoke functions in processes forked by functions worker. +- *Kubernetes*: Submit functions as Kubernetes StatefulSets by functions worker. + +:::note + +Pulsar supports adding labels to the Kubernetes StatefulSets and services while launching functions, which facilitates selecting the target Kubernetes objects. + +::: + +The differences of the thread and process modes are: +- Thread mode: when a function runs in thread mode, it runs on the same Java virtual machine (JVM) with functions worker. +- Process mode: when a function runs in process mode, it runs on the same machine that functions worker runs. + +## Configure thread runtime +It is easy to configure *Thread* runtime. In most cases, you do not need to configure anything. You can customize the thread group name with the following settings: + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.thread.ThreadRuntimeFactory +functionRuntimeFactoryConfigs: + threadGroupName: "Your Function Container Group" + +``` + +*Thread* runtime is only supported in Java function. 
+ +## Configure process runtime +When you enable *Process* runtime, you do not need to configure anything. + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.process.ProcessRuntimeFactory +functionRuntimeFactoryConfigs: + # the directory for storing the function logs + logDirectory: + # change the jar location only when you put the java instance jar in a different location + javaInstanceJarLocation: + # change the python instance location only when you put the python instance jar in a different location + pythonInstanceLocation: + # change the extra dependencies location: + extraFunctionDependenciesDir: + +``` + +*Process* runtime is supported in Java, Python, and Go functions. + +## Configure Kubernetes runtime + +When the functions worker generates Kubernetes manifests and apply the manifests, the Kubernetes runtime works. If you have run functions worker on Kubernetes, you can use the `serviceAccount` associated with the pod that the functions worker is running in. Otherwise, you can configure it to communicate with a Kubernetes cluster. + +The manifests, generated by the functions worker, include a `StatefulSet`, a `Service` (used to communicate with the pods), and a `Secret` for auth credentials (when applicable). The `StatefulSet` manifest (by default) has a single pod, with the number of replicas determined by the "parallelism" of the function. On pod boot, the pod downloads the function payload (via the functions worker REST API). The pod's container image is configurable, but must have the functions runtime. + +The Kubernetes runtime supports secrets, so you can create a Kubernetes secret and expose it as an environment variable in the pod. The Kubernetes runtime is extensible, you can implement classes and customize the way how to generate Kubernetes manifests, how to pass auth data to pods, and how to integrate secrets. 
+ +:::tip + +For the rules of translating Pulsar object names into Kubernetes resource labels, see [here](admin-api-overview.md#how-to-define-pulsar-resource-names-when-running-pulsar-in-kubernetes). + +::: + +### Basic configuration + +It is easy to configure Kubernetes runtime. You can just uncomment the settings of `kubernetesContainerFactory` in the `functions_worker.yaml` file. The following is an example. + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.kubernetes.KubernetesRuntimeFactory +functionRuntimeFactoryConfigs: + # uri to kubernetes cluster, leave it to empty and it will use the kubernetes settings in function worker + k8Uri: + # the kubernetes namespace to run the function instances. it is `default`, if this setting is left to be empty + jobNamespace: + # The Kubernetes pod name to run the function instances. It is set to + # `pf----` if this setting is left to be empty + jobName: + # the docker image to run function instance. by default it is `apachepulsar/pulsar` + pulsarDockerImageName: + # the docker image to run function instance according to different configurations provided by users. + # By default it is `apachepulsar/pulsar`. + # e.g: + # functionDockerImages: + # JAVA: JAVA_IMAGE_NAME + # PYTHON: PYTHON_IMAGE_NAME + # GO: GO_IMAGE_NAME + functionDockerImages: + # "The image pull policy for image used to run function instance. By default it is `IfNotPresent` + imagePullPolicy: IfNotPresent + # the root directory of pulsar home directory in `pulsarDockerImageName`. by default it is `/pulsar`. + # if you are using your own built image in `pulsarDockerImageName`, you need to set this setting accordingly + pulsarRootDir: + # The config admin CLI allows users to customize the configuration of the admin cli tool, such as: + # `/bin/pulsar-admin and /bin/pulsarctl`. By default it is `/bin/pulsar-admin`. 
If you want to use `pulsarctl` + # you need to set this setting accordingly + configAdminCLI: + # this setting only takes effects if `k8Uri` is set to null. if your function worker is running as a k8 pod, + # setting this to true is let function worker to submit functions to the same k8s cluster as function worker + # is running. setting this to false if your function worker is not running as a k8 pod. + submittingInsidePod: false + # setting the pulsar service url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar service url configured in worker service + pulsarServiceUrl: + # setting the pulsar admin url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar admin url configured in worker service + pulsarAdminUrl: + # The flag indicates to install user code dependencies. (applied to python package) + installUserCodeDependencies: + # The repository that pulsar functions use to download python dependencies + pythonDependencyRepository: + # The repository that pulsar functions use to download extra python dependencies + pythonExtraDependencyRepository: + # the custom labels that function worker uses to select the nodes for pods + customLabels: + # The expected metrics collection interval, in seconds + expectedMetricsCollectionInterval: 30 + # Kubernetes Runtime will periodically checkback on + # this configMap if defined and if there are any changes + # to the kubernetes specific stuff, we apply those changes + changeConfigMap: + # The namespace for storing change config map + changeConfigMapNamespace: + # The ratio cpu request and cpu limit to be set for a function/source/sink. + # The formula for cpu request is cpuRequest = userRequestCpu / cpuOverCommitRatio + cpuOverCommitRatio: 1.0 + # The ratio memory request and memory limit to be set for a function/source/sink. 
+ # The formula for memory request is memoryRequest = userRequestMemory / memoryOverCommitRatio + memoryOverCommitRatio: 1.0 + # The port inside the function pod which is used by the worker to communicate with the pod + grpcPort: 9093 + # The port inside the function pod on which prometheus metrics are exposed + metricsPort: 9094 + # The directory inside the function pod where nar packages will be extracted + narExtractionDirectory: + # The classpath where function instance files are stored + functionInstanceClassPath: + # the directory for dropping extra function dependencies + # if it is not an absolute path, it is relative to `pulsarRootDir` + extraFunctionDependenciesDir: + # Additional memory padding added on top of the memory requested by the function on a per-instance basis + percentMemoryPadding: 10 + # The duration (in seconds) before the StatefulSet is deleted after a function stops or restarts. + # Value must be a non-negative integer. 0 indicates the StatefulSet is deleted immediately. + # Default is 5 seconds. + gracePeriodSeconds: 5 + +``` + +:::note + +`gracePeriodSeconds` is only available in 2.8.2 and later versions. + +::: + +If you run functions worker embedded in a broker on Kubernetes, you can use the default settings. + +### Run standalone functions worker on Kubernetes + +If you run functions worker standalone (that is, not embedded) on Kubernetes, you need to configure `pulsarServiceUrl` to be the URL of the broker and `pulsarAdminUrl` as the URL to the functions worker. + +For example, both Pulsar brokers and Function Workers run in the `pulsar` K8S namespace. The brokers have a service called `broker` and the functions worker has a service called `func-worker`.
The settings are as follows: + +```yaml + +pulsarServiceUrl: pulsar://broker.pulsar:6650 # or pulsar+ssl://broker.pulsar:6651 if using TLS +pulsarAdminUrl: http://func-worker.pulsar:8080 # or https://func-worker.pulsar:8443 if using TLS + +``` + +### Run RBAC in Kubernetes clusters + +If you run RBAC in your Kubernetes cluster, make sure that the service account you use for running functions workers (or brokers, if functions workers run along with brokers) has permissions on the following Kubernetes APIs. + +- services +- configmaps +- pods +- apps.statefulsets + +The following is sufficient: + +```yaml + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: functions-worker +rules: +- apiGroups: [""] + resources: + - services + - configmaps + - pods + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: functions-worker +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: functions-worker +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: functions-worker +subjects: +- kind: ServiceAccount + name: functions-worker + +``` + +If the service-account is not properly configured, an error message similar to this is displayed: + +```bash + +22:04:27.696 [Timer-0] ERROR org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory - Error while trying to fetch configmap example-pulsar-4qvmb5gur3c6fc9dih0x1xn8b-function-worker-config at namespace pulsar +io.kubernetes.client.ApiException: Forbidden + at io.kubernetes.client.ApiClient.handleResponse(ApiClient.java:882) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.ApiClient.execute(ApiClient.java:798) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMapWithHttpInfo(CoreV1Api.java:23673) ~[io.kubernetes-client-java-api-2.0.0.jar:?]
+ at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMap(CoreV1Api.java:23655) ~[io.kubernetes-client-java-api-2.0.0.jar:?] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory.fetchConfigMap(KubernetesRuntimeFactory.java:284) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory$1.run(KubernetesRuntimeFactory.java:275) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at java.util.TimerThread.mainLoop(Timer.java:555) [?:1.8.0_212] + at java.util.TimerThread.run(Timer.java:505) [?:1.8.0_212] + +``` + +### Integrate Kubernetes secrets + +In order to safely distribute secrets, Pulsar Functions can reference Kubernetes secrets. To enable this, set the `secretsProviderConfiguratorClassName` to `org.apache.pulsar.functions.secretsproviderconfigurator.KubernetesSecretsProviderConfigurator`. + +You can create a secret in the namespace where your functions are deployed. For example, you deploy functions to the `pulsar-func` Kubernetes namespace, and you have a secret named `database-creds` with a field name `password`, which you want to mount in the pod as an environment variable called `DATABASE_PASSWORD`. The following functions configuration enables you to reference that secret and mount the value as an environment variable in the pod. + +```Yaml + +tenant: "mytenant" +namespace: "mynamespace" +name: "myfunction" +topicName: "persistent://mytenant/mynamespace/myfuncinput" +className: "com.company.pulsar.myfunction" + +secrets: + # the secret will be mounted from the `password` field in the `database-creds` secret as an env var called `DATABASE_PASSWORD` + DATABASE_PASSWORD: + path: "database-creds" + key: "password" + +``` + +### Enable token authentication + +When you enable authentication for your Pulsar cluster, you need a mechanism for the pod running your function to authenticate with the broker. 
+ +The `org.apache.pulsar.functions.auth.KubernetesFunctionAuthProvider` interface provides support for any authentication mechanism. The `functionAuthProviderClassName` in `function-worker.yml` is used to specify your path to this implementation. + +Pulsar includes an implementation of this interface for token authentication, and distributes the certificate authority via the same implementation. The configuration is similar as follows: + +```Yaml + +functionAuthProviderClassName: org.apache.pulsar.functions.auth.KubernetesSecretsTokenAuthProvider + +``` + +For token authentication, the functions worker captures the token that is used to deploy (or update) the function. The token is saved as a secret and mounted into the pod. + +For custom authentication or TLS, you need to implement this interface or use an alternative mechanism to provide authentication. If you use token authentication and TLS encryption to secure the communication with the cluster, Pulsar passes your certificate authority (CA) to the client, so the client obtains what it needs to authenticate the cluster, and trusts the cluster with your signed certificate. + +:::note + +If you use tokens that expire when deploying functions, these tokens will expire. + +::: + +### Run clusters with authentication + +When you run a functions worker in a standalone process (that is, not embedded in the broker) in a cluster with authentication, you must configure your functions worker to interact with the broker and authenticate incoming requests. So you need to configure properties that the broker requires for authentication or authorization. + +For example, if you use token authentication, you need to configure the following properties in the `function-worker.yml` file. 
+ +```Yaml + +clientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationToken +clientAuthenticationParameters: file:///etc/pulsar/token/admin-token.txt +configurationStoreServers: zookeeper-cluster:2181 # auth requires a connection to zookeeper +authenticationProviders: + - "org.apache.pulsar.broker.authentication.AuthenticationProviderToken" +authorizationEnabled: true +authenticationEnabled: true +superUserRoles: + - superuser + - proxy +properties: + tokenSecretKey: file:///etc/pulsar/jwt/secret # if using a secret token, key file must be DER-encoded + tokenPublicKey: file:///etc/pulsar/jwt/public.key # if using public/private key tokens, key file must be DER-encoded + +``` + +:::note + +You must configure both the Function Worker authorization or authentication for the server to authenticate requests and configure the client to be authenticated to communicate with the broker. + +::: + +### Customize Kubernetes runtime + +The Kubernetes integration enables you to implement a class and customize how to generate manifests. You can configure it by setting `runtimeCustomizerClassName` in the `functions-worker.yml` file and use the fully qualified class name. You must implement the `org.apache.pulsar.functions.runtime.kubernetes.KubernetesManifestCustomizer` interface. + +The functions (and sinks/sources) API provides a flag, `customRuntimeOptions`, which is passed to this interface. + +To initialize the `KubernetesManifestCustomizer`, you can provide `runtimeCustomizerConfig` in the `functions-worker.yml` file. `runtimeCustomizerConfig` is passed to the `public void initialize(Map config)` function of the interface. `runtimeCustomizerConfig`is different from the `customRuntimeOptions` as `runtimeCustomizerConfig` is the same across all functions. If you provide both `runtimeCustomizerConfig` and `customRuntimeOptions`, you need to decide how to manage these two configurations in your implementation of `KubernetesManifestCustomizer`. 
+ +Pulsar includes a built-in implementation. To use the basic implementation, set `runtimeCustomizerClassName` to `org.apache.pulsar.functions.runtime.kubernetes.BasicKubernetesManifestCustomizer`. The built-in implementation initialized with `runtimeCustomizerConfig` enables you to pass a JSON document as `customRuntimeOptions` with certain properties to augment, which decides how the manifests are generated. If both `runtimeCustomizerConfig` and `customRuntimeOptions` are provided, `BasicKubernetesManifestCustomizer` uses `customRuntimeOptions` to override the configuration if there are conflicts in these two configurations. + +Below is an example of `customRuntimeOptions`. + +```json + +{ + "jobName": "jobname", // the k8s pod name to run this function instance + "jobNamespace": "namespace", // the k8s namespace to run this function in + "extraLabels": { // extra labels to attach to the statefulSet, service, and pods + "extraLabel": "value" + }, + "extraAnnotations": { // extra annotations to attach to the statefulSet, service, and pods + "extraAnnotation": "value" + }, + "nodeSelectorLabels": { // node selector labels to add on to the pod spec + "customLabel": "value" + }, + "tolerations": [ // tolerations to add to the pod spec + { + "key": "custom-key", + "value": "value", + "effect": "NoSchedule" + } + ], + "resourceRequirements": { // values for cpu and memory should be defined as described here: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container + "requests": { + "cpu": 1, + "memory": "4G" + }, + "limits": { + "cpu": 2, + "memory": "8G" + } + } +} + +``` + +## Run clusters with geo-replication + +If you run multiple clusters tied together with geo-replication, it is important to use a different function namespace for each cluster. Otherwise, the functions share a namespace and are potentially scheduled across clusters.
+ +For example, if you have two clusters: `east-1` and `west-1`, you can configure the functions workers for `east-1` and `west-1` respectively as follows. + +```Yaml + +pulsarFunctionsCluster: east-1 +pulsarFunctionsNamespace: public/functions-east-1 + +``` + +```Yaml + +pulsarFunctionsCluster: west-1 +pulsarFunctionsNamespace: public/functions-west-1 + +``` + +This ensures the two different Functions Workers use distinct sets of topics for their internal coordination. + +## Configure standalone functions worker + +When configuring a standalone functions worker, you need to configure properties that the broker requires, especially if you use TLS. Then the functions worker can communicate with the broker. + +You need to configure the following required properties. + +```Yaml + +workerPort: 8080 +workerPortTls: 8443 # when using TLS +tlsCertificateFilePath: /etc/pulsar/tls/tls.crt # when using TLS +tlsKeyFilePath: /etc/pulsar/tls/tls.key # when using TLS +tlsTrustCertsFilePath: /etc/pulsar/tls/ca.crt # when using TLS +pulsarServiceUrl: pulsar://broker.pulsar:6650/ # or pulsar+ssl://pulsar-prod-broker.pulsar:6651/ when using TLS +pulsarWebServiceUrl: http://broker.pulsar:8080/ # or https://pulsar-prod-broker.pulsar:8443/ when using TLS +useTls: true # when using TLS, critical! + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/functions-worker.md b/site2/website/versioned_docs/version-2.8.x/functions-worker.md new file mode 100644 index 0000000000000..1ad643cee8431 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/functions-worker.md @@ -0,0 +1,386 @@ +--- +id: functions-worker +title: Deploy and manage functions worker +sidebar_label: "Setup: Pulsar Functions Worker" +original_id: functions-worker +--- +Before using Pulsar Functions, you need to learn how to set up Pulsar Functions worker and how to [configure Functions runtime](functions-runtime.md).
+ +Pulsar `functions-worker` is a logic component to run Pulsar Functions in cluster mode. Two options are available, and you can select either based on your requirements. +- [run with brokers](#run-functions-worker-with-brokers) +- [run it separately](#run-functions-worker-separately) in a different broker + +:::note + +The `--- Service Urls---` lines in the following diagrams represent Pulsar service URLs that Pulsar client and admin use to connect to a Pulsar cluster. + +::: + +## Run Functions-worker with brokers + +The following diagram illustrates the deployment of functions-workers running along with brokers. + +![assets/functions-worker-corun.png](/assets/functions-worker-corun.png) + +To enable functions-worker running as part of a broker, you need to set `functionsWorkerEnabled` to `true` in the `broker.conf` file. + +```conf + +functionsWorkerEnabled=true + +``` + +If the `functionsWorkerEnabled` is set to `true`, the functions-worker is started as part of a broker. You need to configure the `conf/functions_worker.yml` file to customize your functions_worker. + +Before you run Functions-worker with broker, you have to configure Functions-worker, and then start it with brokers. + +### Configure Functions-Worker to run with brokers +In this mode, most of the settings are already inherited from your broker configuration (for example, configurationStore settings, authentication settings, and so on) since `functions-worker` is running as part of the broker. + +Pay attention to the following required settings when configuring functions-worker in this mode. + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`, which is good for standalone deployment. For production deployment, to ensure high availability, set it to be larger than `2`. +- `initializedDlogMetadata`: Whether to initialize distributed log metadata in runtime. 
If it is set to `true`, you must ensure that it has been initialized by the `bin/pulsar initialize-cluster-metadata` command. + +If authentication is enabled on the BookKeeper cluster, configure the following BookKeeper authentication settings. + +- `bookkeeperClientAuthenticationPlugin`: the BookKeeper client authentication plugin name. +- `bookkeeperClientAuthenticationParametersName`: the BookKeeper client authentication plugin parameters name. +- `bookkeeperClientAuthenticationParameters`: the BookKeeper client authentication plugin parameters. + +### Configure Stateful-Functions to run with broker + +If you want to use Stateful-Functions related functions (for example, `putState()` and `queryState()` related interfaces), follow the steps below. + +1. Enable the **streamStorage** service in the BookKeeper. + + Currently, the service uses the NAR package, so you need to set the configuration in `bookkeeper.conf`. + + ```text + + extraServerComponents=org.apache.bookkeeper.stream.server.StreamStorageLifecycleComponent + + ``` + + After starting bookie, use the following methods to check whether the streamStorage service is started correctly. + + Input: + + ```shell + + telnet localhost 4181 + + ``` + + Output: + + ```text + + Trying 127.0.0.1... + Connected to localhost. + Escape character is '^]'. + + ``` + +2. Turn on this function in `functions_worker.yml`. + + ```text + + stateStorageServiceUrl: bk://<bk-service-url>:4181 + + ``` + + `bk-service-url` is the service URL pointing to the BookKeeper table service. + +### Start Functions-worker with broker + +Once you have configured the `functions_worker.yml` file, you can start or restart your broker. + +And then you can use the following command to verify if `functions-worker` is running well. + +```bash + +curl <broker-ip>:8080/admin/v2/worker/cluster + +``` + +After entering the command above, a list of active function workers in the cluster is returned. The output is similar to the following.
+ +```json + +[{"workerId":"","workerHostname":"","port":8080}] + +``` + +## Run Functions-worker separately + +This section illustrates how to run `functions-worker` as a separate process in separate machines. + +![assets/functions-worker-separated.png](/assets/functions-worker-separated.png) + +:::note + +In this mode, make sure `functionsWorkerEnabled` is set to `false`, so you won't start `functions-worker` with brokers by mistake. Also, while accessing the `functions-worker` to manage any of the functions, the `pulsar-admin` CLI tool or any of the clients should use the `workerHostname` and `workerPort` that you set in [Worker parameters](#worker-parameters) to generate an `--admin-url`. + +::: + +### Configure Functions-worker to run separately + +To run function-worker separately, you have to configure the following parameters. + +#### Worker parameters + +- `workerId`: The type is string. It is unique across clusters, which is used to identify a worker machine. +- `workerHostname`: The hostname of the worker machine. +- `workerPort`: The port that the worker server listens on. Keep it as default if you don't customize it. +- `workerPortTls`: The TLS port that the worker server listens on. Keep it as default if you don't customize it. + +#### Function package parameter + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`. + +#### Function metadata parameter + +- `pulsarServiceUrl`: The Pulsar service URL for your broker cluster. +- `pulsarWebServiceUrl`: The Pulsar web service URL for your broker cluster. +- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). + +If authentication is enabled for your broker cluster, you *should* configure the authentication plugin and parameters for the functions worker to communicate with the brokers. 
+ +- `brokerClientAuthenticationEnabled`: Whether to enable the broker client authentication used by function workers to talk to brokers. +- `clientAuthenticationPlugin`: The authentication plugin to be used by the Pulsar client used in worker service. +- `clientAuthenticationParameters`: The authentication parameter to be used by the Pulsar client used in worker service. + +#### Security settings + +If you want to enable security on functions workers, you *should*: +- [Enable TLS transport encryption](#enable-tls-transport-encryption) +- [Enable Authentication Provider](#enable-authentication-provider) +- [Enable Authorization Provider](#enable-authorization-provider) +- [Enable End-to-End Encryption](#enable-end-to-end-encryption) + +##### Enable TLS transport encryption + +To enable TLS transport encryption, configure the following settings. + +``` + +useTLS: true +pulsarServiceUrl: pulsar+ssl://localhost:6651/ +pulsarWebServiceUrl: https://localhost:8443 + +tlsEnabled: true +tlsCertificateFilePath: /path/to/functions-worker.cert.pem +tlsKeyFilePath: /path/to/functions-worker.key-pk8.pem +tlsTrustCertsFilePath: /path/to/ca.cert.pem + +// The path to trusted certificates used by the Pulsar client to authenticate with Pulsar brokers +brokerClientTrustCertsFilePath: /path/to/ca.cert.pem + +``` + +For details on TLS encryption, refer to [Transport Encryption using TLS](security-tls-transport.md). + +##### Enable Authentication Provider + +To enable authentication on Functions Worker, you need to configure the following settings. + +:::note + +Substitute the *providers list* with the providers you want to enable. + +::: + +``` + +authenticationEnabled: true +authenticationProviders: [ provider1, provider2 ] + +``` + +For *TLS Authentication* provider, follow the example below to add the necessary settings. +See [TLS Authentication](security-tls-authentication.md) for more details. 
+ +``` + +brokerClientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters: tlsCertFile:/path/to/admin.cert.pem,tlsKeyFile:/path/to/admin.key-pk8.pem + +authenticationEnabled: true +authenticationProviders: ['org.apache.pulsar.broker.authentication.AuthenticationProviderTls'] + +``` + +For *SASL Authentication* provider, add `saslJaasClientAllowedIds` and `saslJaasBrokerSectionName` +under `properties` if needed. + +``` + +properties: + saslJaasClientAllowedIds: .*pulsar.* + saslJaasBrokerSectionName: Broker + +``` + +For *Token Authentication* provider, add necessary settings for `properties` if needed. +See [Token Authentication](security-jwt.md) for more details. +Note: key files must be DER-encoded + +``` + +properties: + tokenSecretKey: file://my/secret.key + # If using public/private + # tokenPublicKey: file:///path/to/public.key + +``` + +##### Enable Authorization Provider + +To enable authorization on Functions Worker, you need to configure `authorizationEnabled`, `authorizationProvider` and `configurationStoreServers`. The authentication provider connects to `configurationStoreServers` to receive namespace policies. + +```yaml + +authorizationEnabled: true +authorizationProvider: org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider +configurationStoreServers: + +``` + +You should also configure a list of superuser roles. The superuser roles are able to access any admin API. The following is a configuration example. + +```yaml + +superUserRoles: + - role1 + - role2 + - role3 + +``` + +##### Enable End-to-End Encryption + +You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. 
+ +To enable End-to-End encryption on Functions Worker, you can set it by specifying `--producer-config` in the command line terminal, for more information, please refer to [here](security-encryption.md). + +We include the relevant configuration information of `CryptoConfig` into `ProducerConfig`. The specific configurable field information about `CryptoConfig` is as follows: + +```text + +public class CryptoConfig { + private String cryptoKeyReaderClassName; + private Map cryptoKeyReaderConfig; + + private String[] encryptionKeys; + private ProducerCryptoFailureAction producerCryptoFailureAction; + + private ConsumerCryptoFailureAction consumerCryptoFailureAction; +} + +``` + +- `producerCryptoFailureAction`: define the action if producer fail to encrypt data one of `FAIL`, `SEND`. +- `consumerCryptoFailureAction`: define the action if consumer fail to decrypt data one of `FAIL`, `DISCARD`, `CONSUME`. + +#### BookKeeper Authentication + +If authentication is enabled on the BookKeeper cluster, you need configure the BookKeeper authentication settings as follows: + +- `bookkeeperClientAuthenticationPlugin`: the plugin name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParametersName`: the plugin parameters name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParameters`: the plugin parameters of BookKeeper client authentication. 
+ +### Start Functions-worker + +Once you have finished configuring the `functions_worker.yml` configuration file, you can start a `functions-worker` in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +bin/pulsar-daemon start functions-worker + +``` + +You can also start `functions-worker` in the foreground by using `pulsar` CLI tool: + +```bash + +bin/pulsar functions-worker + +``` + +### Configure Proxies for Functions-workers + +When you are running `functions-worker` in a separate cluster, the admin rest endpoints are split into two clusters. `functions`, `function-worker`, `source` and `sink` endpoints are now served +by the `functions-worker` cluster, while all the other remaining endpoints are served by the broker cluster. +Hence you need to configure your `pulsar-admin` to use the right service URL accordingly. + +In order to address this inconvenience, you can start a proxy cluster for routing the admin rest requests accordingly. Hence you will have one central entry point for your admin service. + +If you already have a proxy cluster, continue reading. If you haven't setup a proxy cluster before, you can follow the [instructions](http://pulsar.apache.org/docs/en/administration-proxy/) to +start proxies. + +![assets/functions-worker-separated.png](/assets/functions-worker-separated-proxy.png) + +To enable routing functions related admin requests to `functions-worker` in a proxy, you can edit the `proxy.conf` file to modify the following settings: + +```conf + +functionWorkerWebServiceURL= +functionWorkerWebServiceURLTLS= + +``` + +## Compare the Run-with-Broker and Run-separately modes + +As described above, you can run Function-worker with brokers, or run it separately. And it is more convenient to run functions-workers along with brokers. 
However, running functions-workers in a separate cluster provides better resource isolation for running functions in `Process` or `Thread` mode. + +To determine which mode to use for your case, refer to the following guidelines. + +Use the `Run-with-Broker` mode in the following cases: +- a) if resource isolation is not required when running functions in `Process` or `Thread` mode; +- b) if you configure the functions-worker to run functions on Kubernetes (where the resource isolation problem is addressed by Kubernetes). + +Use the `Run-separately` mode in the following cases: +- a) if you don't have a Kubernetes cluster; +- b) if you want to run functions and brokers separately. + +## Troubleshooting + +**Error message: Namespace missing local cluster name in clusters list** + +``` + +Failed to get partitioned topic metadata: org.apache.pulsar.client.api.PulsarClientException$BrokerMetadataException: Namespace missing local cluster name in clusters list: local_cluster=xyz ns=public/functions clusters=[standalone] + +``` + +The error message appears when either of the following cases occurs: +- a) a broker is started with `functionsWorkerEnabled=true`, but the `pulsarFunctionsCluster` is not set to the correct cluster in the `conf/functions_worker.yml` file; +- b) setting up a geo-replicated Pulsar cluster with `functionsWorkerEnabled=true`, while brokers in one cluster run well, brokers in the other cluster do not work well. + +**Workaround** + +If any of these cases happens, follow the instructions below to fix the problem: + +1. Disable Functions Worker by setting `functionsWorkerEnabled=false`, and restart brokers. + +2. Get the current clusters list of `public/functions` namespace. + +```bash + +bin/pulsar-admin namespaces get-clusters public/functions + +``` + +3. Check if the cluster is in the clusters list. If the cluster is not in the list, add it to the list and update the clusters list. 
+ +```bash + +bin/pulsar-admin namespaces set-clusters --clusters , public/functions + +``` + +4. After setting the cluster successfully, enable functions worker by setting `functionsWorkerEnabled=true`. + +5. Set the correct cluster name in `pulsarFunctionsCluster` in the `conf/functions_worker.yml` file, and restart brokers. diff --git a/site2/website/versioned_docs/version-2.8.x/getting-started-concepts-and-architecture.md b/site2/website/versioned_docs/version-2.8.x/getting-started-concepts-and-architecture.md new file mode 100644 index 0000000000000..fe9c3fbc553b2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/getting-started-concepts-and-architecture.md @@ -0,0 +1,16 @@ +--- +id: concepts-architecture +title: Pulsar concepts and architecture +sidebar_label: "Concepts and architecture" +original_id: concepts-architecture +--- + + + + + + + + + + diff --git a/site2/website/versioned_docs/version-2.8.x/getting-started-docker.md b/site2/website/versioned_docs/version-2.8.x/getting-started-docker.md new file mode 100644 index 0000000000000..4f20971d75330 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/getting-started-docker.md @@ -0,0 +1,176 @@ +--- +id: getting-started-docker +title: Set up a standalone Pulsar in Docker +sidebar_label: "Run Pulsar in Docker" +original_id: getting-started-docker +--- + +For local development and testing, you can run Pulsar in standalone +mode on your own machine within a Docker container. + +If you have not installed Docker, download the [Community edition](https://www.docker.com/community-edition) +and follow the instructions for your OS. 
+ +## Start Pulsar in Docker + +* For MacOS, Linux, and Windows: + + ```shell + + $ docker run -it \ + -p 6650:6650 \ + -p 8080:8080 \ + --mount source=pulsardata,target=/pulsar/data \ + --mount source=pulsarconf,target=/pulsar/conf \ + apachepulsar/pulsar:@pulsar:version@ \ + bin/pulsar standalone + + ``` + +A few things to note about this command: + * The data, metadata, and configuration are persisted on Docker volumes in order to not start "fresh" every +time the container is restarted. For details on the volumes, you can use `docker volume inspect ` + * For Docker on Windows, make sure to configure it to use Linux containers. + +If you start Pulsar successfully, you will see `INFO`-level log messages like this: + +``` + +2017-08-09 22:34:04,030 - INFO - [main:WebService@213] - Web Service started at http://127.0.0.1:8080 +2017-08-09 22:34:04,038 - INFO - [main:PulsarService@335] - messaging service is ready, bootstrap service on port=8080, broker url=pulsar://127.0.0.1:6650, cluster=standalone, configs=org.apache.pulsar.broker.ServiceConfiguration@4db60246 +... + +``` + +:::tip + +When you start a local standalone cluster, a `public/default` namespace is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). + +::: + +## Use Pulsar in Docker + +Pulsar offers client libraries for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) +and [C++](client-libraries-cpp.md). If you're running a local standalone cluster, you can +use one of these root URLs to interact with your cluster: + +* `pulsar://localhost:6650` +* `http://localhost:8080` + +The following example guides you through getting started with Pulsar quickly by using the [Python](client-libraries-python.md) +client API. 
+ +Install the Pulsar Python client library directly from [PyPI](https://pypi.org/project/pulsar-client/): + +```shell + +$ pip install pulsar-client + +``` + +### Consume a message + +Create a consumer and subscribe to the topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() + +``` + +### Produce a message + +Now start a producer to send some test messages: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('hello-pulsar-%d' % i).encode('utf-8')) + +client.close() + +``` + +## Get the topic statistics + +In Pulsar, you can use REST, Java, or command-line tools to control every aspect of the system. +For details on APIs, refer to [Admin API Overview](admin-api-overview.md). 
+ +In the simplest example, you can use curl to probe the stats for a particular topic: + +```shell + +$ curl http://localhost:8080/admin/v2/persistent/public/default/my-topic/stats | python -m json.tool + +``` + +The output is something like this: + +```json + +{ + "averageMsgSize": 0.0, + "msgRateIn": 0.0, + "msgRateOut": 0.0, + "msgThroughputIn": 0.0, + "msgThroughputOut": 0.0, + "publishers": [ + { + "address": "/172.17.0.1:35048", + "averageMsgSize": 0.0, + "clientVersion": "1.19.0-incubating", + "connectedSince": "2017-08-09 20:59:34.621+0000", + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "producerId": 0, + "producerName": "standalone-0-1" + } + ], + "replication": {}, + "storageSize": 16, + "subscriptions": { + "my-sub": { + "blockedSubscriptionOnUnackedMsgs": false, + "consumers": [ + { + "address": "/172.17.0.1:35064", + "availablePermits": 996, + "blockedConsumerOnUnackedMsgs": false, + "clientVersion": "1.19.0-incubating", + "connectedSince": "2017-08-09 21:05:39.222+0000", + "consumerName": "166111", + "msgRateOut": 0.0, + "msgRateRedeliver": 0.0, + "msgThroughputOut": 0.0, + "unackedMessages": 0 + } + ], + "msgBacklog": 0, + "msgRateExpired": 0.0, + "msgRateOut": 0.0, + "msgRateRedeliver": 0.0, + "msgThroughputOut": 0.0, + "type": "Exclusive", + "unackedMessages": 0 + } + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/getting-started-helm.md b/site2/website/versioned_docs/version-2.8.x/getting-started-helm.md new file mode 100644 index 0000000000000..440087c275c05 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/getting-started-helm.md @@ -0,0 +1,441 @@ +--- +id: getting-started-helm +title: Get started in Kubernetes +sidebar_label: "Run Pulsar in Kubernetes" +original_id: getting-started-helm +--- + +This section guides you through every step of installing and running Apache Pulsar with Helm on Kubernetes quickly, including the following sections: + +- Install the Apache Pulsar on Kubernetes using Helm +- Start 
and stop Apache Pulsar +- Create topics using `pulsar-admin` +- Produce and consume messages using Pulsar clients +- Monitor Apache Pulsar status with Prometheus and Grafana + +For deploying a Pulsar cluster for production usage, read the documentation on [how to configure and install a Pulsar Helm chart](helm-deploy.md). + +## Prerequisite + +- Kubernetes server 1.14.0+ +- kubectl 1.14.0+ +- Helm 3.0+ + +:::tip + +For the following steps, step 2 and step 3 are for **developers** and step 4 and step 5 are for **administrators**. + +::: + +## Step 0: Prepare a Kubernetes cluster + +Before installing a Pulsar Helm chart, you have to create a Kubernetes cluster. You can follow [the instructions](helm-prepare.md) to prepare a Kubernetes cluster. + +We use [Minikube](https://minikube.sigs.k8s.io/docs/start/) in this quick start guide. To prepare a Kubernetes cluster, follow these steps: + +1. Create a Kubernetes cluster on Minikube. + + ```bash + + minikube start --memory=8192 --cpus=4 --kubernetes-version= + + ``` + + The `` can be any [Kubernetes version supported by your Minikube installation](https://minikube.sigs.k8s.io/docs/reference/configuration/kubernetes/), such as `v1.16.1`. + +2. Set `kubectl` to use Minikube. + + ```bash + + kubectl config use-context minikube + + ``` + +3. To use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) with the local Kubernetes cluster on Minikube, enter the command below: + + ```bash + + minikube dashboard + + ``` + + The command automatically triggers opening a webpage in your browser. + +## Step 1: Install Pulsar Helm chart + +0. Add Pulsar charts repo. + + ```bash + + helm repo add apache https://pulsar.apache.org/charts + + ``` + + ```bash + + helm repo update + + ``` + +1. Clone the Pulsar Helm chart repository. + + ```bash + + git clone https://github.com/apache/pulsar-helm-chart + cd pulsar-helm-chart + + ``` + +2. 
Run the script `prepare_helm_release.sh` to create secrets required for installing the Apache Pulsar Helm chart. The username `pulsar` and password `pulsar` are used for logging into the Grafana dashboard and Pulsar Manager. + + > **NOTE** + > When running the script, you can use `-n` to specify the Kubernetes namespace where the Pulsar Helm chart is installed, `-k` to define the Pulsar Helm release name, and `-c` to create the Kubernetes namespace. For more information about the script, run `./scripts/pulsar/prepare_helm_release.sh --help`. + + ```bash + + ./scripts/pulsar/prepare_helm_release.sh \ + -n pulsar \ + -k pulsar-mini \ + -c + + ``` + +3. Use the Pulsar Helm chart to install a Pulsar cluster to Kubernetes. + + > **NOTE** + > You need to specify `--set initialize=true` when installing Pulsar the first time. This command installs and starts Apache Pulsar. + + ```bash + + helm install \ + --values examples/values-minikube.yaml \ + --set initialize=true \ + --namespace pulsar \ + pulsar-mini apache/pulsar + + ``` + +4. Check the status of all pods. + + ```bash + + kubectl get pods -n pulsar + + ``` + + If all pods start up successfully, you can see that the `STATUS` is changed to `Running` or `Completed`. + + **Output** + + ```bash + + NAME READY STATUS RESTARTS AGE + pulsar-mini-bookie-0 1/1 Running 0 9m27s + pulsar-mini-bookie-init-5gphs 0/1 Completed 0 9m27s + pulsar-mini-broker-0 1/1 Running 0 9m27s + pulsar-mini-grafana-6b7bcc64c7-4tkxd 1/1 Running 0 9m27s + pulsar-mini-prometheus-5fcf5dd84c-w8mgz 1/1 Running 0 9m27s + pulsar-mini-proxy-0 1/1 Running 0 9m27s + pulsar-mini-pulsar-init-t7cqt 0/1 Completed 0 9m27s + pulsar-mini-pulsar-manager-9bcbb4d9f-htpcs 1/1 Running 0 9m27s + pulsar-mini-toolset-0 1/1 Running 0 9m27s + pulsar-mini-zookeeper-0 1/1 Running 0 9m27s + + ``` + +5. Check the status of all services in the namespace `pulsar`. 
+ + ```bash + + kubectl get services -n pulsar + + ``` + + **Output** + + ```bash + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + pulsar-mini-bookie ClusterIP None 3181/TCP,8000/TCP 11m + pulsar-mini-broker ClusterIP None 8080/TCP,6650/TCP 11m + pulsar-mini-grafana LoadBalancer 10.106.141.246 3000:31905/TCP 11m + pulsar-mini-prometheus ClusterIP None 9090/TCP 11m + pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 11m + pulsar-mini-pulsar-manager LoadBalancer 10.103.192.175 9527:30190/TCP 11m + pulsar-mini-toolset ClusterIP None 11m + pulsar-mini-zookeeper ClusterIP None 2888/TCP,3888/TCP,2181/TCP 11m + + ``` + +## Step 2: Use pulsar-admin to create Pulsar tenants/namespaces/topics + +`pulsar-admin` is the CLI (command-Line Interface) tool for Pulsar. In this step, you can use `pulsar-admin` to create resources, including tenants, namespaces, and topics. + +1. Enter the `toolset` container. + + ```bash + + kubectl exec -it -n pulsar pulsar-mini-toolset-0 -- /bin/bash + + ``` + +2. In the `toolset` container, create a tenant named `apache`. + + ```bash + + bin/pulsar-admin tenants create apache + + ``` + + Then you can list the tenants to see if the tenant is created successfully. + + ```bash + + bin/pulsar-admin tenants list + + ``` + + You should see a similar output as below. The tenant `apache` has been successfully created. + + ```bash + + "apache" + "public" + "pulsar" + + ``` + +3. In the `toolset` container, create a namespace named `pulsar` in the tenant `apache`. + + ```bash + + bin/pulsar-admin namespaces create apache/pulsar + + ``` + + Then you can list the namespaces of tenant `apache` to see if the namespace is created successfully. + + ```bash + + bin/pulsar-admin namespaces list apache + + ``` + + You should see a similar output as below. The namespace `apache/pulsar` has been successfully created. + + ```bash + + "apache/pulsar" + + ``` + +4. 
In the `toolset` container, create a topic `test-topic` with `4` partitions in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics create-partitioned-topic apache/pulsar/test-topic -p 4 + + ``` + +5. In the `toolset` container, list all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics list-partitioned-topics apache/pulsar + + ``` + + Then you can see all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + "persistent://apache/pulsar/test-topic" + + ``` + +## Step 3: Use Pulsar client to produce and consume messages + +You can use the Pulsar client to create producers and consumers to produce and consume messages. + +By default, the Pulsar Helm chart exposes the Pulsar cluster through a Kubernetes `LoadBalancer`. In Minikube, you can use the following command to check the proxy service. + +```bash + +kubectl get services -n pulsar | grep pulsar-mini-proxy + +``` + +You will see a similar output as below. + +```bash + +pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 28m + +``` + +This output tells what are the node ports that Pulsar cluster's binary port and HTTP port are mapped to. The port after `80:` is the HTTP port while the port after `6650:` is the binary port. + +Then you can find the IP address and exposed ports of your Minikube server by running the following command. + +```bash + +minikube service pulsar-mini-proxy -n pulsar + +``` + +**Output** + +```bash + +|-----------|-------------------|-------------|-------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|-------------------------| +| pulsar | pulsar-mini-proxy | http/80 | http://172.17.0.4:32305 | +| | | pulsar/6650 | http://172.17.0.4:31816 | +|-----------|-------------------|-------------|-------------------------| +🏃 Starting tunnel for service pulsar-mini-proxy. 
+|-----------|-------------------|-------------|------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|------------------------| +| pulsar | pulsar-mini-proxy | | http://127.0.0.1:61853 | +| | | | http://127.0.0.1:61854 | +|-----------|-------------------|-------------|------------------------| + +``` + +At this point, you can get the service URLs to connect to your Pulsar client. Here are URL examples: + +``` + +webServiceUrl=http://127.0.0.1:61853/ +brokerServiceUrl=pulsar://127.0.0.1:61854/ + +``` + +Then you can proceed with the following steps: + +1. Download the Apache Pulsar tarball from the [downloads page](https://pulsar.apache.org/download/). + +2. Decompress the tarball based on your download file. + + ```bash + + tar -xf .tar.gz + + ``` + +3. Expose `PULSAR_HOME`. + + (1) Enter the directory of the decompressed download file. + + (2) Expose `PULSAR_HOME` as the environment variable. + + ```bash + + export PULSAR_HOME=$(pwd) + + ``` + +4. Configure the Pulsar client. + + In the `${PULSAR_HOME}/conf/client.conf` file, replace `webServiceUrl` and `brokerServiceUrl` with the service URLs you get from the above steps. + +5. Create a subscription to consume messages from `apache/pulsar/test-topic`. + + ```bash + + bin/pulsar-client consume -s sub apache/pulsar/test-topic -n 0 + + ``` + +6. Open a new terminal. In the new terminal, create a producer and send 10 messages to the `test-topic` topic. + + ```bash + + bin/pulsar-client produce apache/pulsar/test-topic -m "---------hello apache pulsar-------" -n 10 + + ``` + +7. Verify the results. + + - From the producer side + + **Output** + + The messages have been produced successfully. + + ```bash + + 18:15:15.489 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 10 messages successfully produced + + ``` + + - From the consumer side + + **Output** + + At the same time, you can receive the messages as below. 
+ + ```bash + + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + + ``` + +## Step 4: Use Pulsar Manager to manage the cluster + +[Pulsar Manager](administration-pulsar-manager.md) is a web-based GUI management tool for managing and monitoring Pulsar. + +1. By default, the `Pulsar Manager` is exposed as a separate `LoadBalancer`. You can open the Pulsar Manager UI using the following command: + + ```bash + + minikube service -n pulsar pulsar-mini-pulsar-manager + + ``` + +2. The Pulsar Manager UI will be open in your browser. You can use the username `pulsar` and password `pulsar` to log into Pulsar Manager. + +3. In Pulsar Manager UI, you can create an environment. + + - Click `New Environment` button in the top-left corner. + - Type `pulsar-mini` for the field `Environment Name` in the popup window. + - Type `http://pulsar-mini-broker:8080` for the field `Service URL` in the popup window. + - Click `Confirm` button in the popup window. + +4. After successfully creating an environment, you are redirected to the `tenants` page of that environment. Then you can create `tenants`, `namespaces` and `topics` using the Pulsar Manager. + +## Step 5: Use Prometheus and Grafana to monitor cluster + +Grafana is an open-source visualization tool, which can be used for visualizing time series data into dashboards. + +1. By default, the Grafana is exposed as a separate `LoadBalancer`. 
You can open the Grafana UI using the following command: + + ```bash + + minikube service pulsar-mini-grafana -n pulsar + + ``` + +2. The Grafana UI is open in your browser. You can use the username `pulsar` and password `pulsar` to log into the Grafana Dashboard. + +3. You can view dashboards for different components of a Pulsar cluster. diff --git a/site2/website/versioned_docs/version-2.8.x/getting-started-pulsar.md b/site2/website/versioned_docs/version-2.8.x/getting-started-pulsar.md new file mode 100644 index 0000000000000..752590f57b558 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/getting-started-pulsar.md @@ -0,0 +1,72 @@ +--- +id: pulsar-2.0 +title: Pulsar 2.0 +sidebar_label: "Pulsar 2.0" +original_id: pulsar-2.0 +--- + +Pulsar 2.0 is a major new release for Pulsar that brings some bold changes to the platform, including [simplified topic names](#topic-names), the addition of the [Pulsar Functions](functions-overview.md) feature, some terminology changes, and more. + +## New features in Pulsar 2.0 + +Feature | Description +:-------|:----------- +[Pulsar Functions](functions-overview.md) | A lightweight compute option for Pulsar + +## Major changes + +There are a few major changes that you should be aware of, as they may significantly impact your day-to-day usage. + +### Properties versus tenants + +Previously, Pulsar had a concept of properties. A property is essentially the exact same thing as a tenant, so the "property" terminology has been removed in version 2.0. The [`pulsar-admin properties`](reference-pulsar-admin.md#pulsar-admin) command-line interface, for example, has been replaced with the [`pulsar-admin tenants`](reference-pulsar-admin.md#pulsar-admin-tenants) interface. In some cases the properties terminology is still used but is now considered deprecated and will be removed entirely in a future release. 
+ +### Topic names + +Prior to version 2.0, *all* Pulsar topics had the following form: + +```http + +{persistent|non-persistent}://property/cluster/namespace/topic + +``` + +Four important changes have been made in Pulsar 2.0: + +* There is no longer a [cluster component](#no-cluster-component) +* Properties have been [renamed to tenants](#properties-versus-tenants) +* You can use a [flexible](#flexible-topic-naming) naming system to shorten many topic names +* `/` is not allowed in topic names + +#### No cluster component + +The cluster component has been removed from topic names. Thus, all topic names now have the following form: + +```http + +{persistent|non-persistent}://tenant/namespace/topic + +``` + +> Existing topics that use the legacy name format will continue to work without any change, and there are no plans to change that. + + +#### Flexible topic naming + +All topic names in Pulsar 2.0 internally have the form shown [above](#no-cluster-component) but you can now use shorthand names in many cases (for the sake of simplicity). The flexible naming system stems from the fact that there is now a default topic type, tenant, and namespace: + +Topic aspect | Default +:------------|:------- +topic type | `persistent` +tenant | `public` +namespace | `default` + +The table below shows some example topic name translations that use implicit defaults: + +Input topic name | Translated topic name +:----------------|:--------------------- +`my-topic` | `persistent://public/default/my-topic` +`my-tenant/my-namespace/my-topic` | `persistent://my-tenant/my-namespace/my-topic` + +> For [non-persistent topics](concepts-messaging.md#non-persistent-topics) you'll need to continue to specify the entire topic name, as the default-based rules for persistent topic names don't apply. 
Thus you cannot use a shorthand name like `non-persistent://my-topic` and would need to use `non-persistent://public/default/my-topic` instead. + diff --git a/site2/website/versioned_docs/version-2.8.x/getting-started-standalone.md b/site2/website/versioned_docs/version-2.8.x/getting-started-standalone.md new file mode 100644 index 0000000000000..fd7a6f24995b3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/getting-started-standalone.md @@ -0,0 +1,271 @@ +--- +id: getting-started-standalone +title: Set up a standalone Pulsar locally +sidebar_label: "Run Pulsar locally" +original_id: getting-started-standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. The standalone mode includes a Pulsar broker and the necessary ZooKeeper and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> #### Pulsar in production? +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of the installation process. + +### System requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions; JRE/JDK 11 is recommended. + +:::tip + +By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in `conf/pulsar_env.sh` file under `PULSAR_MEM`. These are extra options passed into the JVM. + +::: + +:::note + +Broker is only supported on 64-bit JVM. 
+ +::: + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar @pulsar:version@ binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:binary_release_url + + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/). +`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more. +`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) example. +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the`.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by ZooKeeper and BookKeeper. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`logs` | Logs created by the installation. 
+ +:::tip + +If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +* [Install builtin connectors (optional)](#install-builtin-connectors-optional) +* [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing builtin connectors and tiered storage offloaders. + +::: + +### Install builtin connectors (optional) + +Since `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. +To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. +For example, if you download the `pulsar-io-aerospike-@pulsar:version@.nar` connector file, enter the following commands: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure `connectors` tarball is unzipped in every pulsar directory of the broker +(or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. 
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), +you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +::: + +### Install tiered storage offloaders (optional) + +:::tip + +Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +To enable tiered storage feature, follow the instructions below; otherwise skip this section. + +::: + +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. 
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), +you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +::: + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. + +```bash + +$ bin/pulsar standalone + +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash + +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@95] - Configuration Store cache started +2017-06-01 14:46:29,192 - INFO - [main:AuthenticationService@61] - Authentication is disabled +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@108] - Pulsar WebSocket Service started + +``` + +:::tip + +* The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. + +::: + +You can also run the service as a background process using the `pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). 
+ +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. + +### Consume a message + +The following command consumes a message from the `my-topic` topic with the subscription name `first-subscription`: + +```bash + +$ bin/pulsar-client consume my-topic -s "first-subscription" + +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +09:56:55.566 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.MultiTopicsConsumerImpl - [TopicsConsumerFakeTopicNamee2df9] [first-subscription] Success subscribe new topic my-topic in topics consumer, partitions: 4, allTopicPartitionsNumber: 4 + +``` + +:::tip + +As you may have noticed, we did not explicitly create the `my-topic` topic from which we consume the message. When you consume a message from a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +::: + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" + +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +13:09:39.356 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced + +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +:::tip + +If the service runs as a background process using the `pulsar-daemon start standalone` command, then use the `pulsar-daemon stop standalone` command to stop the service.
+For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). + +::: + diff --git a/site2/website/versioned_docs/version-2.8.x/helm-deploy.md b/site2/website/versioned_docs/version-2.8.x/helm-deploy.md new file mode 100644 index 0000000000000..93709f7091c1e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/helm-deploy.md @@ -0,0 +1,434 @@ +--- +id: helm-deploy +title: Deploy Pulsar cluster using Helm +sidebar_label: "Deployment" +original_id: helm-deploy +--- + +Before running `helm install`, you need to decide how to run Pulsar. +Options can be specified using Helm's `--set option.name=value` command line option. + +## Select configuration options + +In each section, collect the options that are combined to use with the `helm install` command. + +### Kubernetes namespace + +By default, the Pulsar Helm chart is installed to a namespace called `pulsar`. + +```yaml + +namespace: pulsar + +``` + +To install the Pulsar Helm chart into a different Kubernetes namespace, you can include this option in the `helm install` command. + +```bash + +--set namespace= + +``` + +By default, the Pulsar Helm chart doesn't create the namespace. + +```yaml + +namespaceCreate: false + +``` + +To use the Pulsar Helm chart to create the Kubernetes namespace automatically, you can include this option in the `helm install` command. + +```bash + +--set namespaceCreate=true + +``` + +### Persistence + +By default, the Pulsar Helm chart creates Volume Claims with the expectation that a dynamic provisioner creates the underlying Persistent Volumes. 
+ +```yaml + +volumes: + persistence: true + # configure the components to use local persistent volume + # the local provisioner should be installed prior to enable local persistent volume + local_storage: false + +``` + +To use local persistent volumes as the persistent storage for Helm release, you can install the [local storage provisioner](#install-local-storage-provisioner) and include the following option in the `helm install` command. + +```bash + +--set volumes.local_storage=true + +``` + +:::note + +Before installing the production instance of Pulsar, ensure to plan the storage settings to avoid extra storage migration work. Because after initial installation, you must edit Kubernetes objects manually if you want to change storage settings. + +::: + +The Pulsar Helm chart is designed for production use. To use the Pulsar Helm chart in a development environment (such as Minikube), you can disable persistence by including this option in your `helm install` command. + +```bash + +--set volumes.persistence=false + +``` + +### Affinity + +By default, `anti-affinity` is enabled to ensure pods of the same component can run on different nodes. + +```yaml + +affinity: + anti_affinity: true + +``` + +To use the Pulsar Helm chart in a development environment (such as Minikube), you can disable `anti-affinity` by including this option in your `helm install` command. + +```bash + +--set affinity.anti_affinity=false + +``` + +### Components + +The Pulsar Helm chart is designed for production usage. It deploys a production-ready Pulsar cluster, including Pulsar core components and monitoring components. + +You can customize the components to be deployed by turning on/off individual components. 
+ +```yaml + +## Components +## +## Control what components of Apache Pulsar to deploy for the cluster +components: + # zookeeper + zookeeper: true + # bookkeeper + bookkeeper: true + # bookkeeper - autorecovery + autorecovery: true + # broker + broker: true + # functions + functions: true + # proxy + proxy: true + # toolset + toolset: true + # pulsar manager + pulsar_manager: true + +## Monitoring Components +## +## Control what components of the monitoring stack to deploy for the cluster +monitoring: + # monitoring - prometheus + prometheus: true + # monitoring - grafana + grafana: true + +``` + +### Docker images + +The Pulsar Helm chart is designed to enable controlled upgrades. So it can configure independent image versions for components. You can customize the images by setting individual component. + +```yaml + +## Images +## +## Control what images to use for each component +images: + zookeeper: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + bookie: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + autorecovery: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + broker: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + proxy: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + functions: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + prometheus: + repository: prom/prometheus + tag: v1.6.3 + pullPolicy: IfNotPresent + grafana: + repository: streamnative/apache-pulsar-grafana-dashboard-k8s + tag: 0.0.4 + pullPolicy: IfNotPresent + pulsar_manager: + repository: apachepulsar/pulsar-manager + tag: v0.1.0 + pullPolicy: IfNotPresent + hasCommand: false + +``` + +### TLS + +The Pulsar Helm chart can be configured to enable TLS (Transport Layer Security) to protect all the traffic between components. Before enabling TLS, you have to provision TLS certificates for the required components. 
+ +#### Provision TLS certificates using cert-manager + +To use the `cert-manager` to provision the TLS certificates, you have to install the [cert-manager](#install-cert-manager) before installing the Pulsar Helm chart. After successfully installing the cert-manager, you can set `certs.internal_issuer.enabled` to `true`. Therefore, the Pulsar Helm chart can use the `cert-manager` to generate `selfsigning` TLS certificates for the configured components. + +```yaml + +certs: + internal_issuer: + enabled: false + component: internal-cert-issuer + type: selfsigning + +``` + +You can also customize the generated TLS certificates by configuring the fields as the following. + +```yaml + +tls: + # common settings for generating certs + common: + # 90d + duration: 2160h + # 15d + renewBefore: 360h + organization: + - pulsar + keySize: 4096 + keyAlgorithm: rsa + keyEncoding: pkcs8 + +``` + +#### Enable TLS + +After installing the `cert-manager`, you can set `tls.enabled` to `true` to enable TLS encryption for the entire cluster. + +```yaml + +tls: + enabled: false + +``` + +You can also configure whether to enable TLS encryption for individual component. + +```yaml + +tls: + # settings for generating certs for proxy + proxy: + enabled: false + cert_name: tls-proxy + # settings for generating certs for broker + broker: + enabled: false + cert_name: tls-broker + # settings for generating certs for bookies + bookie: + enabled: false + cert_name: tls-bookie + # settings for generating certs for zookeeper + zookeeper: + enabled: false + cert_name: tls-zookeeper + # settings for generating certs for recovery + autorecovery: + cert_name: tls-recovery + # settings for generating certs for toolset + toolset: + cert_name: tls-toolset + +``` + +### Authentication + +By default, authentication is disabled. You can set `auth.authentication.enabled` to `true` to enable authentication. +Currently, the Pulsar Helm chart only supports JWT authentication provider. 
You can set `auth.authentication.provider` to `jwt` to use the JWT authentication provider. + +```yaml + +# Enable or disable broker authentication and authorization. +auth: + authentication: + enabled: false + provider: "jwt" + jwt: + # Enable JWT authentication + # If the token is generated by a secret key, set the usingSecretKey as true. + # If the token is generated by a private key, set the usingSecretKey as false. + usingSecretKey: false + superUsers: + # broker to broker communication + broker: "broker-admin" + # proxy to broker communication + proxy: "proxy-admin" + # pulsar-admin client to broker/proxy communication + client: "admin" + +``` + +To enable authentication, you can run [prepare helm release](#prepare-the-helm-release) to generate token secret keys and tokens for three super users specified in the `auth.superUsers` field. The generated token keys and super user tokens are uploaded and stored as Kubernetes secrets prefixed with `-token-`. You can use the following command to find those secrets. + +```bash + +kubectl get secrets -n + +``` + +### Authorization + +By default, authorization is disabled. Authorization can be enabled only when authentication is enabled. + +```yaml + +auth: + authorization: + enabled: false + +``` + +To enable authorization, you can include this option in the `helm install` command. + +```bash + +--set auth.authorization.enabled=true + +``` + +### CPU and RAM resource requirements + +By default, the resource requests and the number of replicas for the Pulsar components in the Pulsar Helm chart are adequate for a small production deployment. If you deploy a non-production instance, you can reduce the defaults to fit into a smaller cluster. + +Once you have all of your configuration options collected, you can install dependent charts before installing the Pulsar Helm chart. 
+ +## Install dependent charts + +### Install local storage provisioner + +To use local persistent volumes as the persistent storage, you need to install a storage provisioner for [local persistent volumes](https://kubernetes.io/blog/2019/04/04/kubernetes-1.14-local-persistent-volumes-ga/). + +One of the easiest ways to get started is to use the local storage provisioner provided along with the Pulsar Helm chart. + +``` + +helm repo add streamnative https://charts.streamnative.io +helm repo update +helm install pulsar-storage-provisioner streamnative/local-storage-provisioner + +``` + +### Install cert-manager + +The Pulsar Helm chart uses the [cert-manager](https://github.com/jetstack/cert-manager) to provision and manage TLS certificates automatically. To enable TLS encryption for brokers or proxies, you need to install the cert-manager in advance. + +For details about how to install the cert-manager, follow the [official instructions](https://cert-manager.io/docs/installation/kubernetes/#installing-with-helm). + +Alternatively, we provide a bash script [install-cert-manager.sh](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/cert-manager/install-cert-manager.sh) to install a cert-manager release to the namespace `cert-manager`. + +```bash + +git clone https://github.com/apache/pulsar-helm-chart +cd pulsar-helm-chart +./scripts/cert-manager/install-cert-manager.sh + +``` + +## Prepare Helm release + +Once you have installed all the dependent charts and collected all of your configuration options, you can run [prepare_helm_release.sh](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/prepare_helm_release.sh) to prepare the Helm release.
+ +```bash + +git clone https://github.com/apache/pulsar-helm-chart +cd pulsar-helm-chart +./scripts/pulsar/prepare_helm_release.sh -n <k8s-namespace> -k <release-name> + +``` + +The `prepare_helm_release` creates the following resources: + +- A Kubernetes namespace for installing the Pulsar release +- JWT secret keys and tokens for three super users: `broker-admin`, `proxy-admin`, and `admin`. By default, it generates an asymmetric public/private key pair. You can choose to generate a symmetric secret key by specifying `--symmetric`. + - `proxy-admin` role is used for proxies to communicate with brokers. + - `broker-admin` role is used for inter-broker communications. + - `admin` role is used by the admin tools. + +## Deploy Pulsar cluster using Helm + +Once you have finished the following three things, you can install a Helm release. + +- Collect all of your configuration options. +- Install dependent charts. +- Prepare the Helm release. + +In this example, we name our Helm release `pulsar`. + +```bash + +helm repo add apache https://pulsar.apache.org/charts +helm repo update +helm install pulsar apache/pulsar \ + --timeout 10m \ + --set initialize=true \ + --set [your configuration options] + +``` + +:::note + +For the first deployment, add the `--set initialize=true` option to initialize bookie and Pulsar cluster metadata. + +::: + +You can also use the `--version <chart-version>` option if you want to install a specific version of the Pulsar Helm chart. + +## Monitor deployment + +A list of installed resources is output once the Pulsar cluster is deployed. This may take 5-10 minutes. + +The status of the deployment can be checked by running the `helm status pulsar` command, which can also be done while the deployment is taking place if you run the command in another terminal. + +## Access Pulsar cluster + +The default values will create a `ClusterIP` for the following resources, which you can use to interact with the cluster.
+ +- Proxy: You can use the IP address to produce and consume messages to the installed Pulsar cluster. +- Pulsar Manager: You can access the Pulsar Manager UI at `http://<pulsar-manager-ip>:9527`. +- Grafana Dashboard: You can access the Grafana dashboard at `http://<grafana-dashboard-ip>:3000`. + +To find the IP addresses of those components, run the following command: + +```bash + +kubectl get service -n <k8s-namespace> + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/helm-install.md b/site2/website/versioned_docs/version-2.8.x/helm-install.md new file mode 100644 index 0000000000000..1f4d5eb69d5dd --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/helm-install.md @@ -0,0 +1,44 @@ +--- +id: helm-install +title: Install Apache Pulsar using Helm +sidebar_label: "Install" +original_id: helm-install +--- + +Install Apache Pulsar on Kubernetes with the official Pulsar Helm chart. + +## Requirements + +To deploy Apache Pulsar on Kubernetes, the following are required: + +- kubectl 1.14 or higher, compatible with your cluster ([+/- 1 minor release from your cluster](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin)) +- Helm v3 (3.0.2 or higher) +- A Kubernetes cluster, version 1.14 or higher + +## Environment setup + +Before deploying Pulsar, you need to prepare your environment. + +### Tools + +Install [`helm`](helm-tools.md) and [`kubectl`](helm-tools.md) on your computer. + +## Cloud cluster preparation + +:::note + +Kubernetes 1.14 or higher is required. + +::: + +To create and connect to the Kubernetes cluster, follow the instructions: + +- [Google Kubernetes Engine](helm-prepare.md#google-kubernetes-engine) + +## Pulsar deployment + +Once the environment is set up and configuration is generated, you can now proceed to the [deployment of Pulsar](helm-deploy.md). + +## Pulsar upgrade + +To upgrade an existing Kubernetes installation, follow the [upgrade documentation](helm-upgrade.md).
diff --git a/site2/website/versioned_docs/version-2.8.x/helm-overview.md b/site2/website/versioned_docs/version-2.8.x/helm-overview.md new file mode 100644 index 0000000000000..385d535e319b6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/helm-overview.md @@ -0,0 +1,104 @@ +--- +id: helm-overview +title: Apache Pulsar Helm Chart +sidebar_label: "Overview" +original_id: helm-overview +--- + +This is the official supported Helm chart to install Apache Pulsar on a cloud-native environment. It was enhanced based on StreamNative's [Helm Chart](https://github.com/streamnative/charts). + +## Introduction + +The Apache Pulsar Helm chart is one of the most convenient ways to operate Pulsar on Kubernetes. This Pulsar Helm chart contains all the required components to get started and can scale to large deployments. + +This chart includes all the components for a complete experience, but each part can be configured to be installed separately. + +- Pulsar core components: + - ZooKeeper + - Bookies + - Brokers + - Function workers + - Proxies +- Control Center: + - Pulsar Manager + - Prometheus + - Grafana + +It includes support for: + +- Security + - Automatically provisioned TLS certificates, using [Jetstack](https://www.jetstack.io/)'s [cert-manager](https://cert-manager.io/docs/) + - self-signed + - [Let's Encrypt](https://letsencrypt.org/) + - TLS Encryption + - Proxy + - Broker + - Toolset + - Bookie + - ZooKeeper + - Authentication + - JWT + - Authorization +- Storage + - Non-persistence storage + - Persistence volume + - Local persistent volumes +- Functions + - Kubernetes Runtime + - Process Runtime + - Thread Runtime +- Operations + - Independent image versions for all components, enabling controlled upgrades + +## Pulsar Helm chart quick start + +To get up and run with these charts as fast as possible, in a **non-production** use case, we provide a [quick start guide](getting-started-helm.md) for Proof of Concept (PoC) deployments. 
+ +This guide walks the user through deploying these charts with default values and features, but *does not* meet production ready requirements. To deploy these charts into production under sustained load, follow the complete [Installation Guide](helm-install.md). + +## Troubleshooting + +We have done our best to make these charts as seamless as possible. Occasionally, troubles do go outside of our control. We have collected tips and tricks for troubleshooting common issues. Please check them first before raising an [issue](https://github.com/apache/pulsar/issues/new/choose), and feel free to add to them by raising a [Pull Request](https://github.com/apache/pulsar/compare). + +## Installation + +The Apache Pulsar Helm chart contains all required dependencies. + +If you deploy a PoC for testing, we strongly suggest you follow our [Quick Start Guide](getting-started-helm.md) for your first iteration. + +1. [Preparation](helm-prepare.md) +2. [Deployment](helm-deploy.md) + +## Upgrading + +Once the Pulsar Helm chart is installed, use the `helm upgrade` to complete configuration changes and chart updates. + +```bash + +helm repo add apache https://pulsar.apache.org/charts +helm repo update +helm get values > pulsar.yaml +helm upgrade apache/pulsar -f pulsar.yaml + +``` + +For more detailed information, see [Upgrading](helm-upgrade.md). + +## Uninstallation + +To uninstall the Pulsar Helm chart, run the following command: + +```bash + +helm delete + +``` + +For the purposes of continuity, these charts have some Kubernetes objects that cannot be removed when performing `helm delete`. +It is recommended to *consciously* remove these items, as they affect re-deployment. + +* PVCs for stateful data: *consciously* remove these items. + - ZooKeeper: This is your metadata. + - BookKeeper: This is your data. + - Prometheus: This is your metrics data, which can be safely removed. 
+* Secrets: if the secrets are generated by the [prepare release script](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/prepare_helm_release.sh), they contain secret keys and tokens. You can use the [cleanup release script](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/cleanup_helm_release.sh) to remove these secrets and tokens as needed. diff --git a/site2/website/versioned_docs/version-2.8.x/helm-prepare.md b/site2/website/versioned_docs/version-2.8.x/helm-prepare.md new file mode 100644 index 0000000000000..5e9f2f9ef4f68 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/helm-prepare.md @@ -0,0 +1,92 @@ +--- +id: helm-prepare +title: Prepare Kubernetes resources +sidebar_label: "Prepare" +original_id: helm-prepare +--- + +For a fully functional Pulsar cluster, you need a few resources before deploying the Apache Pulsar Helm chart. The following provides instructions to prepare the Kubernetes cluster before deploying the Pulsar Helm chart. + +- [Google Kubernetes Engine](#google-kubernetes-engine) + - [Manual cluster creation](#manual-cluster-creation) + - [Scripted cluster creation](#scripted-cluster-creation) + - [Create cluster with local SSDs](#create-cluster-with-local-ssds) +- [Next Steps](#next-steps) + +## Google Kubernetes Engine + +To get started easier, a script is provided to create the cluster automatically. Alternatively, a cluster can be created manually as well. + +- [Google Kubernetes Engine](#google-kubernetes-engine) + - [Manual cluster creation](#manual-cluster-creation) + - [Scripted cluster creation](#scripted-cluster-creation) + - [Create cluster with local SSDs](#create-cluster-with-local-ssds) +- [Next Steps](#next-steps) + +### Manual cluster creation + +To provision a Kubernetes cluster manually, follow the [GKE instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-cluster). 
+ +Alternatively, you can use the [instructions](#scripted-cluster-creation) below to provision a GKE cluster as needed. + +### Scripted cluster creation + +A [bootstrap script](https://github.com/streamnative/charts/tree/master/scripts/pulsar/gke_bootstrap_script.sh) has been created to automate much of the setup process for users on GCP/GKE. + +The script can: + +1. Create a new GKE cluster. +2. Allow the cluster to modify DNS (Domain Name Server) records. +3. Setup `kubectl`, and connect it to the cluster. + +Google Cloud SDK is a dependency of this script, so ensure it is [set up correctly](helm-tools.md#connect-to-a-gke-cluster) for the script to work. + +The script reads various parameters from environment variables and an argument `up` or `down` for bootstrap and clean-up respectively. + +The following table describes all variables. + +| **Variable** | **Description** | **Default value** | +| ------------ | --------------- | ----------------- | +| PROJECT | ID of your GCP project | No default value. It requires to be set. | +| CLUSTER_NAME | Name of the GKE cluster | `pulsar-dev` | +| CONFDIR | Configuration directory to store Kubernetes configuration | ${HOME}/.config/streamnative | +| INT_NETWORK | IP space to use within this cluster | `default` | +| LOCAL_SSD_COUNT | Number of local SSD counts | 4 | +| MACHINE_TYPE | Type of machine to use for nodes | `n1-standard-4` | +| NUM_NODES | Number of nodes to be created in each of the cluster's zones | 4 | +| PREEMPTIBLE | Create nodes using preemptible VM instances in the new cluster. | false | +| REGION | Compute region for the cluster | `us-east1` | +| USE_LOCAL_SSD | Flag to create a cluster with local SSDs | false | +| ZONE | Compute zone for the cluster | `us-east1-b` | +| ZONE_EXTENSION | The extension (`a`, `b`, `c`) of the zone name of the cluster | `b` | +| EXTRA_CREATE_ARGS | Extra arguments passed to create command | | + +Run the script, by passing in your desired parameters. 
It can work with the default parameters except for `PROJECT` which is required: + +```bash + +PROJECT= scripts/pulsar/gke_bootstrap_script.sh up + +``` + +The script can also be used to clean up the created GKE resources. + +```bash + +PROJECT= scripts/pulsar/gke_bootstrap_script.sh down + +``` + +#### Create cluster with local SSDs + +To install the Pulsar Helm chart using local persistent volumes, you need to create a GKE cluster with local SSDs. You can do so by specifying `USE_LOCAL_SSD` to be `true` in the following command to create a Pulsar cluster with local SSDs. + +``` + +PROJECT= USE_LOCAL_SSD=true LOCAL_SSD_COUNT= scripts/pulsar/gke_bootstrap_script.sh up + +``` + +## Next Steps + +Continue with the [installation of the chart](helm-deploy.md) once you have the cluster up and running. diff --git a/site2/website/versioned_docs/version-2.8.x/helm-tools.md b/site2/website/versioned_docs/version-2.8.x/helm-tools.md new file mode 100644 index 0000000000000..6ba89006913b6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/helm-tools.md @@ -0,0 +1,43 @@ +--- +id: helm-tools +title: Required tools for deploying Pulsar Helm Chart +sidebar_label: "Required Tools" +original_id: helm-tools +--- + +Before deploying Pulsar to your Kubernetes cluster, there are some tools you must have installed locally. + +## kubectl + +kubectl is the tool that talks to the Kubernetes API. kubectl 1.14 or higher is required and it needs to be compatible with your cluster ([+/- 1 minor release from your cluster](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin)). + +To Install kubectl locally, follow the [Kubernetes documentation](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl). + +The server version of kubectl cannot be obtained until we connect to a cluster. + +## Helm + +Helm is the package manager for Kubernetes. The Apache Pulsar Helm Chart is tested and supported with Helm v3. 
+ +### Get Helm + +You can get Helm from the project's [releases page](https://github.com/helm/helm/releases), or follow other options under the official documentation of [installing Helm](https://helm.sh/docs/intro/install/). + +### Next steps + +Once kubectl and Helm are configured, you can configure your [Kubernetes cluster](helm-prepare.md). + +## Additional information + +### Templates + +Templating in Helm is done through Golang's [text/template](https://golang.org/pkg/text/template/) and [sprig](https://godoc.org/github.com/Masterminds/sprig). + +For more information about how all the inner workings behave, check these documents: + +- [Functions and Pipelines](https://helm.sh/docs/chart_template_guide/functions_and_pipelines/) +- [Subcharts and Globals](https://helm.sh/docs/chart_template_guide/subcharts_and_globals/) + +### Tips and tricks + +For additional information on developing with Helm, check the [tips and tricks section](https://helm.sh/docs/howto/charts_tips_and_tricks/) in the Helm repository. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/helm-upgrade.md b/site2/website/versioned_docs/version-2.8.x/helm-upgrade.md new file mode 100644 index 0000000000000..7d671e6bfb3c1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/helm-upgrade.md @@ -0,0 +1,43 @@ +--- +id: helm-upgrade +title: Upgrade Pulsar Helm release +sidebar_label: "Upgrade" +original_id: helm-upgrade +--- + +Before upgrading your Pulsar installation, you need to check the change log corresponding to the specific release you want to upgrade to and look for any release notes that might pertain to the new Pulsar helm chart version. + +We also recommend that you provide all values using the `helm upgrade --set key=value` syntax or the `-f values.yml` option instead of using `--reuse-values`, because some of the current values might be deprecated. + +:::note + +You can retrieve your previous `--set` arguments cleanly, with `helm get values <release-name>`.
If you direct this into a file (`helm get values <release-name> > pulsar.yaml`), you can safely pass this file through `-f`, namely `helm upgrade <release-name> apache/pulsar -f pulsar.yaml`. This safely replaces the behavior of `--reuse-values`. + +::: + +## Steps + +To upgrade Apache Pulsar to a newer version, follow these steps: + +1. Check the change log for the specific version you would like to upgrade to. +2. Go through the [deployment documentation](helm-deploy.md) step by step. +3. Extract your previous `--set` arguments with the following command. + + ```bash + + helm get values <release-name> > pulsar.yaml + + ``` + +4. Decide all the values you need to set. +5. Perform the upgrade, with the values file extracted in step 3 and any additional `--set` arguments decided in step 4. + + ```bash + + helm upgrade <release-name> apache/pulsar \ + --version <chart-version> \ + -f pulsar.yaml \ + --set ... + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-aerospike-sink.md b/site2/website/versioned_docs/version-2.8.x/io-aerospike-sink.md new file mode 100644 index 0000000000000..63d7338a3ba91 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-aerospike-sink.md @@ -0,0 +1,26 @@ +--- +id: io-aerospike-sink +title: Aerospike sink connector +sidebar_label: "Aerospike sink connector" +original_id: io-aerospike-sink +--- + +The Aerospike sink connector pulls messages from Pulsar topics to Aerospike clusters. + +## Configuration + +The configuration of the Aerospike sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `seedHosts` |String| true | No default value| The comma-separated list of one or more Aerospike cluster hosts.

    Each host can be specified as a valid IP address or hostname followed by an optional port number. | +| `keyspace` | String| true |No default value |The Aerospike namespace. | +| `columnName` | String | true| No default value|The Aerospike column name. | +|`userName`|String|false|NULL|The Aerospike username.| +|`password`|String|false|NULL|The Aerospike password.| +| `keySet` | String|false |NULL | The Aerospike set name. | +| `maxConcurrentRequests` |int| false | 100 | The maximum number of concurrent Aerospike transactions that a sink can open. | +| `timeoutMs` | int|false | 100 | This property controls `socketTimeout` and `totalTimeout` for Aerospike transactions. | +| `retries` | int|false | 1 |The maximum number of retries before aborting a write transaction to Aerospike. | diff --git a/site2/website/versioned_docs/version-2.8.x/io-canal-source.md b/site2/website/versioned_docs/version-2.8.x/io-canal-source.md new file mode 100644 index 0000000000000..d1fd43bb0f74e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-canal-source.md @@ -0,0 +1,235 @@ +--- +id: io-canal-source +title: Canal source connector +sidebar_label: "Canal source connector" +original_id: io-canal-source +--- + +The Canal source connector pulls messages from MySQL to Pulsar topics. + +## Configuration + +The configuration of Canal source connector has the following properties. + +### Property + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `username` | true | None | Canal server account (not MySQL).| +| `password` | true | None | Canal server password (not MySQL). | +|`destination`|true|None|Source destination that Canal source connector connects to. +| `singleHostname` | false | None | Canal server address.| +| `singlePort` | false | None | Canal server port.| +| `cluster` | true | false | Whether to enable cluster mode based on Canal server configuration or not.

  • true: **cluster** mode.
    If set to true, it talks to `zkServers` to figure out the actual database host.

  • false: **standalone** mode.
    If set to false, it connects to the database specified by `singleHostname` and `singlePort`.
  • | +| `zkServers` | true | None | Address and port of the Zookeeper that Canal source connector talks to figure out the actual database host.| +| `batchSize` | false | 1000 | Batch size to fetch from Canal. | + +### Example + +Before using the Canal connector, you can create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "zkServers": "127.0.0.1:2181", + "batchSize": "5120", + "destination": "example", + "username": "", + "password": "", + "cluster": false, + "singleHostname": "127.0.0.1", + "singlePort": "11111", + } + + ``` + +* YAML + + You can create a YAML file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/resources/canal-mysql-source-config.yaml) below to your YAML file. + + ```yaml + + configs: + zkServers: "127.0.0.1:2181" + batchSize: 5120 + destination: "example" + username: "" + password: "" + cluster: false + singleHostname: "127.0.0.1" + singlePort: 11111 + + ``` + +## Usage + +Here is an example of storing MySQL data using the configuration file as above. + +1. Start a MySQL server. + + ```bash + + $ docker pull mysql:5.7 + $ docker run -d -it --rm --name pulsar-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=canal -e MYSQL_USER=mysqluser -e MYSQL_PASSWORD=mysqlpw mysql:5.7 + + ``` + +2. Create a configuration file `mysqld.cnf`. + + ```bash + + [mysqld] + pid-file = /var/run/mysqld/mysqld.pid + socket = /var/run/mysqld/mysqld.sock + datadir = /var/lib/mysql + #log-error = /var/log/mysql/error.log + # By default we only accept connections from localhost + #bind-address = 127.0.0.1 + # Disabling symbolic-links is recommended to prevent assorted security risks + symbolic-links=0 + log-bin=mysql-bin + binlog-format=ROW + server_id=1 + + ``` + +3. Copy the configuration file `mysqld.cnf` to MySQL server. + + ```bash + + $ docker cp mysqld.cnf pulsar-mysql:/etc/mysql/mysql.conf.d/ + + ``` + +4. Restart the MySQL server. 
+ + ```bash + + $ docker restart pulsar-mysql + + ``` + +5. Create a test database in MySQL server. + + ```bash + + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal -e 'create database test;' + + ``` + +6. Start a Canal server and connect to MySQL server. + + ``` + + $ docker pull canal/canal-server:v1.1.2 + $ docker run -d -it --link pulsar-mysql -e canal.auto.scan=false -e canal.destinations=test -e canal.instance.master.address=pulsar-mysql:3306 -e canal.instance.dbUsername=root -e canal.instance.dbPassword=canal -e canal.instance.connectionCharset=UTF-8 -e canal.instance.tsdb.enable=true -e canal.instance.gtidon=false --name=pulsar-canal-server -p 8000:8000 -p 2222:2222 -p 11111:11111 -p 11112:11112 -m 4096m canal/canal-server:v1.1.2 + + ``` + +7. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:2.3.0 + $ docker run -d -it --link pulsar-canal-server -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:2.3.0 bin/pulsar standalone + + ``` + +8. Modify the configuration file `canal-mysql-source-config.yaml`. + + ```yaml + + configs: + zkServers: "" + batchSize: "5120" + destination: "test" + username: "" + password: "" + cluster: false + singleHostname: "pulsar-canal-server" + singlePort: "11111" + + ``` + +9. Create a consumer file `pulsar-client.py`. + + ```python + + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', + subscription_name='my-sub') + + while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + + ``` + +10. Copy the configuration file `canal-mysql-source-config.yaml` and the consumer file `pulsar-client.py` to Pulsar server. + + ```bash + + $ docker cp canal-mysql-source-config.yaml pulsar-standalone:/pulsar/conf/ + $ docker cp pulsar-client.py pulsar-standalone:/pulsar/ + + ``` + +11. 
Download a Canal connector and start it. + + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + $ wget https://archive.apache.org/dist/pulsar/pulsar-2.3.0/connectors/pulsar-io-canal-2.3.0.nar -P connectors + $ ./bin/pulsar-admin source localrun \ + --archive ./connectors/pulsar-io-canal-2.3.0.nar \ + --classname org.apache.pulsar.io.canal.CanalStringSource \ + --tenant public \ + --namespace default \ + --name canal \ + --destination-topic-name my-topic \ + --source-config-file /pulsar/conf/canal-mysql-source-config.yaml \ + --parallelism 1 + + ``` + +12. Consume data from MySQL. + + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + $ python pulsar-client.py + + ``` + +13. Open another window to log in MySQL server. + + ```bash + + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal + + ``` + +14. Create a table, and insert, delete, and update data in MySQL server. + + ```bash + + mysql> use test; + mysql> show tables; + mysql> CREATE TABLE IF NOT EXISTS `test_table`(`test_id` INT UNSIGNED AUTO_INCREMENT,`test_title` VARCHAR(100) NOT NULL, + `test_author` VARCHAR(40) NOT NULL, + `test_date` DATE,PRIMARY KEY ( `test_id` ))ENGINE=InnoDB DEFAULT CHARSET=utf8; + mysql> INSERT INTO test_table (test_title, test_author, test_date) VALUES("a", "b", NOW()); + mysql> UPDATE test_table SET test_title='c' WHERE test_title='a'; + mysql> DELETE FROM test_table WHERE test_title='c'; + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-cassandra-sink.md b/site2/website/versioned_docs/version-2.8.x/io-cassandra-sink.md new file mode 100644 index 0000000000000..b27a754f49e18 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-cassandra-sink.md @@ -0,0 +1,57 @@ +--- +id: io-cassandra-sink +title: Cassandra sink connector +sidebar_label: "Cassandra sink connector" +original_id: io-cassandra-sink +--- + +The Cassandra sink connector pulls messages from Pulsar topics to Cassandra clusters. 
+ +## Configuration + +The configuration of the Cassandra sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `roots` | String|true | " " (empty string) | A comma-separated list of Cassandra hosts to connect to.| +| `keyspace` | String|true| " " (empty string)| The keyspace used for writing Pulsar messages.

    **Note: `keyspace` should be created before you run the Cassandra sink.**| +| `keyname` | String|true| " " (empty string)| The key name of the Cassandra column family.

    The column is used for storing Pulsar message keys.

    If a Pulsar message doesn't have an associated key, the message value is used as the key. | +| `columnFamily` | String|true| " " (empty string)| The Cassandra column family name.

    **Note: `columnFamily` should be created before you run the Cassandra sink.**| +| `columnName` | String|true| " " (empty string) | The column name of the Cassandra column family.

    The column is used for storing Pulsar message values. | + +### Example + +Before using the Cassandra sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + + ``` + +* YAML + + ``` + + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + + ``` + +## Usage + +For more information about **how to connect Pulsar with Cassandra**, see [here](io-quickstart.md#connect-pulsar-to-apache-cassandra). diff --git a/site2/website/versioned_docs/version-2.8.x/io-cdc-debezium.md b/site2/website/versioned_docs/version-2.8.x/io-cdc-debezium.md new file mode 100644 index 0000000000000..293ccf2b35e8a --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-cdc-debezium.md @@ -0,0 +1,543 @@ +--- +id: io-cdc-debezium +title: Debezium source connector +sidebar_label: "Debezium source connector" +original_id: io-cdc-debezium +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL +and persists the messages to Pulsar topics. + +## Configuration + +The configuration of Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. 
| +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. | +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server that are monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert the record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert the record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for history topic. | +| `pulsar.service.url` | true | null | Pulsar cluster service URL for the offset topic used in Debezium. You can use the `bin/pulsar-admin --admin-url http://pulsar:8080 sources localrun --source-config-file configs/pg-pulsar-config.yaml` command to point to the target Pulsar cluster.| +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | +| `mongodb.hosts` | true | null | The comma-separated list of hostname and port pairs (in the form 'host' or 'host:port') of the MongoDB servers in the replica set. The list contains a single hostname and a port pair. If mongodb.members.auto.discover is set to false, the host and port pair are prefixed with the replica set name (e.g., rs0/localhost:27017). | +| `mongodb.name` | true | null | A unique name that identifies the connector and/or MongoDB replica set or shared cluster that this connector monitors. Each server should be monitored by at most one Debezium connector, since this server name prefixes all persisted Kafka topics emanating from the MongoDB replica set or cluster. | +| `mongodb.user` | true | null | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.password` | true | null | Password to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.task.id` | true | null | The taskId of the MongoDB connector that attempts to use a separate task for each replica set. | + + + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. 
+ +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "offset.storage.topic": "offset-topic" + } + + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: 
"offset-topic" + + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar Debezium connector. + +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. 
+ + ```bash + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MySQL client in docker. + + ```bash + + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. + +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "postgres", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "schema.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. 
+ + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for pg, docker image: debezium/example-postgress:0.8 + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "postgres" + database.dbname: "postgres" + database.server.name: "dbserver1" + schema.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker pull debezium/example-postgres:0.8 + $ docker run -d -it --rm --name pulsar-postgresql -p 5432:5432 debezium/example-postgres:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "postgres","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + * Use the **YAML** configuration file as shown previously. 
+ + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + + $ docker exec -it pulsar-postgresql /bin/bash + + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + psql -U postgres postgres + postgres=# \c postgres; + You are now connected to database "postgres" as user "postgres". + postgres=# SET search_path TO inventory; + SET + postgres=# select * from products; + id | name | description | weight + -----+--------------------+---------------------------------------------------------+-------- + 102 | car battery | 12V car battery | 8.1 + 103 | 12-pack drill bits | 12-pack of drill bits with sizes ranging from #40 to #3 | 0.8 + 104 | hammer | 12oz carpenter's hammer | 0.75 + 105 | hammer | 14oz carpenter's hammer | 0.875 + 106 | hammer | 16oz carpenter's hammer | 1 + 107 | rocks | box of assorted rocks | 5.3 + 108 | jacket | water resistent black wind breaker | 0.1 + 109 | spare tire | 24 inch spare tire | 22.2 + 101 | 1111111111 | Small 2-wheel scooter | 3.14 + (9 rows) + + postgres=# UPDATE products SET name='1111111111' WHERE id=107; + UPDATE 1 + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. 
+ + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":107}}�{"schema":{"type":"struct","fields":[{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"before"},{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"after"},{"type":"struct","fields":[{"type":"string","optional":true,"field":"version"},{"type":"string","optional":true,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":false,"field":"db"},{"type":"int64","optional":true,"field":"ts_usec"},{"type":"int64","optional":true,"field":"txId"},{"type":"int64","optional":true,"field":"lsn"},{"type":"string","optional":true,"field":"schema"},{"type":"string","optional":true,"field":"table"},{"type":"boolean","optional":true,"default":false,"field":"snapshot"},{"type":"boolean","optional":true,"field":"last_snapshot_record"}],"optional":false,"name":"io.debezium.connector.postgresql.Source","field":"source"},{"type":"string","optional":false,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"before":{"id":107,"name":"rocks","description":"box of assorted rocks","weight":5.3},"after":{"id":107,"name":"1111111111","description":"box of assorted 
rocks","weight":5.3},"source":{"version":"0.9.2.Final","connector":"postgresql","name":"dbserver1","db":"postgres","ts_usec":1559208957661080,"txId":577,"lsn":23862872,"schema":"inventory","table":"products","snapshot":false,"last_snapshot_record":null},"op":"u","ts_ms":1559208957692}} + + ``` + +## Example of MongoDB + +You need to create a configuration file before using the Pulsar Debezium connector. + +* JSON + + ```json + + { + "mongodb.hosts": "rs0/mongodb:27017", + "mongodb.name": "dbserver1", + "mongodb.user": "debezium", + "mongodb.password": "dbz", + "mongodb.task.id": "1", + "database.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-mongodb-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/resources/debezium-mongodb-source-config.yaml) below to the `debezium-mongodb-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mongodb-source" + topicName: "debezium-mongodb-topic" + archive: "connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for pg, docker image: debezium/example-postgress:0.10 + mongodb.hosts: "rs0/mongodb:27017", + mongodb.name: "dbserver1", + mongodb.user: "debezium", + mongodb.password: "dbz", + mongodb.task.id: "1", + database.whitelist: "inventory", + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a MongoDB table using the Pulsar Debezium connector. + + +1. Start a MongoDB server with a database from which Debezium can capture changes. 
+ + ```bash + + $ docker pull debezium/example-mongodb:0.10 + $ docker run -d -it --rm --name pulsar-mongodb -e MONGODB_USER=mongodb -e MONGODB_PASSWORD=mongodb -p 27017:27017 debezium/example-mongodb:0.10 + + ``` + + Use the following commands to initialize the data. + + ``` bash + + ./usr/local/bin/init-inventory.sh + + ``` + + If the local host cannot access the container network, you can update the file ```/etc/hosts``` and add a rule ```127.0.0.1 6 f114527a95f```. f114527a95f is container id, you can try to get by ```docker ps -a``` + + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-mongodb-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar \ + --name debezium-mongodb-source \ + --destination-topic-name debezium-mongodb-topic \ + --tenant public \ + --namespace default \ + --source-config '{"mongodb.hosts": "rs0/mongodb:27017","mongodb.name": "dbserver1","mongodb.user": "debezium","mongodb.password": "dbz","mongodb.task.id": "1","database.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mongodb-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MongoDB client in docker. + + ```bash + + $ docker exec -it pulsar-mongodb /bin/bash + + ``` + +6. A MongoDB client pops out. 
+ + ```bash + + mongo -u debezium -p dbz --authenticationDatabase admin localhost:27017/inventory + db.products.update({"_id":NumberLong(104)},{$set:{weight:1.25}}) + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. + + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":"104"}}, value = {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"int64","optional":true,"field":"h"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"after":"{\"_id\": {\"$numberLong\": \"104\"},\"name\": \"hammer\",\"description\": \"12oz carpenter's hammer\",\"weight\": 1.25,\"quantity\": 4}","patch":null,"source":{"version":"0.10.0.Final","connector":"mongodb","name":"dbserver1","ts_ms":1573541905000,"snapshot":"true","db":"inventory","rs":"rs0","collection":"products","ord":1,"h":4983083486544392763},"op":"r","ts_ms":1573541909761}}. 
+ + ``` + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt + +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$Lambda$203/385424085.accept(Unknown Source) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at 
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) + +``` + +If you encounter the above problems in synchronizing data, please refer to [this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + +```$xslt + +max.queue.size= + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-cdc.md b/site2/website/versioned_docs/version-2.8.x/io-cdc.md new file mode 100644 index 0000000000000..e6e662884826d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-cdc.md @@ -0,0 +1,26 @@ +--- +id: io-cdc +title: CDC connector +sidebar_label: "CDC connector" +original_id: io-cdc +--- + +CDC source connectors capture log changes of databases (such as MySQL, MongoDB, and PostgreSQL) into Pulsar. + +> CDC source connectors are built on top of [Canal](https://github.com/alibaba/canal) and [Debezium](https://debezium.io/) and store all data into Pulsar cluster in a persistent, replicated, and partitioned way. + +Currently, Pulsar has the following CDC connectors. 
+ +Name|Java Class +|---|--- +[Canal source connector](io-canal-source.md)|[org.apache.pulsar.io.canal.CanalStringSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) +[Debezium source connector](io-cdc-debezium.md)|
  • [org.apache.pulsar.io.debezium.DebeziumSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/core/src/main/java/org/apache/pulsar/io/debezium/DebeziumSource.java)
  • [org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java)
  • [org.apache.pulsar.io.debezium.postgres.DebeziumPostgresSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java)
  • + +For more information about Canal and Debezium, see the information below. + +Subject | Reference +|---|--- +How to use Canal source connector with MySQL|[Canal guide](https://github.com/alibaba/canal/wiki) +How does Canal work | [Canal tutorial](https://github.com/alibaba/canal/wiki) +How to use Debezium source connector with MySQL | [Debezium guide](https://debezium.io/docs/connectors/mysql/) +How does Debezium work | [Debezium tutorial](https://debezium.io/docs/tutorial/) diff --git a/site2/website/versioned_docs/version-2.8.x/io-cli.md b/site2/website/versioned_docs/version-2.8.x/io-cli.md new file mode 100644 index 0000000000000..3d54bb61875e2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-cli.md @@ -0,0 +1,658 @@ +--- +id: io-cli +title: Connector Admin CLI +sidebar_label: "CLI" +original_id: io-cli +--- + +The `pulsar-admin` tool helps you manage Pulsar connectors. + +## `sources` + +An interface for managing Pulsar IO sources (ingress data into Pulsar). + +```bash + +$ pulsar-admin sources subcommands + +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sources` + +* `reload` + + +### `create` + +Submit a Pulsar IO source connector to run in a Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources create options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. +| `--tenant` | The source's tenant. +|`--producer-config`| The custom producer configuration (as a JSON string). + +### `update` + +Update an already submitted Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources update options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. The `source-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. +| `--tenant` | The source's tenant. +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + + +### `delete` + +Delete a Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources delete options + +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `get` + +Get the information about a Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources get options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `status` + +Check the current status of a Pulsar Source. + +#### Usage + +```bash + +$ pulsar-admin sources status options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source ID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `list` + +List all running Pulsar IO source connectors. + +#### Usage + +```bash + +$ pulsar-admin sources list options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `stop` + +Stop a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources stop options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `start` + +Start a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources start options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `restart` + +Restart a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources restart options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `localrun` + +Run a Pulsar IO source connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources localrun options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the Source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The source's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--deserialization-classname`|The SerDe classname for the source. +|`--destination-topic-name`|The Pulsar topic to which data is sent. +|`--disk`|The disk (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +|`--name`|The source’s name.| +|`--namespace`|The source’s namespace.| +|`--parallelism`|The source’s parallelism factor, that is, the number of source instances to run.| +|`--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from an external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +|`--source-config`|Source config key/values. +|`--source-config-file`|The path to a YAML config file specifying the source’s configuration. +|`--source-type`|The source's connector provider. +|`--tenant`|The source’s tenant. +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +|`--use-tls`|Use tls connection.
    **Default value: false**. +|`--producer-config`| The custom producer configuration (as a JSON string). + +### `available-sources` + +Get the list of Pulsar IO connector sources supported by Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources available-sources + +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash + +$ pulsar-admin sources reload + +``` + +## `sinks` + +An interface for managing Pulsar IO sinks (egress data from Pulsar). + +```bash + +$ pulsar-admin sinks subcommands + +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sinks` + +* `reload` + + +### `create` + +Submit a Pulsar IO sink connector to run in a Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks create options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. The `sink-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). + +### `update` + +Update a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks update options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + +### `delete` + +Delete a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks delete options + +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `get` + +Get the information about a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks get options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `status` + +Check the current status of a Pulsar sink. + +#### Usage + +```bash + +$ pulsar-admin sinks status options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink ID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `list` + +List all running Pulsar IO sink connectors. + +#### Usage + +```bash + +$ pulsar-admin sinks list options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `stop` + +Stop a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks stop options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `start` + +Start a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks start options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `restart` + +Restart a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks restart options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `localrun` + +Run a Pulsar IO sink connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks localrun options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The sink's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per sink instance (applicable only to the Docker runtime). +| `--custom-schema-inputs` | The map of input topics to Schema types or class names (as a JSON string). +| `--max-redeliver-count` | Maximum number of times that a message is redelivered before being sent to the dead letter queue. +| `--dead-letter-topic` | Name of the dead letter topic where the failing messages are sent. +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +|`--disk`|The disk (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +| `-i`, `--inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name`|The sink’s name.| +|`--namespace`|The sink’s namespace.| +|`--parallelism`|The sink’s parallelism factor, that is, the number of sink instances to run.| +|`--processing-guarantees`|The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--retain-ordering` | Sink consumes and sinks messages in order. +|`--sink-config`|sink config key/values. +|`--sink-config-file`|The path to a YAML config file specifying the sink’s configuration. +|`--sink-type`|The sink's connector provider. +|`--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +|`--tenant`|The sink’s tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--negative-ack-redelivery-delay-ms` | The negatively-acknowledged message redelivery delay in milliseconds. | +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +|`--use-tls`|Use tls connection.
    **Default value: false**. + +### `available-sinks` + +Get the list of Pulsar IO connector sinks supported by Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks available-sinks + +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash + +$ pulsar-admin sinks reload + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-connectors.md b/site2/website/versioned_docs/version-2.8.x/io-connectors.md new file mode 100644 index 0000000000000..8db368e0e7063 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-connectors.md @@ -0,0 +1,232 @@ +--- +id: io-connectors +title: Built-in connector +sidebar_label: "Built-in connector" +original_id: io-connectors +--- + +Pulsar distribution includes a set of common connectors that have been packaged and tested with the rest of Apache Pulsar. These connectors import and export data from some of the most commonly used data systems. + +Using any of these connectors is as easy as writing a simple connector and running the connector locally or submitting the connector to a Pulsar Functions cluster. + +## Source connector + +Pulsar has various source connectors, which are sorted alphabetically as below. 
+ +### Canal + +* [Configuration](io-canal-source.md#configuration) + +* [Example](io-canal-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) + + +### Debezium MySQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-mysql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java) + +### Debezium PostgreSQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-postgresql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java) + +### Debezium MongoDB + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-mongodb) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/java/org/apache/pulsar/io/debezium/mongodb/DebeziumMongoDbSource.java) + +### DynamoDB + +* [Configuration](io-dynamodb-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/dynamodb/src/main/java/org/apache/pulsar/io/dynamodb/DynamoDBSource.java) + +### File + +* [Configuration](io-file-source.md#configuration) + +* [Example](io-file-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/file/src/main/java/org/apache/pulsar/io/file/FileSource.java) + +### Flume + +* [Configuration](io-flume-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/FlumeConnector.java) + +### Twitter firehose + +* [Configuration](io-twitter-source.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/twitter/src/main/java/org/apache/pulsar/io/twitter/TwitterFireHose.java) + +### Kafka + +* [Configuration](io-kafka-source.md#configuration) + +* [Example](io-kafka-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java) + +### Kinesis + +* [Configuration](io-kinesis-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSource.java) + +### Netty + +* [Configuration](io-netty-source.md#configuration) + +* [Example of TCP](io-netty-source.md#tcp) + +* [Example of HTTP](io-netty-source.md#http) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/netty/src/main/java/org/apache/pulsar/io/netty/NettySource.java) + +### NSQ + +* [Configuration](io-nsq-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/nsq/src/main/java/org/apache/pulsar/io/nsq/NSQSource.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSource.java) + +## Sink connector + +Pulsar has various sink connectors, which are sorted alphabetically as below. 
+ +### Aerospike + +* [Configuration](io-aerospike-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/aerospike/src/main/java/org/apache/pulsar/io/aerospike/AerospikeStringSink.java) + +### Cassandra + +* [Configuration](io-cassandra-sink.md#configuration) + +* [Example](io-cassandra-sink.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/cassandra/src/main/java/org/apache/pulsar/io/cassandra/CassandraStringSink.java) + +### ElasticSearch + +* [Configuration](io-elasticsearch-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/elastic-search/src/main/java/org/apache/pulsar/io/elasticsearch/ElasticSearchSink.java) + +### Flume + +* [Configuration](io-flume-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/sink/StringSink.java) + +### HBase + +* [Configuration](io-hbase-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hbase/src/main/java/org/apache/pulsar/io/hbase/HbaseAbstractConfig.java) + +### HDFS2 + +* [Configuration](io-hdfs2-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs2/src/main/java/org/apache/pulsar/io/hdfs2/AbstractHdfsConnector.java) + +### HDFS3 + +* [Configuration](io-hdfs3-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs3/src/main/java/org/apache/pulsar/io/hdfs3/AbstractHdfsConnector.java) + +### InfluxDB + +* [Configuration](io-influxdb-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/influxdb/src/main/java/org/apache/pulsar/io/influxdb/InfluxDBGenericRecordSink.java) + +### JDBC ClickHouse + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-clickhouse) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/clickhouse/src/main/java/org/apache/pulsar/io/jdbc/ClickHouseJdbcAutoSchemaSink.java) + +### JDBC MariaDB + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-mariadb) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/mariadb/src/main/java/org/apache/pulsar/io/jdbc/MariadbJdbcAutoSchemaSink.java) + +### JDBC PostgreSQL + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-postgresql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/postgres/src/main/java/org/apache/pulsar/io/jdbc/PostgresJdbcAutoSchemaSink.java) + +### JDBC SQLite + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-sqlite) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/sqlite/src/main/java/org/apache/pulsar/io/jdbc/SqliteJdbcAutoSchemaSink.java) + +### Kafka + +* [Configuration](io-kafka-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java) + +### Kinesis + +* [Configuration](io-kinesis-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSink.java) + +### MongoDB + +* [Configuration](io-mongo-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/mongo/src/main/java/org/apache/pulsar/io/mongodb/MongoSink.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSink.java) + +### Redis + +* [Configuration](io-redis-sink.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/redis/src/main/java/org/apache/pulsar/io/redis/RedisAbstractConfig.java) + +### Solr + +* [Configuration](io-solr-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/solr/src/main/java/org/apache/pulsar/io/solr/SolrSinkConfig.java) + diff --git a/site2/website/versioned_docs/version-2.8.x/io-debezium-source.md b/site2/website/versioned_docs/version-2.8.x/io-debezium-source.md new file mode 100644 index 0000000000000..8c3ba0cb20f25 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-debezium-source.md @@ -0,0 +1,621 @@ +--- +id: io-debezium-source +title: Debezium source connector +sidebar_label: "Debezium source connector" +original_id: io-debezium-source +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL +and persists the messages to Pulsar topics. + +## Configuration + +The configuration of Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. | +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. 
| +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and it is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server which is monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for history topic. | +| `pulsar.service.url` | true | null | Pulsar cluster service URL. | +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | +| `json-with-envelope` | false | false | Present the message that only consists of payload. | + +### Converter Options + +1. org.apache.kafka.connect.json.JsonConverter + +This config `json-with-envelope` is valid only for the JsonConverter. Its default value is false, in which case the consumer uses the schema ` +Schema.KeyValue(Schema.AUTO_CONSUME(), Schema.AUTO_CONSUME(), KeyValueEncodingType.SEPARATED)`, +and the message only consists of payload. + +If the config `json-with-envelope` value is true, the consumer uses the schema +`Schema.KeyValue(Schema.BYTES, Schema.BYTES)`, and the message consists of schema and payload. + +2. org.apache.pulsar.kafka.shade.io.confluent.connect.avro.AvroConverter + +If users select the AvroConverter, then the pulsar consumer should use the schema `Schema.KeyValue(Schema.AUTO_CONSUME(), +Schema.AUTO_CONSUME(), KeyValueEncodingType.SEPARATED)`, and the message consists of payload. + +### MongoDB Configuration +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `mongodb.hosts` | true | null | The comma-separated list of hostname and port pairs (in the form 'host' or 'host:port') of the MongoDB servers in the replica set. The list contains a single hostname and a port pair. If mongodb.members.auto.discover is set to false, the host and port pair are prefixed with the replica set name (e.g., rs0/localhost:27017). | +| `mongodb.name` | true | null | A unique name that identifies the connector and/or MongoDB replica set or sharded cluster that this connector monitors. 
Each server should be monitored by at most one Debezium connector, since this server name prefixes all persisted Kafka topics emanating from the MongoDB replica set or cluster. | +| `mongodb.user` | true | null | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.password` | true | null | Password to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.task.id` | true | null | The taskId of the MongoDB connector that attempts to use a separate task for each replica set. | + + + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "offset.storage.topic": "offset-topic" + } + + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. 
+ + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: "offset-topic" + + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar Debezium connector. + +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar`. 
+ + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`. + - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. 
+ + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. + + ```bash + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MySQL client in docker. + + ```bash + + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. + +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "changeme", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "plugin.name": "pgoutput", + "schema.whitelist": "public", + "table.whitelist": "public.users", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. 
+ + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for postgres version 10+, official docker image: postgres:<10+> + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "changeme" + database.dbname: "postgres" + database.server.name: "dbserver1" + plugin.name: "pgoutput" + schema.whitelist: "public" + table.whitelist: "public.users" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +Notice that `pgoutput` is a standard plugin of Postgres introduced in version 10 - [see Postgres architecture docu](https://www.postgresql.org/docs/10/logical-replication-architecture.html). You don't need to install anything, just make sure the WAL level is set to `logical` (see docker command below and [Postgres docu](https://www.postgresql.org/docs/current/runtime-config-wal.html)). + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker run -d -it --rm \ + --name pulsar-postgres \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=changeme \ + postgres:13.3 -c wal_level=logical + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar`. 
+ + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "changeme","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "public","table.whitelist": "public.users","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`. + - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + + ``` + +4. Subscribe the topic _sub-users_ for the _public.users_ table. 
+ + ``` + + $ bin/pulsar-client consume -s "sub-users" public/default/dbserver1.public.users -n 0 + + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + + $ docker exec -it pulsar-postgres /bin/bash + + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to create sample data in the table _users_. + + ``` + + psql -U postgres -h localhost -p 5432 + Password for user postgres: + + CREATE TABLE users( + id BIGINT GENERATED ALWAYS AS IDENTITY, PRIMARY KEY(id), + hash_firstname TEXT NOT NULL, + hash_lastname TEXT NOT NULL, + gender VARCHAR(6) NOT NULL CHECK (gender IN ('male', 'female')) + ); + + INSERT INTO users(hash_firstname, hash_lastname, gender) + SELECT md5(RANDOM()::TEXT), md5(RANDOM()::TEXT), CASE WHEN RANDOM() < 0.5 THEN 'male' ELSE 'female' END FROM generate_series(1, 100); + + postgres=# select * from users; + + id | hash_firstname | hash_lastname | gender + -------+----------------------------------+----------------------------------+-------- + 1 | 02bf7880eb489edc624ba637f5ab42bd | 3e742c2cc4217d8e3382cc251415b2fb | female + 2 | dd07064326bb9119189032316158f064 | 9c0e938f9eddbd5200ba348965afbc61 | male + 3 | 2c5316fdd9d6595c1cceb70eed12e80c | 8a93d7d8f9d76acfaaa625c82a03ea8b | female + 4 | 3dfa3b4f70d8cd2155567210e5043d2b | 32c156bc28f7f03ab5d28e2588a3dc19 | female + + + postgres=# UPDATE users SET hash_firstname='maxim' WHERE id=1; + UPDATE 1 + + ``` + + In the terminal window where you subscribed to the topic, you can receive the following messages. 
+ + ```bash + + ----- got message ----- + {"before":null,"after":{"id":1,"hash_firstname":"maxim","hash_lastname":"292113d30a3ccee0e19733dd7f88b258","gender":"male"},"source":{"version":"1.0.0.Final","connector":"postgresql","name":"foobar","ts_ms":1624045862644,"snapshot":"false","db":"postgres","schema":"public","table":"users","txId":595,"lsn":24419784,"xmin":null},"op":"u","ts_ms":1624045862648} + ...many more + + ``` + +## Example of MongoDB + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "mongodb.hosts": "rs0/mongodb:27017", + "mongodb.name": "dbserver1", + "mongodb.user": "debezium", + "mongodb.password": "dbz", + "mongodb.task.id": "1", + "database.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-mongodb-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/resources/debezium-mongodb-source-config.yaml) below to the `debezium-mongodb-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mongodb-source" + topicName: "debezium-mongodb-topic" + archive: "connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mongodb, docker image: debezium/example-mongodb:0.10 + mongodb.hosts: "rs0/mongodb:27017" + mongodb.name: "dbserver1" + mongodb.user: "debezium" + mongodb.password: "dbz" + mongodb.task.id: "1" + database.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a MongoDB collection using the Pulsar Debezium connector. + + +1. Start a MongoDB server with a database from which Debezium can capture changes. 
+ + ```bash + + $ docker pull debezium/example-mongodb:0.10 + $ docker run -d -it --rm --name pulsar-mongodb -e MONGODB_USER=mongodb -e MONGODB_PASSWORD=mongodb -p 27017:27017 debezium/example-mongodb:0.10 + + ``` + + Use the following commands to initialize the data. + + ``` bash + + ./usr/local/bin/init-inventory.sh + + ``` + + If the local host cannot access the container network, you can update the file ```/etc/hosts``` and add a rule ```127.0.0.1 6f114527a95f```. Here, ```6f114527a95f``` is the container ID, which you can get by running ```docker ps -a```. + + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar \ + --name debezium-mongodb-source \ + --destination-topic-name debezium-mongodb-topic \ + --tenant public \ + --namespace default \ + --source-config '{"mongodb.hosts": "rs0/mongodb:27017","mongodb.name": "dbserver1","mongodb.user": "debezium","mongodb.password": "dbz","mongodb.task.id": "1","database.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`. + - One topic (`database.history.pulsar.topic`) for storing the database history information. 
The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mongodb-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MongoDB client in docker. + + ```bash + + $ docker exec -it pulsar-mongodb /bin/bash + + ``` + +6. A MongoDB client pops out. + + ```bash + + mongo -u debezium -p dbz --authenticationDatabase admin localhost:27017/inventory + db.products.update({"_id":NumberLong(104)},{$set:{weight:1.25}}) + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. 
+ + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":"104"}}, value = {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"int64","optional":true,"field":"h"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"after":"{\"_id\": {\"$numberLong\": \"104\"},\"name\": \"hammer\",\"description\": \"12oz carpenter's hammer\",\"weight\": 1.25,\"quantity\": 4}","patch":null,"source":{"version":"0.10.0.Final","connector":"mongodb","name":"dbserver1","ts_ms":1573541905000,"snapshot":"true","db":"inventory","rs":"rs0","collection":"products","ord":1,"h":4983083486544392763},"op":"r","ts_ms":1573541909761}}. 
+ + ``` + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt + +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$Lambda$203/385424085.accept(Unknown Source) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at 
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) + +``` + +If you encounter the above problems in synchronizing data, please refer to [this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + +```$xslt + +max.queue.size= + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-debug.md b/site2/website/versioned_docs/version-2.8.x/io-debug.md new file mode 100644 index 0000000000000..844e101d00d2a --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-debug.md @@ -0,0 +1,407 @@ +--- +id: io-debug +title: How to debug Pulsar connectors +sidebar_label: "Debug" +original_id: io-debug +--- +This guide explains how to debug connectors in localrun or cluster mode and gives a debugging checklist. +To better demonstrate how to debug Pulsar connectors, here takes a Mongo sink connector as an example. + +**Deploy a Mongo sink environment** +1. Start a Mongo service. + + ```bash + + docker pull mongo:4 + docker run -d -p 27017:27017 --name pulsar-mongo -v $PWD/data:/data/db mongo:4 + + ``` + +2. Create a DB and a collection. 
+ + ```bash + + docker exec -it pulsar-mongo /bin/bash + mongo + > use pulsar + > db.createCollection('messages') + > exit + + ``` + +3. Start Pulsar standalone. + + ```bash + + docker pull apachepulsar/pulsar:2.4.0 + docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --link pulsar-mongo --name pulsar-mongo-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone + + ``` + +4. Configure the Mongo sink with the `mongo-sink-config.yaml` file. + + ```bash + + configs: + mongoUri: "mongodb://pulsar-mongo:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + + ``` + + ```bash + + docker cp mongo-sink-config.yaml pulsar-mongo-standalone:/pulsar/ + + ``` + +5. Download the Mongo sink nar package. + + ```bash + + docker exec -it pulsar-mongo-standalone /bin/bash + curl -O http://apache.01link.hk/pulsar/pulsar-2.4.0/connectors/pulsar-io-mongo-2.4.0.nar + + ``` + +## Debug in localrun mode +Start the Mongo sink in localrun mode using the `localrun` command. +:::tip + +For more information about the `localrun` command, see [`localrun`](reference-connector-admin.md/#localrun-1). + +::: + +```bash + +./bin/pulsar-admin sinks localrun \ +--archive pulsar-io-mongo-2.4.0.nar \ +--tenant public --namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 + +``` + +### Use connector log +Use one of the following methods to get a connector log in localrun mode: +* After executing the `localrun` command, the **log is automatically printed on the console**. 
+* The log is located at: + + ```bash + + logs/functions/tenant/namespace/function-name/function-name-instance-id.log + + ``` + + **Example** + + The log path of the Mongo sink connector is: + + ```bash + + logs/functions/public/default/pulsar-mongo-sink/pulsar-mongo-sink-0.log + + ``` + +To clearly explain the log information, the following breaks down the large block of log output into smaller blocks and adds a description for each block. +* This piece of log information shows the storage path of the nar package after decompression. + + ``` + + 08:21:54.132 [main] INFO org.apache.pulsar.common.nar.NarClassLoader - Created class loader with paths: [file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/, file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/, + + ``` + + :::tip + + If `class cannot be found` exception is thrown, check whether the nar file is decompressed in the folder `file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/` or not. + + ::: + +* This piece of log information illustrates the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, resources, and so on, which can be used to **check whether the Mongo sink connector is configured correctly or not**. 
+ + ```bash + + 08:21:55.390 [main] INFO org.apache.pulsar.functions.runtime.ThreadRuntime - ThreadContainer starting function with instance config InstanceConfig(instanceId=0, functionId=853d60a1-0c48-44d5-9a5c-6917386476b2, functionVersion=c2ce1458-b69e-4175-88c0-a0a856a2be8c, functionDetails=tenant: "public" + namespace: "default" + name: "pulsar-mongo-sink" + className: "org.apache.pulsar.functions.api.utils.IdentityFunction" + autoAck: true + parallelism: 1 + source { + typeClassName: "[B" + inputSpecs { + key: "test-mongo" + value { + } + } + cleanupSubscription: true + } + sink { + className: "org.apache.pulsar.io.mongodb.MongoSink" + configs: "{\"mongoUri\":\"mongodb://pulsar-mongo:27017\",\"database\":\"pulsar\",\"collection\":\"messages\",\"batchSize\":2,\"batchTimeMs\":500}" + typeClassName: "[B" + } + resources { + cpu: 1.0 + ram: 1073741824 + disk: 10737418240 + } + componentType: SINK + , maxBufferedTuples=1024, functionAuthenticationSpec=null, port=38459, clusterName=local) + + ``` + +* This piece of log information demonstrates the status of the connections to Mongo and configuration information. 
+ + ```bash + + 08:21:56.231 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.connection - Opened connection [connectionId{localValue:1, serverValue:8}] to pulsar-mongo:27017 + 08:21:56.326 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.cluster - Monitor thread successfully connected to server with description ServerDescription{address=pulsar-mongo:27017, type=STANDALONE, state=CONNECTED, ok=true, version=ServerVersion{versionList=[4, 2, 0]}, minWireVersion=0, maxWireVersion=8, maxDocumentSize=16777216, logicalSessionTimeoutMinutes=30, roundTripTimeNanos=89058800} + + ``` + +* This piece of log information explains the configuration of consumers and clients, including the topic name, subscription name, subscription type, and so on. + + ```bash + + 08:21:56.719 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Starting Pulsar consumer status recorder with config: { + "topicNames" : [ "test-mongo" ], + "topicsPattern" : null, + "subscriptionName" : "public/default/pulsar-mongo-sink", + "subscriptionType" : "Shared", + "receiverQueueSize" : 1000, + "acknowledgementsGroupTimeMicros" : 100000, + "negativeAckRedeliveryDelayMicros" : 60000000, + "maxTotalReceiverQueueSizeAcrossPartitions" : 50000, + "consumerName" : null, + "ackTimeoutMillis" : 0, + "tickDurationMillis" : 1000, + "priorityLevel" : 0, + "cryptoFailureAction" : "CONSUME", + "properties" : { + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink", + "instance_id" : "0" + }, + "readCompacted" : false, + "subscriptionInitialPosition" : "Latest", + "patternAutoDiscoveryPeriod" : 1, + "regexSubscriptionMode" : "PersistentOnly", + "deadLetterPolicy" : null, + "autoUpdatePartitions" : true, + "replicateSubscriptionState" : false, + "resetIncludeHead" : false + } + 08:21:56.726 [pulsar-client-io-1-1] INFO 
org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Pulsar client config: { + "serviceUrl" : "pulsar://localhost:6650", + "authPluginClassName" : null, + "authParams" : null, + "operationTimeoutMs" : 30000, + "statsIntervalSeconds" : 60, + "numIoThreads" : 1, + "numListenerThreads" : 1, + "connectionsPerBroker" : 1, + "useTcpNoDelay" : true, + "useTls" : false, + "tlsTrustCertsFilePath" : null, + "tlsAllowInsecureConnection" : false, + "tlsHostnameVerificationEnable" : false, + "concurrentLookupRequest" : 5000, + "maxLookupRequest" : 50000, + "maxNumberOfRejectedRequestPerConnection" : 50, + "keepAliveIntervalSeconds" : 30, + "connectionTimeoutMs" : 10000, + "requestTimeoutMs" : 60000, + "defaultBackoffIntervalNanos" : 100000000, + "maxBackoffIntervalNanos" : 30000000000 + } + + ``` + +## Debug in cluster mode +You can use the following methods to debug a connector in cluster mode: +* [Use connector log](#use-connector-log) +* [Use admin CLI](#use-admin-cli) +### Use connector log +In cluster mode, multiple connectors can run on a worker. To find the log path of a specified connector, use the `workerId` to locate the connector log. +### Use admin CLI +Pulsar admin CLI helps you debug Pulsar connectors with the following subcommands: +* [`get`](#get) + +* [`status`](#status) +* [`topics stats`](#topics-stats) + +**Create a Mongo sink** + +```bash + +./bin/pulsar-admin sinks create \ +--archive pulsar-io-mongo-2.4.0.nar \ +--tenant public \ +--namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 + +``` + +### `get` +Use the `get` command to get the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, and so on. 
+ +```bash + +./bin/pulsar-admin sinks get --tenant public --namespace default --name pulsar-mongo-sink +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-mongo-sink", + "className": "org.apache.pulsar.io.mongodb.MongoSink", + "inputSpecs": { + "test-mongo": { + "isRegexPattern": false + } + }, + "configs": { + "mongoUri": "mongodb://pulsar-mongo:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": 2.0, + "batchTimeMs": 500.0 + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +:::tip + +For more information about the `get` command, see [`get`](reference-connector-admin.md/#get-1). + +::: + +### `status` +Use the `status` command to get the current status about the Mongo sink connector, such as the number of instance, the number of running instance, instanceId, workerId and so on. + +```bash + +./bin/pulsar-admin sinks status +--tenant public \ +--namespace default \ +--name pulsar-mongo-sink +{ +"numInstances" : 1, +"numRunning" : 1, +"instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-5d202832fd18-8080" + } +} ] +} + +``` + +:::tip + +For more information about the `status` command, see [`status`](reference-connector-admin.md/#stauts-1). +If there are multiple connectors running on a worker, `workerId` can locate the worker on which the specified connector is running. + +::: + +### `topics stats` +Use the `topics stats` command to get the stats for a topic and its connected producer and consumer, such as whether the topic has received messages or not, whether there is a backlog of messages or not, the available permits and other key information. 
All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +```bash + +./bin/pulsar-admin topics stats test-mongo +{ + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "averageMsgSize" : 0.0, + "storageSize" : 1, + "publishers" : [ ], + "subscriptions" : { + "public/default/pulsar-mongo-sink" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "msgBacklog" : 0, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "type" : "Shared", + "msgRateExpired" : 0.0, + "consumers" : [ { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "consumerName" : "dffdd", + "availablePermits" : 999, + "unackedMessages" : 0, + "blockedConsumerOnUnackedMsgs" : false, + "metadata" : { + "instance_id" : "0", + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink" + }, + "connectedSince" : "2019-08-26T08:48:07.582Z", + "clientVersion" : "2.4.0", + "address" : "/172.17.0.3:57790" + } ], + "isReplicated" : false + } + }, + "replication" : { }, + "deduplicationStatus" : "Disabled" +} + +``` + +:::tip + +For more information about the `topic stats` command, see [`topic stats`](http://pulsar.apache.org/docs/en/pulsar-admin/#stats-1). + +::: + +## Checklist +This checklist indicates the major areas to check when you debug connectors. It is a reminder of what to look for to ensure a thorough review and an evaluation tool to get the status of connectors. +* Does Pulsar start successfully? + +* Does the external service run normally? + +* Is the nar package complete? + +* Is the connector configuration file correct? + +* In localrun mode, run a connector and check the printed information (connector log) on the console. + +* In cluster mode: + + * Use the `get` command to get the basic information. + + * Use the `status` command to get the current status. 
+ * Use the `topics stats` command to get the stats for a specified topic and its connected producers and consumers. + + * Check the connector log. +* Enter into the external system and verify the result. diff --git a/site2/website/versioned_docs/version-2.8.x/io-develop.md b/site2/website/versioned_docs/version-2.8.x/io-develop.md new file mode 100644 index 0000000000000..d6f4f8261ac82 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-develop.md @@ -0,0 +1,421 @@ +--- +id: io-develop +title: How to develop Pulsar connectors +sidebar_label: "Develop" +original_id: io-develop +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide describes how to develop Pulsar connectors to move data +between Pulsar and other systems. + +Pulsar connectors are special [Pulsar Functions](functions-overview.md), so creating +a Pulsar connector is similar to creating a Pulsar function. + +Pulsar connectors come in two types: + +| Type | Description | Example +|---|---|--- +{@inject: github:Source:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java}|Import data from another system to Pulsar.|[RabbitMQ source connector](io-rabbitmq.md) imports the messages of a RabbitMQ queue to a Pulsar topic. +{@inject: github:Sink:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java}|Export data from Pulsar to another system.|[Kinesis sink connector](io-kinesis.md) exports the messages of a Pulsar topic to a Kinesis stream. + +## Develop + +You can develop Pulsar source connectors and sink connectors. 
+ +### Source + +Developing a source connector is to implement the {@inject: github:Source:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} +interface, which means you need to implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method and the {@inject: github:read:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + +1. Implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + + ```java + + /** + * Open connector with configuration + * + * @param config initialization config + * @param sourceContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SourceContext sourceContext) throws Exception; + + ``` + + This method is called when the source connector is initialized. + + In this method, you can retrieve all connector specific settings through the passed-in `config` parameter and initialize all necessary resources. + + For example, a Kafka connector can create a Kafka client in this `open` method. + + Besides, Pulsar runtime also provides a `SourceContext` for the + connector to access runtime resources for tasks like collecting metrics. The implementation can save the `SourceContext` for future use. + +2. Implement the {@inject: github:read:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + + ```java + + /** + * Reads the next message from source. + * If source does not have any new messages, this call should block. + * @return next message from source. The return result should never be null + * @throws Exception + */ + Record read() throws Exception; + + ``` + + If nothing to return, the implementation should be blocking rather than returning `null`. 
+ + The returned {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should encapsulate the following information, which is needed by Pulsar IO runtime. + + * {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should provide the following variables: + + |Variable|Required|Description + |---|---|--- + `TopicName`|No|Pulsar topic name from which the record originates. + `Key`|No| Messages can optionally be tagged with keys. 

    For more information, see [Routing modes](concepts-messaging.md#routing-modes).| + `Value`|Yes|Actual data of the record. + `EventTime`|No|Event time of the record from the source. + `PartitionId`|No| If the record originates from a partitioned source, it returns its `PartitionId`. 

    `PartitionId` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve an exactly-once processing guarantee. + `RecordSequence`|No|If the record originates from a sequential source, it returns its `RecordSequence`. 

    `RecordSequence` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve exactly-once processing guarantee. + `Properties` |No| If the record carries user-defined properties, it returns those properties. + `DestinationTopic`|No|Topic to which message should be written. + `Message`|No|A class which carries data sent by users.

    For more information, see [Message.java](https://github.com/apache/pulsar/blob/master/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/Message.java).| + + * {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should provide the following methods: + + Method|Description + |---|--- + `ack` |Acknowledge that the record is fully processed. + `fail`|Indicate that the record fails to be processed. + +## Handle schema information + +Pulsar IO automatically handles the schema and provides a strongly typed API based on Java generics. +If you know the schema type that you are producing, you can declare the Java class relative to that type in your source declaration. + +``` + +public class MySource implements Source { + public Record read() {} +} + +``` + +If you want to implement a source that works with any schema, you can go with `byte[]` (or `ByteBuffer`) and use Schema.AUTO_PRODUCE_BYTES(). + +``` + +public class MySource implements Source { + public Record read() { + + Schema wantedSchema = .... + Record myRecord = new MyRecordImplementation(); + .... 
+ } + class MyRecordImplementation implements Record { + public byte[] getValue() { + return ....encoded byte[]...that represents the value + } + public Schema getSchema() { + return Schema.AUTO_PRODUCE_BYTES(wantedSchema); + } + } +} + +``` + +To handle the `KeyValue` type properly, follow the guidelines for your record implementation: +- It must implement {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/KVRecord.java} interface and implement `getKeySchema`,`getValueSchema`, and `getKeyValueEncodingType` +- It must return a `KeyValue` object as `Record.getValue()` +- It may return null in `Record.getSchema()` + +When Pulsar IO runtime encounters a `KVRecord`, it brings the following changes automatically: +- Set properly the `KeyValueSchema` +- Encode the Message Key and the Message Value according to the `KeyValueEncoding` (SEPARATED or INLINE) + +:::tip + +For more information about **how to create a source connector**, see {@inject: github:KafkaSource:/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java}. + +::: + +### Sink + +Developing a sink connector **is similar to** developing a source connector, that is, you need to implement the {@inject: github:Sink:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} interface, which means implementing the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method and the {@inject: github:write:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + +1. Implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + + ```java + + /** + * Open connector with configuration + * + * @param config initialization config + * @param sinkContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SinkContext sinkContext) throws Exception; + + ``` + +2. 
Implement the {@inject: github:write:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + + ```java + + /** + * Write a message to Sink + * @param record record to write to sink + * @throws Exception + */ + void write(Record record) throws Exception; + + ``` + + During the implementation, you can decide how to write the `Value` and + the `Key` to the actual source, and leverage all the provided information such as + `PartitionId` and `RecordSequence` to achieve different processing guarantees. + + You also need to ack records (if messages are sent successfully) or fail records (if messages fail to send). + +## Handling Schema information + +Pulsar IO handles automatically the Schema and provides a strongly typed API based on Java generics. +If you know the Schema type that you are consuming from you can declare the Java class relative to that type in your Sink declaration. + +``` + +public class MySink implements Sink { + public void write(Record record) {} +} + +``` + +If you want to implement a sink that works with any schema, you can you go with the special GenericObject interface. + +``` + +public class MySink implements Sink { + public void write(Record record) { + Schema schema = record.getSchema(); + GenericObject genericObject = record.getValue(); + if (genericObject != null) { + SchemaType type = genericObject.getSchemaType(); + Object nativeObject = genericObject.getNativeObject(); + ... + } + .... + } +} + +``` + +In the case of AVRO, JSON, and Protobuf records (schemaType=AVRO,JSON,PROTOBUF_NATIVE), you can cast the +`genericObject` variable to `GenericRecord` and use `getFields()` and `getField()` API. +You are able to access the native AVRO record using `genericObject.getNativeObject()`. + +In the case of KeyValue type, you can access both the schema for the key and the schema for the value using this code. 
+ +``` + +public class MySink implements Sink { + public void write(Record record) { + Schema schema = record.getSchema(); + GenericObject genericObject = record.getValue(); + SchemaType type = genericObject.getSchemaType(); + Object nativeObject = genericObject.getNativeObject(); + if (type == SchemaType.KEY_VALUE) { + KeyValue keyValue = (KeyValue) nativeObject; + Object key = keyValue.getKey(); + Object value = keyValue.getValue(); + + KeyValueSchema keyValueSchema = (KeyValueSchema) schema; + Schema keySchema = keyValueSchema.getKeySchema(); + Schema valueSchema = keyValueSchema.getValueSchema(); + } + .... + } +} + +``` + +## Test + +Testing connectors can be challenging because Pulsar IO connectors interact with two systems +that may be difficult to mock—Pulsar and the system to which the connector is connecting. + +It is +recommended writing special tests to test the connector functionalities as below +while mocking the external service. + +### Unit test + +You can create unit tests for your connector. + +### Integration test + +Once you have written sufficient unit tests, you can add +separate integration tests to verify end-to-end functionality. + +Pulsar uses [testcontainers](https://www.testcontainers.org/) **for all integration tests**. + +:::tip + +For more information about **how to create integration tests for Pulsar connectors**, see {@inject: github:IntegrationTests:/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io}. + +::: + +## Package + +Once you've developed and tested your connector, you need to package it so that it can be submitted +to a [Pulsar Functions](functions-overview.md) cluster. + +There are two methods to +work with Pulsar Functions' runtime, that is, [NAR](#nar) and [uber JAR](#uber-jar). + +:::note + +If you plan to package and distribute your connector for others to use, you are obligated to + +::: + +license and copyright your own code properly. 
Remember to add the license and copyright to +all libraries your code uses and to your distribution. +> +> If you use the [NAR](#nar) method, the NAR plugin +automatically creates a `DEPENDENCIES` file in the generated NAR package, including the proper +licensing and copyrights of all libraries of your connector. + +### NAR + +**NAR** stands for NiFi Archive, which is a custom packaging mechanism used by Apache NiFi, to provide +a bit of Java ClassLoader isolation. + +:::tip + +For more information about **how NAR works**, see [here](https://medium.com/hashmapinc/nifi-nar-files-explained-14113f7796fd). + +::: + +Pulsar uses the same mechanism for packaging **all** [built-in connectors](io-connectors.md). + +The easiest approach to package a Pulsar connector is to create a NAR package using [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin). + +Include this [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin) in your maven project for your connector as below. + +```xml + + + + org.apache.nifi + nifi-nar-maven-plugin + 1.2.0 + + + +``` + +You must also create a `resources/META-INF/services/pulsar-io.yaml` file with the following contents: + +```yaml + +name: connector name +description: connector description +sourceClass: fully qualified class name (only if source connector) +sinkClass: fully qualified class name (only if sink connector) + +``` + +For Gradle users, there is a [Gradle Nar plugin available on the Gradle Plugin Portal](https://plugins.gradle.org/plugin/io.github.lhotari.gradle-nar-plugin). + +:::tip + +For more information about an **how to use NAR for Pulsar connectors**, see {@inject: github:TwitterFirehose:/pulsar-io/twitter/pom.xml}. + +::: + +### Uber JAR + +An alternative approach is to create an **uber JAR** that contains all of the connector's JAR files +and other resource files. No directory internal structure is necessary. 
+ +You can use [maven-shade-plugin](https://maven.apache.org/plugins/maven-shade-plugin/examples/includes-excludes.html) to create a uber JAR as below: + +```xml + + + org.apache.maven.plugins + maven-shade-plugin + 3.1.1 + + + package + + shade + + + + + *:* + + + + + + + +``` + +## Monitor + +Pulsar connectors enable you to move data in and out of Pulsar easily. It is important to ensure that the running connectors are healthy at any time. You can monitor Pulsar connectors that have been deployed with the following methods: + +- Check the metrics provided by Pulsar. + + Pulsar connectors expose the metrics that can be collected and used for monitoring the health of **Java** connectors. You can check the metrics by following the [monitoring](deploy-monitoring.md) guide. + +- Set and check your customized metrics. + + In addition to the metrics provided by Pulsar, Pulsar allows you to customize metrics for **Java** connectors. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. + +Here is an example of how to customize metrics for a Java connector. 
+ +````mdx-code-block + + + +``` + +public class TestMetricSink implements Sink { + + @Override + public void open(Map config, SinkContext sinkContext) throws Exception { + sinkContext.recordMetric("foo", 1); + } + + @Override + public void write(Record record) throws Exception { + + } + + @Override + public void close() throws Exception { + + } + } + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/io-dynamodb-source.md b/site2/website/versioned_docs/version-2.8.x/io-dynamodb-source.md new file mode 100644 index 0000000000000..ce585786eb042 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-dynamodb-source.md @@ -0,0 +1,80 @@ +--- +id: io-dynamodb-source +title: AWS DynamoDB source connector +sidebar_label: "AWS DynamoDB source connector" +original_id: io-dynamodb-source +--- + +The DynamoDB source connector pulls data from DynamoDB table streams and persists data into Pulsar. + +This connector uses the [DynamoDB Streams Kinesis Adapter](https://github.com/awslabs/dynamodb-streams-kinesis-adapter), +which uses the [Kinesis Consumer Library](https://github.com/awslabs/amazon-kinesis-client) (KCL) to do the actual +consuming of messages. The KCL uses DynamoDB to track state for consumers and requires cloudwatch access to log metrics. + + +## Configuration + +The configuration of the DynamoDB source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`initialPositionInStream`|InitialPositionInStream|false|LATEST|The position where the connector starts from.

    Below are the available options:

  • `AT_TIMESTAMP`: start from the record at or after the specified timestamp.

  • `LATEST`: start after the most recent data record.

  • `TRIM_HORIZON`: start from the oldest available data record.
  • +`startAtTime`|Date|false|" " (empty string)|If set to `AT_TIMESTAMP`, it specifies the point in time to start consumption. +`applicationName`|String|false|Pulsar IO connector|The name of the KCL application. Must be unique, as it is used to define the table name for the dynamo table used for state tracking.

    By default, the application name is included in the user agent string used to make AWS requests. This can assist with troubleshooting, for example, distinguish requests made by separate connector instances. +`checkpointInterval`|long|false|60000|The frequency of the KCL checkpoint in milliseconds. +`backoffTime`|long|false|3000|The amount of time to delay between requests when the connector encounters a throttling exception from AWS Kinesis in milliseconds. +`numRetries`|int|false|3|The number of re-attempts when the connector encounters an exception while trying to set a checkpoint. +`receiveQueueSize`|int|false|1000|The maximum number of AWS records that can be buffered inside the connector.

    Once the `receiveQueueSize` is reached, the connector does not consume any messages from Kinesis until some messages in the queue are successfully consumed. +`dynamoEndpoint`|String|false|" " (empty string)|The Dynamo end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`cloudwatchEndpoint`|String|false|" " (empty string)|The Cloudwatch end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsEndpoint`|String|false|" " (empty string)|The DynamoDB Streams end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsDynamodbStreamArn`|String|true|" " (empty string)|The DynamoDB stream arn. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    `awsCredentialProviderPlugin` has the following built-in plugins: 

  • `org.apache.pulsar.io.kinesis.AwsDefaultProviderChainPlugin`:
    this plugin uses the default AWS provider chain.
    For more information, see [using the default credential provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default).

  • `org.apache.pulsar.io.kinesis.STSAssumeRoleProviderPlugin`:
    this plugin takes a configuration via the `awsCredentialPluginParam` that describes a role to assume when running the KCL.
    **JSON configuration example**
    `{"roleArn": "arn...", "roleSessionName": "name"}`

    `awsCredentialPluginName` is a factory class which creates an AWSCredentialsProvider that is used by Kinesis sink.

    If `awsCredentialPluginName` is set to empty, the Kinesis sink creates a default AWSCredentialsProvider which accepts a JSON map of credentials in `awsCredentialPluginParam`.
  • +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Example + +Before using the DynamoDB source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsDynamodbStreamArn": "arn:aws:dynamodb:us-west-2:111122223333:table/TestTable/stream/2015-05-11T21:21:33.291", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "applicationName": "My test application", + "checkpointInterval": "30000", + "backoffTime": "4000", + "numRetries": "3", + "receiveQueueSize": 2000, + "initialPositionInStream": "TRIM_HORIZON", + "startAtTime": "2019-03-05T19:28:58.000Z" + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsDynamodbStreamArn: "arn:aws:dynamodb:us-west-2:111122223333:table/TestTable/stream/2015-05-11T21:21:33.291" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + applicationName: "My test application" + checkpointInterval: 30000 + backoffTime: 4000 + numRetries: 3 + receiveQueueSize: 2000 + initialPositionInStream: "TRIM_HORIZON" + startAtTime: "2019-03-05T19:28:58.000Z" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-elasticsearch-sink.md b/site2/website/versioned_docs/version-2.8.x/io-elasticsearch-sink.md new file mode 100644 index 0000000000000..4acedd3dd0788 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-elasticsearch-sink.md @@ -0,0 +1,173 @@ +--- +id: io-elasticsearch-sink +title: ElasticSearch sink connector +sidebar_label: "ElasticSearch sink connector" +original_id: io-elasticsearch-sink +--- + +The ElasticSearch sink connector pulls messages from Pulsar topics and persists the messages to indexes. 
+ +## Configuration + +The configuration of the ElasticSearch sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `elasticSearchUrl` | String| true |" " (empty string)| The URL of the Elasticsearch cluster to which the connector connects. | +| `indexName` | String| true |" " (empty string)| The index name to which the connector writes messages. | +| `typeName` | String | false | "_doc" | The type name to which the connector writes messages. 

    The value should be set explicitly to a valid type name other than "_doc" for Elasticsearch versions before 6.2, and left at the default otherwise. | +| `indexNumberOfShards` | int| false |1| The number of shards of the index. | +| `indexNumberOfReplicas` | int| false |1 | The number of replicas of the index. | +| `username` | String| false |" " (empty string)| The username used by the connector to connect to the elastic search cluster. 

    If `username` is set, then `password` should also be provided. | +| `password` | String| false | " " (empty string)|The password used by the connector to connect to the elastic search cluster.

    If `username` is set, then `password` should also be provided. | + +## Example + +Before using the ElasticSearch sink connector, you need to create a configuration file through one of the following methods. + +### Configuration + +#### For Elasticsearch After 6.2 + +* JSON + + ```json + + { + "elasticSearchUrl": "http://localhost:9200", + "indexName": "my_index", + "username": "scooby", + "password": "doobie" + } + + ``` + +* YAML + + ```yaml + + configs: + elasticSearchUrl: "http://localhost:9200" + indexName: "my_index" + username: "scooby" + password: "doobie" + + ``` + +#### For Elasticsearch Before 6.2 + +* JSON + + ```json + + { + "elasticSearchUrl": "http://localhost:9200", + "indexName": "my_index", + "typeName": "doc", + "username": "scooby", + "password": "doobie" + } + + ``` + +* YAML + + ```yaml + + configs: + elasticSearchUrl: "http://localhost:9200" + indexName: "my_index" + typeName: "doc" + username: "scooby" + password: "doobie" + + ``` + +### Usage + +1. Start a single node Elasticsearch cluster. + + ```bash + + $ docker run -p 9200:9200 -p 9300:9300 \ + -e "discovery.type=single-node" \ + docker.elastic.co/elasticsearch/elasticsearch:7.5.1 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + + Make sure the NAR file is available at `connectors/pulsar-io-elastic-search-@pulsar:version@.nar`. + +3. Start the Pulsar Elasticsearch connector in local run mode using one of the following methods. + * Use the **JSON** configuration as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-elastic-search-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name elasticsearch-test-sink \ + --sink-config '{"elasticSearchUrl":"http://localhost:9200","indexName": "my_index","username": "scooby","password": "doobie"}' \ + --inputs elasticsearch_test + + ``` + + * Use the **YAML** configuration file as shown previously. 
+ + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-elastic-search-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name elasticsearch-test-sink \ + --sink-config-file elasticsearch-sink.yml \ + --inputs elasticsearch_test + + ``` + +4. Publish records to the topic. + + ```bash + + $ bin/pulsar-client produce elasticsearch_test --messages "{\"a\":1}" + + ``` + +5. Check documents in Elasticsearch. + + * refresh the index + + ```bash + + $ curl -s http://localhost:9200/my_index/_refresh + + ``` + + + * search documents + + ```bash + + $ curl -s http://localhost:9200/my_index/_search + + ``` + + You can see the record that published earlier has been successfully written into Elasticsearch. + + ```json + + {"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":1,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"my_index","_type":"_doc","_id":"FSxemm8BLjG_iC0EeTYJ","_score":1.0,"_source":{"a":1}}]}} + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-file-source.md b/site2/website/versioned_docs/version-2.8.x/io-file-source.md new file mode 100644 index 0000000000000..e9d710cce65e8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-file-source.md @@ -0,0 +1,160 @@ +--- +id: io-file-source +title: File source connector +sidebar_label: "File source connector" +original_id: io-file-source +--- + +The File source connector pulls messages from files in directories and persists the messages to Pulsar topics. + +## Configuration + +The configuration of the File source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `inputDirectory` | String|true | No default value|The input directory to pull files. 
| +| `recurse` | Boolean|false | true | Whether to pull files from subdirectory or not.| +| `keepFile` |Boolean|false | false | If set to true, the file is not deleted after it is processed, which means the file can be picked up continually. | +| `fileFilter` | String|false| [^\\.].* | The file whose name matches the given regular expression is picked up. | +| `pathFilter` | String |false | NULL | If `recurse` is set to true, the subdirectory whose path matches the given regular expression is scanned. | +| `minimumFileAge` | Integer|false | 0 | The minimum age that a file can be processed.

    Any file younger than `minimumFileAge` (according to the last modification date) is ignored. | +| `maximumFileAge` | Long|false |Long.MAX_VALUE | The maximum age that a file can be processed.

    Any file older than `maximumFileAge` (according to last modification date) is ignored. | +| `minimumSize` |Integer| false |1 | The minimum size (in bytes) that a file can be processed. | +| `maximumSize` | Double|false |Double.MAX_VALUE| The maximum size (in bytes) that a file can be processed. | +| `ignoreHiddenFiles` |Boolean| false | true| Whether the hidden files should be ignored or not. | +| `pollingInterval`|Long | false | 10000L | Indicates how long to wait before performing a directory listing. | +| `numWorkers` | Integer | false | 1 | The number of worker threads that process files.

    This allows you to process a larger number of files concurrently.

    However, setting this to a value greater than 1 makes the data from multiple files mixed in the target topic. | + +### Example + +Before using the File source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "inputDirectory": "/Users/david", + "recurse": true, + "keepFile": true, + "fileFilter": "[^\\.].*", + "pathFilter": "*", + "minimumFileAge": 0, + "maximumFileAge": 9999999999, + "minimumSize": 1, + "maximumSize": 5000000, + "ignoreHiddenFiles": true, + "pollingInterval": 5000, + "numWorkers": 1 + } + + ``` + +* YAML + + ```yaml + + configs: + inputDirectory: "/Users/david" + recurse: true + keepFile: true + fileFilter: "[^\\.].*" + pathFilter: "*" + minimumFileAge: 0 + maximumFileAge: 9999999999 + minimumSize: 1 + maximumSize: 5000000 + ignoreHiddenFiles: true + pollingInterval: 5000 + numWorkers: 1 + + ``` + +## Usage + +Here is an example of using the File source connecter. + +1. Pull a Pulsar image. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + ``` + +2. Start Pulsar standalone. + + ```bash + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +3. Create a configuration file _file-connector.yaml_. + + ```yaml + + configs: + inputDirectory: "/opt" + + ``` + +4. Copy the configuration file _file-connector.yaml_ to the container. + + ```bash + + $ docker cp connectors/file-connector.yaml pulsar-standalone:/pulsar/ + + ``` + +5. Download the File source connector. + + ```bash + + $ curl -O https://mirrors.tuna.tsinghua.edu.cn/apache/pulsar/pulsar-{version}/connectors/pulsar-io-file-{version}.nar + + ``` + +6. Start the File source connector. 
+ + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + + $ ./bin/pulsar-admin sources localrun \ + --archive /pulsar/pulsar-io-file-{version}.nar \ + --name file-test \ + --destination-topic-name pulsar-file-test \ + --source-config-file /pulsar/file-connector.yaml + + ``` + +7. Start a consumer. + + ```bash + + ./bin/pulsar-client consume -s file-test -n 0 pulsar-file-test + + ``` + +8. Write the message to the file _test.txt_. + + ```bash + + echo "hello world!" > /opt/test.txt + + ``` + + The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello world! + + ``` + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/io-flume-sink.md b/site2/website/versioned_docs/version-2.8.x/io-flume-sink.md new file mode 100644 index 0000000000000..b2ace53702f8c --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-flume-sink.md @@ -0,0 +1,56 @@ +--- +id: io-flume-sink +title: Flume sink connector +sidebar_label: "Flume sink connector" +original_id: io-flume-sink +--- + +The Flume sink connector pulls messages from Pulsar topics to logs. + +## Configuration + +The configuration of the Flume sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. + +### Example + +Before using the Flume sink connector, you need to create a configuration file through one of the following methods. 
+ +> For more information about the `sink.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/sink.conf). + +* JSON + + ```json + + { + "name": "a1", + "confFile": "sink.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + + ``` + +* YAML + + ```yaml + + configs: + name: a1 + confFile: sink.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-flume-source.md b/site2/website/versioned_docs/version-2.8.x/io-flume-source.md new file mode 100644 index 0000000000000..b7fd7edad8811 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-flume-source.md @@ -0,0 +1,56 @@ +--- +id: io-flume-source +title: Flume source connector +sidebar_label: "Flume source connector" +original_id: io-flume-source +--- + +The Flume source connector pulls messages from logs to Pulsar topics. + +## Configuration + +The configuration of the Flume source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. + +### Example + +Before using the Flume source connector, you need to create a configuration file through one of the following methods. + +> For more information about the `source.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/source.conf). 
+ +* JSON + + ```json + + { + "name": "a1", + "confFile": "source.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + + ``` + +* YAML + + ```yaml + + configs: + name: a1 + confFile: source.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-hbase-sink.md b/site2/website/versioned_docs/version-2.8.x/io-hbase-sink.md new file mode 100644 index 0000000000000..1737b00fa2680 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-hbase-sink.md @@ -0,0 +1,67 @@ +--- +id: io-hbase-sink +title: HBase sink connector +sidebar_label: "HBase sink connector" +original_id: io-hbase-sink +--- + +The HBase sink connector pulls the messages from Pulsar topics +and persists the messages to HBase tables + +## Configuration + +The configuration of the HBase sink connector has the following properties. + +### Property + +| Name | Type|Default | Required | Description | +|------|---------|----------|-------------|--- +| `hbaseConfigResources` | String|None | false | HBase system configuration `hbase-site.xml` file. | +| `zookeeperQuorum` | String|None | true | HBase system configuration about `hbase.zookeeper.quorum` value. | +| `zookeeperClientPort` | String|2181 | false | HBase system configuration about `hbase.zookeeper.property.clientPort` value. | +| `zookeeperZnodeParent` | String|/hbase | false | HBase system configuration about `zookeeper.znode.parent` value. | +| `tableName` | None |String | true | HBase table, the value is `namespace:tableName`. | +| `rowKeyName` | String|None | true | HBase table rowkey name. | +| `familyName` | String|None | true | HBase table column family name. | +| `qualifierNames` |String| None | true | HBase table column qualifier names. | +| `batchTimeMs` | Long|1000l| false | HBase table operation timeout in milliseconds. | +| `batchSize` | int|200| false | Batch size of updates made to the HBase table. 
| + +### Example + +Before using the HBase sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "hbaseConfigResources": "hbase-site.xml", + "zookeeperQuorum": "localhost", + "zookeeperClientPort": "2181", + "zookeeperZnodeParent": "/hbase", + "tableName": "pulsar_hbase", + "rowKeyName": "rowKey", + "familyName": "info", + "qualifierNames": [ 'name', 'address', 'age'] + } + + ``` + +* YAML + + ```yaml + + configs: + hbaseConfigResources: "hbase-site.xml" + zookeeperQuorum: "localhost" + zookeeperClientPort: "2181" + zookeeperZnodeParent: "/hbase" + tableName: "pulsar_hbase" + rowKeyName: "rowKey" + familyName: "info" + qualifierNames: [ 'name', 'address', 'age'] + + ``` + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/io-hdfs2-sink.md b/site2/website/versioned_docs/version-2.8.x/io-hdfs2-sink.md new file mode 100644 index 0000000000000..4a8527154430d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-hdfs2-sink.md @@ -0,0 +1,64 @@ +--- +id: io-hdfs2-sink +title: HDFS2 sink connector +sidebar_label: "HDFS2 sink connector" +original_id: io-hdfs2-sink +--- + +The HDFS2 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS2 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files are read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.


    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY
  • | +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| true, if `compression` is set to `None`. | None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| true | None | The extension added to the files written to HDFS.


    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.


    Setting this property to 1 makes every record be sent to disk before the record is acked.


    Setting this property to a higher value allows buffering records before flushing them to disk. +| `subdirectoryPattern` | String | false | None | A subdirectory associated with the created time of the sink.
    The pattern is the formatted pattern of `directory`'s subdirectory.

    See [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html) for pattern's syntax. | + +### Example + +Before using the HDFS2 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "fileExtension": ".log", + "compression": "SNAPPY", + "subdirectoryPattern": "yyyy-MM-dd" + } + + ``` + +* YAML + + ```yaml + + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + fileExtension: ".log" + compression: "SNAPPY" + subdirectoryPattern: "yyyy-MM-dd" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-hdfs3-sink.md b/site2/website/versioned_docs/version-2.8.x/io-hdfs3-sink.md new file mode 100644 index 0000000000000..aec065a25db7f --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-hdfs3-sink.md @@ -0,0 +1,59 @@ +--- +id: io-hdfs3-sink +title: HDFS3 sink connector +sidebar_label: "HDFS3 sink connector" +original_id: io-hdfs3-sink +--- + +The HDFS3 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS3 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files are read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.


    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY
  • | +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| false |None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| false | None| The extension added to the files written to HDFS.


    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.


    Setting this property to 1 makes every record be sent to disk before the record is acked.


    Setting this property to a higher value allows buffering records before flushing them to disk. + +### Example + +Before using the HDFS3 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "compression": "SNAPPY" + } + + ``` + +* YAML + + ```yaml + + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + compression: "SNAPPY" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-influxdb-sink.md b/site2/website/versioned_docs/version-2.8.x/io-influxdb-sink.md new file mode 100644 index 0000000000000..9382f8c03121c --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-influxdb-sink.md @@ -0,0 +1,119 @@ +--- +id: io-influxdb-sink +title: InfluxDB sink connector +sidebar_label: "InfluxDB sink connector" +original_id: io-influxdb-sink +--- + +The InfluxDB sink connector pulls messages from Pulsar topics +and persists the messages to InfluxDB. + +The InfluxDB sink provides different configurations for InfluxDBv1 and v2 respectively. + +## Configuration + +The configuration of the InfluxDB sink connector has the following properties. + +### Property +#### InfluxDBv2 +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `influxdbUrl` |String| true|" " (empty string) | The URL of the InfluxDB instance. | +| `token` | String|true| " " (empty string) |The authentication token used to authenticate to InfluxDB. | +| `organization` | String| true|" " (empty string) | The InfluxDB organization to write to. | +| `bucket` |String| true | " " (empty string)| The InfluxDB bucket to write to. | +| `precision` | String|false| ns | The timestamp precision for writing data to InfluxDB.

    Below are the available options:
  • ns
  • us
  • ms
  • s
  • | +| `logLevel` | String|false| NONE|The log level for InfluxDB request and response.

    Below are the available options:
  • NONE
  • BASIC
  • HEADERS
  • FULL
  • | +| `gzipEnable` | boolean|false | false | Whether to enable gzip or not. | +| `batchTimeMs` |long|false| 1000L | The InfluxDB operation time in milliseconds. | +| `batchSize` | int|false|200| The batch size of writing to InfluxDB. | + +#### InfluxDBv1 +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `influxdbUrl` |String| true|" " (empty string) | The URL of the InfluxDB instance. | +| `username` | String|false| " " (empty string) |The username used to authenticate to InfluxDB. | +| `password` | String| false|" " (empty string) | The password used to authenticate to InfluxDB. | +| `database` |String| true | " " (empty string)| The InfluxDB to which write messages. | +| `consistencyLevel` | String|false|ONE | The consistency level for writing data to InfluxDB.

    Below are the available options:
  • ALL
  • ANY
  • ONE
  • QUORUM
  • | +| `logLevel` | String|false| NONE|The log level for InfluxDB request and response.

    Below are the available options:
  • NONE
  • BASIC
  • HEADERS
  • FULL
  • | +| `retentionPolicy` | String|false| autogen| The retention policy for InfluxDB. | +| `gzipEnable` | boolean|false | false | Whether to enable gzip or not. | +| `batchTimeMs` |long|false| 1000L | The InfluxDB operation time in milliseconds. | +| `batchSize` | int|false|200| The batch size of writing to InfluxDB. | + +### Example +Before using the InfluxDB sink connector, you need to create a configuration file through one of the following methods. +#### InfluxDBv2 +* JSON + + ```json + + { + "influxdbUrl": "http://localhost:9999", + "organization": "example-org", + "bucket": "example-bucket", + "token": "xxxx", + "precision": "ns", + "logLevel": "NONE", + "gzipEnable": false, + "batchTimeMs": 1000, + "batchSize": 100 + } + + ``` + + +* YAML + + ```yaml + + configs: + influxdbUrl: "http://localhost:9999" + organization: "example-org" + bucket: "example-bucket" + token: "xxxx" + precision: "ns" + logLevel: "NONE" + gzipEnable: false + batchTimeMs: 1000 + batchSize: 100 + + ``` + + +#### InfluxDBv1 + +* JSON + + ```json + + { + "influxdbUrl": "http://localhost:8086", + "database": "test_db", + "consistencyLevel": "ONE", + "logLevel": "NONE", + "retentionPolicy": "autogen", + "gzipEnable": false, + "batchTimeMs": 1000, + "batchSize": 100 + } + + ``` + +* YAML + + ```yaml + + configs: + influxdbUrl: "http://localhost:8086" + database: "test_db" + consistencyLevel: "ONE" + logLevel: "NONE" + retentionPolicy: "autogen" + gzipEnable: false + batchTimeMs: 1000 + batchSize: 100 + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-jdbc-sink.md b/site2/website/versioned_docs/version-2.8.x/io-jdbc-sink.md new file mode 100644 index 0000000000000..77dbb61fccd7e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-jdbc-sink.md @@ -0,0 +1,157 @@ +--- +id: io-jdbc-sink +title: JDBC sink connector +sidebar_label: "JDBC sink connector" +original_id: io-jdbc-sink +--- + +The JDBC sink connectors allow pulling messages from Pulsar topics +and persists 
the messages to ClickHouse, MariaDB, PostgreSQL, and SQLite. + +> Currently, INSERT, DELETE and UPDATE operations are supported. + +## Configuration + +The configuration of all JDBC sink connectors has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `userName` | String|false | " " (empty string) | The username used to connect to the database specified by `jdbcUrl`.

    **Note: `userName` is case-sensitive.**| +| `password` | String|false | " " (empty string)| The password used to connect to the database specified by `jdbcUrl`.

    **Note: `password` is case-sensitive.**| +| `jdbcUrl` | String|true | " " (empty string) | The JDBC URL of the database to which the connector connects. | +| `tableName` | String|true | " " (empty string) | The name of the table to which the connector writes. | +| `nonKey` | String|false | " " (empty string) | A comma-separated list contains the fields used in updating events. | +| `key` | String|false | " " (empty string) | A comma-separated list contains the fields used in `where` condition of updating and deleting events. | +| `timeoutMs` | int| false|500 | The JDBC operation timeout in milliseconds. | +| `batchSize` | int|false | 200 | The batch size of updates made to the database. | + +### Example for ClickHouse + +* JSON + + ```json + + { + "userName": "clickhouse", + "password": "password", + "jdbcUrl": "jdbc:clickhouse://localhost:8123/pulsar_clickhouse_jdbc_sink", + "tableName": "pulsar_clickhouse_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-clickhouse-sink" + topicName: "persistent://public/default/jdbc-clickhouse-topic" + sinkType: "jdbc-clickhouse" + configs: + userName: "clickhouse" + password: "password" + jdbcUrl: "jdbc:clickhouse://localhost:8123/pulsar_clickhouse_jdbc_sink" + tableName: "pulsar_clickhouse_jdbc_sink" + + ``` + +### Example for MariaDB + +* JSON + + ```json + + { + "userName": "mariadb", + "password": "password", + "jdbcUrl": "jdbc:mariadb://localhost:3306/pulsar_mariadb_jdbc_sink", + "tableName": "pulsar_mariadb_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-mariadb-sink" + topicName: "persistent://public/default/jdbc-mariadb-topic" + sinkType: "jdbc-mariadb" + configs: + userName: "mariadb" + password: "password" + jdbcUrl: "jdbc:mariadb://localhost:3306/pulsar_mariadb_jdbc_sink" + tableName: "pulsar_mariadb_jdbc_sink" + + ``` + +### Example for PostgreSQL + +Before using the JDBC PostgreSQL sink connector, you 
need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "userName": "postgres", + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "tableName": "pulsar_postgres_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-postgres-sink" + topicName: "persistent://public/default/jdbc-postgres-topic" + sinkType: "jdbc-postgres" + configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink" + tableName: "pulsar_postgres_jdbc_sink" + + ``` + +For more information on **how to use this JDBC sink connector**, see [connect Pulsar to PostgreSQL](io-quickstart.md#connect-pulsar-to-postgresql). + +### Example for SQLite + +* JSON + + ```json + + { + "jdbcUrl": "jdbc:sqlite:db.sqlite", + "tableName": "pulsar_sqlite_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-sqlite-sink" + topicName: "persistent://public/default/jdbc-sqlite-topic" + sinkType: "jdbc-sqlite" + configs: + jdbcUrl: "jdbc:sqlite:db.sqlite" + tableName: "pulsar_sqlite_jdbc_sink" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-kafka-sink.md b/site2/website/versioned_docs/version-2.8.x/io-kafka-sink.md new file mode 100644 index 0000000000000..09dad4ce70bac --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-kafka-sink.md @@ -0,0 +1,72 @@ +--- +id: io-kafka-sink +title: Kafka sink connector +sidebar_label: "Kafka sink connector" +original_id: io-kafka-sink +--- + +The Kafka sink connector pulls messages from Pulsar topics and persists the messages +to Kafka topics. + +This guide explains how to configure and use the Kafka sink connector. + +## Configuration + +The configuration of the Kafka sink connector has the following parameters. 
+ +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. | +|`acks`|String|true|" " (empty string) |The number of acknowledgments that the producer requires the leader to receive before a request completes.
    This controls the durability of the sent records. +|`batchSize`|long|false|16384L|The batch size that a Kafka producer attempts to batch records together before sending them to brokers. +|`maxRequestSize`|long|false|1048576L|The maximum size of a Kafka request in bytes. +|`topic`|String|true|" " (empty string) |The Kafka topic which receives messages from Pulsar. +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringSerializer | The serializer class for Kafka producers to serialize keys. +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArraySerializer | The serializer class for Kafka producers to serialize values.


    The serializer is set by a specific implementation of [`KafkaAbstractSink`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java). +|`producerConfigProperties`|Map|false|" " (empty string)|The producer configuration properties to be passed to producers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. + + +### Example + +Before using the Kafka sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "bootstrapServers": "localhost:6667", + "topic": "test", + "acks": "1", + "batchSize": "16384", + "maxRequestSize": "1048576", + "producerConfigProperties": + { + "client.id": "test-pulsar-producer", + "security.protocol": "SASL_PLAINTEXT", + "sasl.mechanism": "GSSAPI", + "sasl.kerberos.service.name": "kafka", + "acks": "all" + } + } + + ``` + +* YAML + + ```yaml + configs: + bootstrapServers: "localhost:6667" + topic: "test" + acks: "1" + batchSize: "16384" + maxRequestSize: "1048576" + producerConfigProperties: + client.id: "test-pulsar-producer" + security.protocol: "SASL_PLAINTEXT" + sasl.mechanism: "GSSAPI" + sasl.kerberos.service.name: "kafka" + acks: "all" + ``` diff --git a/site2/website/versioned_docs/version-2.8.x/io-kafka-source.md b/site2/website/versioned_docs/version-2.8.x/io-kafka-source.md new file mode 100644 index 0000000000000..53448699e21b4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-kafka-source.md @@ -0,0 +1,226 @@ +--- +id: io-kafka-source +title: Kafka source connector +sidebar_label: "Kafka source connector" +original_id: io-kafka-source +--- + +The Kafka source connector pulls messages from Kafka topics and persists the messages +to Pulsar topics. + +This guide explains how to configure and use the Kafka source connector. + +## Configuration + +The configuration of the Kafka source connector has the following properties. + +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. 
| +| `groupId` |String| true | " " (empty string) | A unique string that identifies the group of consumer processes to which this consumer belongs. | +| `fetchMinBytes` | long|false | 1 | The minimum byte expected for each fetch response. | +| `autoCommitEnabled` | boolean |false | true | If set to true, the consumer's offset is periodically committed in the background.

    This committed offset is used when the process fails as the position from which a new consumer begins. | +| `autoCommitIntervalMs` | long|false | 5000 | The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if `autoCommitEnabled` is set to true. | +| `heartbeatIntervalMs` | long| false | 3000 | The interval between heartbeats to the consumer when using Kafka's group management facilities.

    **Note: `heartbeatIntervalMs` must be smaller than `sessionTimeoutMs`**.| +| `sessionTimeoutMs` | long|false | 30000 | The timeout used to detect consumer failures when using Kafka's group management facility. | +| `topic` | String|true | " " (empty string)| The Kafka topic which sends messages to Pulsar. | +| `consumerConfigProperties` | Map| false | " " (empty string) | The consumer configuration properties to be passed to consumers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. | +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringDeserializer | The deserializer class for Kafka consumers to deserialize keys.
    The deserializer is set by a specific implementation of [`KafkaAbstractSource`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java). +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArrayDeserializer | The deserializer class for Kafka consumers to deserialize values. +| `autoOffsetReset` | String | false | "earliest" | The default offset reset policy. | + +### Schema Management + +This Kafka source connector applies the schema to the topic depending on the data type that is present on the Kafka topic. +You can detect the data type from the `keyDeserializationClass` and `valueDeserializationClass` configuration parameters. + +If the `valueDeserializationClass` is `org.apache.kafka.common.serialization.StringDeserializer`, you can set Schema.STRING() as schema type on the Pulsar topic. + +If `valueDeserializationClass` is `io.confluent.kafka.serializers.KafkaAvroDeserializer`, Pulsar downloads the AVRO schema from the Confluent Schema Registry® +and sets it properly on the Pulsar topic. + +In this case, you need to set `schema.registry.url` inside of the `consumerConfigProperties` configuration entry +of the source. + +If `keyDeserializationClass` is not `org.apache.kafka.common.serialization.StringDeserializer`, it means +that you do not have a String as key and the Kafka Source uses the KeyValue schema type with the SEPARATED encoding. + +Pulsar supports AVRO format for keys. 
+ +In this case, you can have a Pulsar topic with the following properties: +- Schema: KeyValue schema with SEPARATED encoding +- Key: the content of key of the Kafka message (base64 encoded) +- Value: the content of value of the Kafka message +- KeySchema: the schema detected from `keyDeserializationClass` +- ValueSchema: the schema detected from `valueDeserializationClass` + +Topic compaction and partition routing use the Pulsar key, that contains the Kafka key, and so they are driven by the same value that you have on Kafka. + +When you consume data from Pulsar topics, you can use the `KeyValue` schema. In this way, you can decode the data properly. +If you want to access the raw key, you can use the `Message#getKeyBytes()` API. + +### Example + +Before using the Kafka source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "bootstrapServers": "pulsar-kafka:9092", + "groupId": "test-pulsar-io", + "topic": "my-topic", + "sessionTimeoutMs": "10000", + "autoCommitEnabled": false + } + + ``` + +* YAML + + ```yaml + + configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: false + + ``` + +## Usage + +Here is an example of using the Kafka source connector with the configuration file as shown previously. + +1. Download a Kafka client and a Kafka connector. + + ```bash + + $ wget https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/0.10.2.1/kafka-clients-0.10.2.1.jar + + $ wget https://archive.apache.org/dist/pulsar/pulsar-2.4.0/connectors/pulsar-io-kafka-2.4.0.nar + + ``` + +2. Create a network. + + ```bash + + $ docker network create kafka-pulsar + + ``` + +3. Pull a ZooKeeper image and start ZooKeeper. + + ```bash + + $ docker pull wurstmeister/zookeeper + + $ docker run -d -it -p 2181:2181 --name pulsar-kafka-zookeeper --network kafka-pulsar wurstmeister/zookeeper + + ``` + +4. 
Pull a Kafka image and start Kafka. + + ```bash + + $ docker pull wurstmeister/kafka:2.11-1.0.2 + + $ docker run -d -it --network kafka-pulsar -p 6667:6667 -p 9092:9092 -e KAFKA_ADVERTISED_HOST_NAME=pulsar-kafka -e KAFKA_ZOOKEEPER_CONNECT=pulsar-kafka-zookeeper:2181 --name pulsar-kafka wurstmeister/kafka:2.11-1.0.2 + + ``` + +5. Pull a Pulsar image and start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:@pulsar:version@ + + $ docker run -d -it --network kafka-pulsar -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-kafka-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone + + ``` + +6. Create a producer file _kafka-producer.py_. + + ```python + + from kafka import KafkaProducer + producer = KafkaProducer(bootstrap_servers='pulsar-kafka:9092') + future = producer.send('my-topic', b'hello world') + future.get() + + ``` + +7. Create a consumer file _pulsar-client.py_. + + ```python + + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', subscription_name='my-aa') + + while True: + msg = consumer.receive() + print msg + print dir(msg) + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + + ``` + +8. Copy the following files to Pulsar. + + ```bash + + $ docker cp pulsar-io-kafka-@pulsar:version@.nar pulsar-kafka-standalone:/pulsar + $ docker cp kafkaSourceConfig.yaml pulsar-kafka-standalone:/pulsar/conf + $ docker cp pulsar-client.py pulsar-kafka-standalone:/pulsar/ + $ docker cp kafka-producer.py pulsar-kafka-standalone:/pulsar/ + + ``` + +9. Open a new terminal window and start the Kafka source connector in local run mode. 
+ + ```bash + + $ docker exec -it pulsar-kafka-standalone /bin/bash + + $ ./bin/pulsar-admin source localrun \ + --archive ./pulsar-io-kafka-@pulsar:version@.nar \ + --classname org.apache.pulsar.io.kafka.KafkaBytesSource \ + --tenant public \ + --namespace default \ + --name kafka \ + --destination-topic-name my-topic \ + --source-config-file ./conf/kafkaSourceConfig.yaml \ + --parallelism 1 + + ``` + +10. Open a new terminal window and run the consumer. + + ```bash + + $ docker exec -it pulsar-kafka-standalone /bin/bash + + $ pip install kafka-python + + $ python3 kafka-producer.py + + ``` + + The following information appears on the consumer terminal window. + + ```bash + + Received message: 'hello world' + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-kinesis-sink.md b/site2/website/versioned_docs/version-2.8.x/io-kinesis-sink.md new file mode 100644 index 0000000000000..153587dcfc783 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-kinesis-sink.md @@ -0,0 +1,80 @@ +--- +id: io-kinesis-sink +title: Kinesis sink connector +sidebar_label: "Kinesis sink connector" +original_id: io-kinesis-sink +--- + +The Kinesis sink connector pulls data from Pulsar and persists data into Amazon Kinesis. + +## Configuration + +The configuration of the Kinesis sink connector has the following property. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`messageFormat`|MessageFormat|true|ONLY_RAW_PAYLOAD|Message format in which Kinesis sink converts Pulsar messages and publishes to Kinesis streams.

    Below are the available options:

  • `ONLY_RAW_PAYLOAD`: Kinesis sink directly publishes Pulsar message payload as a message into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_JSON`: Kinesis sink creates a JSON payload with Pulsar message payload, properties and encryptionCtx, and publishes JSON payload into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_FB`: Kinesis sink creates a flatbuffer serialized payload with Pulsar message payload, properties and encryptionCtx, and publishes flatbuffer payload into the configured Kinesis stream.
  • +`retainOrdering`|boolean|false|false|Whether Pulsar connectors retain ordering when moving messages from Pulsar to Kinesis. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    It is a factory class which creates an AWSCredentialsProvider that is used by Kinesis sink.

    If it is empty, the Kinesis sink creates a default AWSCredentialsProvider which accepts json-map of credentials in `awsCredentialPluginParam`. +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Built-in plugins + +The following are built-in `AwsCredentialProviderPlugin` plugins: + +* `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin` + + This plugin takes no configuration, it uses the default AWS provider chain. + + For more information, see [AWS documentation](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default). + +* `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin` + + This plugin takes a configuration (via the `awsCredentialPluginParam`) that describes a role to assume when running the KCL. + + This configuration takes the form of a small json document like: + + ```json + + {"roleArn": "arn...", "roleSessionName": "name"} + + ``` + +### Example + +Before using the Kinesis sink connector, you need to create a configuration file through one of the following methods. 
+ +* JSON + + ```json + + { + "awsEndpoint": "some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "messageFormat": "ONLY_RAW_PAYLOAD", + "retainOrdering": "true" + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + messageFormat: "ONLY_RAW_PAYLOAD" + retainOrdering: "true" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-kinesis-source.md b/site2/website/versioned_docs/version-2.8.x/io-kinesis-source.md new file mode 100644 index 0000000000000..0d07eefc3703b --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-kinesis-source.md @@ -0,0 +1,81 @@ +--- +id: io-kinesis-source +title: Kinesis source connector +sidebar_label: "Kinesis source connector" +original_id: io-kinesis-source +--- + +The Kinesis source connector pulls data from Amazon Kinesis and persists data into Pulsar. + +This connector uses the [Kinesis Consumer Library](https://github.com/awslabs/amazon-kinesis-client) (KCL) to do the actual consuming of messages. The KCL uses DynamoDB to track state for consumers. + +> Note: currently, the Kinesis source connector only supports raw messages. If you use KMS encrypted messages, the encrypted messages are sent to downstream. This connector will support decrypting messages in the future release. + + +## Configuration + +The configuration of the Kinesis source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`initialPositionInStream`|InitialPositionInStream|false|LATEST|The position where the connector starts from.

    Below are the available options:

  • `AT_TIMESTAMP`: start from the record at or after the specified timestamp.

  • `LATEST`: start after the most recent data record.

  • `TRIM_HORIZON`: start from the oldest available data record.
  • +`startAtTime`|Date|false|" " (empty string)|If `initialPositionInStream` is set to `AT_TIMESTAMP`, it specifies the point in time to start consumption. +`applicationName`|String|false|Pulsar IO connector|The name of the Amazon Kinesis application.

    By default, the application name is included in the user agent string used to make AWS requests. This can assist with troubleshooting, for example, distinguishing requests made by separate connector instances. +`checkpointInterval`|long|false|60000|The frequency of the Kinesis stream checkpoint in milliseconds. +`backoffTime`|long|false|3000|The amount of time to delay between requests when the connector encounters a throttling exception from AWS Kinesis in milliseconds. +`numRetries`|int|false|3|The number of re-attempts when the connector encounters an exception while trying to set a checkpoint. +`receiveQueueSize`|int|false|1000|The maximum number of AWS records that can be buffered inside the connector.

    Once the `receiveQueueSize` is reached, the connector does not consume any messages from Kinesis until some messages in the queue are successfully consumed. +`dynamoEndpoint`|String|false|" " (empty string)|The Dynamo end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`cloudwatchEndpoint`|String|false|" " (empty string)|The Cloudwatch end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`useEnhancedFanOut`|boolean|false|true|If set to true, it uses Kinesis enhanced fan-out.

    If set to false, it uses polling. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    `awsCredentialProviderPlugin` has the following built-in plugins:

  • `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin`:
    this plugin uses the default AWS provider chain.
    For more information, see [using the default credential provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default).

  • `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin`:
    this plugin takes a configuration via the `awsCredentialPluginParam` that describes a role to assume when running the KCL.
    **JSON configuration example**
    `{"roleArn": "arn...", "roleSessionName": "name"}`

    `awsCredentialPluginName` is a factory class which creates an AWSCredentialsProvider that is used by the Kinesis source.

    If `awsCredentialPluginName` is set to empty, the Kinesis source creates a default AWSCredentialsProvider which accepts a JSON map of credentials in `awsCredentialPluginParam`.
  • +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Example + +Before using the Kinesis source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "applicationName": "My test application", + "checkpointInterval": "30000", + "backoffTime": "4000", + "numRetries": "3", + "receiveQueueSize": 2000, + "initialPositionInStream": "TRIM_HORIZON", + "startAtTime": "2019-03-05T19:28:58.000Z" + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + applicationName: "My test application" + checkpointInterval: 30000 + backoffTime: 4000 + numRetries: 3 + receiveQueueSize: 2000 + initialPositionInStream: "TRIM_HORIZON" + startAtTime: "2019-03-05T19:28:58.000Z" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-mongo-sink.md b/site2/website/versioned_docs/version-2.8.x/io-mongo-sink.md new file mode 100644 index 0000000000000..30c15a6c28093 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-mongo-sink.md @@ -0,0 +1,56 @@ +--- +id: io-mongo-sink +title: MongoDB sink connector +sidebar_label: "MongoDB sink connector" +original_id: io-mongo-sink +--- + +The MongoDB sink connector pulls messages from Pulsar topics +and persists the messages to collections. + +## Configuration + +The configuration of the MongoDB sink connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `mongoUri` | String| true| " " (empty string) | The MongoDB URI to which the connector connects.

    For more information, see [connection string URI format](https://docs.mongodb.com/manual/reference/connection-string/). | +| `database` | String| true| " " (empty string)| The database name to which the collection belongs. | +| `collection` | String| true| " " (empty string)| The collection name to which the connector writes messages. | +| `batchSize` | int|false|100 | The batch size of writing messages to collections. | +| `batchTimeMs` |long|false|1000| The batch operation interval in milliseconds. | + + +### Example + +Before using the Mongo sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "mongoUri": "mongodb://localhost:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": "2", + "batchTimeMs": "500" + } + + ``` + +* YAML + + ```yaml + + configs: + mongoUri: "mongodb://localhost:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-netty-source.md b/site2/website/versioned_docs/version-2.8.x/io-netty-source.md new file mode 100644 index 0000000000000..e1ec8d863115b --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-netty-source.md @@ -0,0 +1,241 @@ +--- +id: io-netty-source +title: Netty source connector +sidebar_label: "Netty source connector" +original_id: io-netty-source +--- + +The Netty source connector opens a port that accepts incoming data via the configured network protocol +and publish it to user-defined Pulsar topics. + +This connector can be used in a containerized (for example, k8s) deployment. Otherwise, if the connector is running in process or thread mode, the instance may be conflicting on listening to ports. + +## Configuration + +The configuration of the Netty source connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `type` |String| true |tcp | The network protocol over which data is transmitted to netty.

    Below are the available options:
  • tcp
  • http
  • udp
  • | +| `host` | String|true | 127.0.0.1 | The host name or address on which the source instance listen. | +| `port` | int|true | 10999 | The port on which the source instance listen. | +| `numberOfThreads` |int| true |1 | The number of threads of Netty TCP server to accept incoming connections and handle the traffic of accepted connections. | + + +### Example + +Before using the Netty source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "type": "tcp", + "host": "127.0.0.1", + "port": "10911", + "numberOfThreads": "1" + } + + ``` + +* YAML + + ```yaml + + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +## Usage + +The following examples show how to use the Netty source connector with TCP and HTTP. + +### TCP + +1. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + + ``` + +4. Download the Netty source connector. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + + ``` + +5. Start the Netty source connector. + + ```bash + + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + + ``` + +6. 
Consume data. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + + ``` + +7. Open another terminal window to send data to the Netty source. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ apt-get update + + $ apt-get -y install telnet + + $ root@1d19327b2c67:/pulsar# telnet 127.0.0.1 10999 + Trying 127.0.0.1... + Connected to 127.0.0.1. + Escape character is '^]'. + hello + world + + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello + + ----- got message ----- + world + + ``` + +### HTTP + +1. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + + configs: + type: "http" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + + ``` + +4. Download the Netty source connector. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + + ``` + +5. Start the Netty source connector. + + ```bash + + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + + ``` + +6. Consume data. 
+ + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + + ``` + +7. Open another terminal window to send data to the Netty source. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ curl -X POST --data 'hello, world!' http://127.0.0.1:10999/ + + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello, world! + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-nsq-source.md b/site2/website/versioned_docs/version-2.8.x/io-nsq-source.md new file mode 100644 index 0000000000000..b61e7e100c22e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-nsq-source.md @@ -0,0 +1,21 @@ +--- +id: io-nsq-source +title: NSQ source connector +sidebar_label: "NSQ source connector" +original_id: io-nsq-source +--- + +The NSQ source connector receives messages from NSQ topics +and writes messages to Pulsar topics. + +## Configuration + +The configuration of the NSQ source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `lookupds` |String| true | " " (empty string) | A comma-separated list of nsqlookupds to connect to. | +| `topic` | String|true | " " (empty string) | The NSQ topic to transport. | +| `channel` | String |false | pulsar-transport-{$topic} | The channel to consume from on the provided NSQ topic. 
| \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/io-overview.md b/site2/website/versioned_docs/version-2.8.x/io-overview.md new file mode 100644 index 0000000000000..3db5ee34042d3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-overview.md @@ -0,0 +1,164 @@ +--- +id: io-overview +title: Pulsar connector overview +sidebar_label: "Overview" +original_id: io-overview +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Messaging systems are most powerful when you can easily use them with external systems like databases and other messaging systems. + +**Pulsar IO connectors** enable you to easily create, deploy, and manage connectors that interact with external systems, such as [Apache Cassandra](https://cassandra.apache.org), [Aerospike](https://www.aerospike.com), and many others. + + +## Concept + +Pulsar IO connectors come in two types: **source** and **sink**. + +This diagram illustrates the relationship between source, Pulsar, and sink: + +![Pulsar IO diagram](/assets/pulsar-io.png "Pulsar IO connectors (sources and sinks)") + + +### Source + +> Sources **feed data from external systems into Pulsar**. + +Common sources include other messaging systems and firehose-style data pipeline APIs. + +For the complete list of Pulsar built-in source connectors, see [source connector](io-connectors.md#source-connector). + +### Sink + +> Sinks **feed data from Pulsar into external systems**. + +Common sinks include other messaging systems and SQL and NoSQL databases. + +For the complete list of Pulsar built-in sink connectors, see [sink connector](io-connectors.md#sink-connector). + +## Processing guarantee + +Processing guarantees are used to handle errors when writing messages to Pulsar topics. + +> Pulsar connectors and Functions use the **same** processing guarantees as below. 
+ +Delivery semantic | Description +:------------------|:------- +`at-most-once` | Each message sent to a connector is to be **processed once** or **not to be processed**. +`at-least-once` | Each message sent to a connector is to be **processed once** or **more than once**. +`effectively-once` | Each message sent to a connector has **one output associated** with it. + +> Processing guarantees for connectors not just rely on Pulsar guarantee but also **relate to external systems**, that is, **the implementation of source and sink**. + +* Source: Pulsar ensures that writing messages to Pulsar topics respects to the processing guarantees. It is within Pulsar's control. + +* Sink: the processing guarantees rely on the sink implementation. If the sink implementation does not handle retries in an idempotent way, the sink does not respect to the processing guarantees. + +### Set + +When creating a connector, you can set the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +> If `--processing-guarantees` is not specified when creating a connector, the default semantic is `ATLEAST_ONCE`. + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + +````mdx-code-block + + + + +```bash + +$ bin/pulsar-admin sources create \ + --processing-guarantees ATMOST_ONCE \ + # Other source configs + +``` + +For more information about the options of `pulsar-admin sources create`, see [here](reference-connector-admin.md#create). + + + + +```bash + +$ bin/pulsar-admin sinks create \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other sink configs + +``` + +For more information about the options of `pulsar-admin sinks create`, see [here](reference-connector-admin.md#create-1). 
+ + + + +```` + +### Update + +After creating a connector, you can update the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + +````mdx-code-block + + + + +```bash + +$ bin/pulsar-admin sources update \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other source configs + +``` + +For more information about the options of `pulsar-admin sources update`, see [here](reference-connector-admin.md#update). + + + + +```bash + +$ bin/pulsar-admin sinks update \ + --processing-guarantees ATMOST_ONCE \ + # Other sink configs + +``` + +For more information about the options of `pulsar-admin sinks update`, see [here](reference-connector-admin.md#update-1). + + + + +```` + + +## Work with connector + +You can manage Pulsar connectors (for example, create, update, start, stop, restart, reload, delete and perform other operations on connectors) via the [Connector Admin CLI](reference-connector-admin.md) with [sources](io-cli.md#sources) and [sinks](io-cli.md#sinks) subcommands. + +Connectors (sources and sinks) and Functions are components of instances, and they all run on Functions workers. When managing a source, sink or function via [Connector Admin CLI](reference-connector-admin.md) or [Functions Admin CLI](functions-cli.md), an instance is started on a worker. For more information, see [Functions worker](functions-worker.md#run-functions-worker-separately). 
+ diff --git a/site2/website/versioned_docs/version-2.8.x/io-quickstart.md b/site2/website/versioned_docs/version-2.8.x/io-quickstart.md new file mode 100644 index 0000000000000..8474c93f51336 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-quickstart.md @@ -0,0 +1,963 @@ +--- +id: io-quickstart +title: How to connect Pulsar to database +sidebar_label: "Get started" +original_id: io-quickstart +--- + +This tutorial provides a hands-on look at how you can move data out of Pulsar without writing a single line of code. + +It is helpful to review the [concepts](io-overview.md) for Pulsar I/O with running the steps in this guide to gain a deeper understanding. + +At the end of this tutorial, you are able to: + +- [Connect Pulsar to Cassandra](#Connect-Pulsar-to-Cassandra) + +- [Connect Pulsar to PostgreSQL](#Connect-Pulsar-to-PostgreSQL) + +:::tip + +* These instructions assume you are running Pulsar in [standalone mode](getting-started-standalone.md). However, all +the commands used in this tutorial can be used in a multi-nodes Pulsar cluster without any changes. +* All the instructions are assumed to run at the root directory of a Pulsar binary distribution. + +::: + +## Install Pulsar and built-in connector + +Before connecting Pulsar to a database, you need to install Pulsar and the desired built-in connector. + +For more information about **how to install a standalone Pulsar and built-in connectors**, see [here](getting-started-standalone.md/#installing-pulsar). + +## Start Pulsar standalone + +1. Start Pulsar locally. + + ```bash + + bin/pulsar standalone + + ``` + + All the components of a Pulsar service are start in order. + + You can curl those pulsar service endpoints to make sure Pulsar service is up running correctly. + +2. Check Pulsar binary protocol port. + + ```bash + + telnet localhost 6650 + + ``` + +3. Check Pulsar Function cluster. 
+ + ```bash + + curl -s http://localhost:8080/admin/v2/worker/cluster + + ``` + + **Example output** + + ```json + + [{"workerId":"c-standalone-fw-localhost-6750","workerHostname":"localhost","port":6750}] + + ``` + +4. Make sure a public tenant and a default namespace exist. + + ```bash + + curl -s http://localhost:8080/admin/v2/namespaces/public + + ``` + + **Example output** + + ```json + + ["public/default","public/functions"] + + ``` + +5. All built-in connectors should be listed as available. + + ```bash + + curl -s http://localhost:8080/admin/v2/functions/connectors + + ``` + + **Example output** + + ```json + + [{"name":"aerospike","description":"Aerospike database sink","sinkClass":"org.apache.pulsar.io.aerospike.AerospikeStringSink"},{"name":"cassandra","description":"Writes data into Cassandra","sinkClass":"org.apache.pulsar.io.cassandra.CassandraStringSink"},{"name":"kafka","description":"Kafka source and sink connector","sourceClass":"org.apache.pulsar.io.kafka.KafkaStringSource","sinkClass":"org.apache.pulsar.io.kafka.KafkaBytesSink"},{"name":"kinesis","description":"Kinesis sink connector","sinkClass":"org.apache.pulsar.io.kinesis.KinesisSink"},{"name":"rabbitmq","description":"RabbitMQ source connector","sourceClass":"org.apache.pulsar.io.rabbitmq.RabbitMQSource"},{"name":"twitter","description":"Ingest data from Twitter firehose","sourceClass":"org.apache.pulsar.io.twitter.TwitterFireHose"}] + + ``` + + If an error occurs when starting Pulsar service, you may see an exception at the terminal running `pulsar/standalone`, + or you can navigate to the `logs` directory under the Pulsar directory to view the logs. + +## Connect Pulsar to Cassandra + +This section demonstrates how to connect Pulsar to Cassandra. + +:::tip + +* Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). 
+* The Cassandra sink connector reads messages from Pulsar topics and writes the messages into Cassandra tables. For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +::: + +### Setup a Cassandra cluster + +This example uses `cassandra` Docker image to start a single-node Cassandra cluster in Docker. + +1. Start a Cassandra cluster. + + ```bash + + docker run -d --rm --name=cassandra -p 9042:9042 cassandra + + ``` + + :::note + + Before moving to the next steps, make sure the Cassandra cluster is running. + + ::: + +2. Make sure the Docker process is running. + + ```bash + + docker ps + + ``` + +3. Check the Cassandra logs to make sure the Cassandra process is running as expected. + + ```bash + + docker logs cassandra + + ``` + +4. Check the status of the Cassandra cluster. + + ```bash + + docker exec cassandra nodetool status + + ``` + + **Example output** + + ``` + + Datacenter: datacenter1 + ======================= + Status=Up/Down + |/ State=Normal/Leaving/Joining/Moving + -- Address Load Tokens Owns (effective) Host ID Rack + UN 172.17.0.2 103.67 KiB 256 100.0% af0e4b2f-84e0-4f0b-bb14-bd5f9070ff26 rack1 + + ``` + +5. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + + $ docker exec -ti cassandra cqlsh localhost + Connected to Test Cluster at localhost:9042. + [cqlsh 5.0.1 | Cassandra 3.11.2 | CQL spec 3.4.4 | Native protocol v4] + Use HELP for help. + cqlsh> + + ``` + +6. Create a keyspace `pulsar_test_keyspace`. + + ```bash + + cqlsh> CREATE KEYSPACE pulsar_test_keyspace WITH replication = {'class':'SimpleStrategy', 'replication_factor':1}; + + ``` + +7. Create a table `pulsar_test_table`. + + ```bash + + cqlsh> USE pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> CREATE TABLE pulsar_test_table (key text PRIMARY KEY, col text); + + ``` + +### Configure a Cassandra sink + +Now that we have a Cassandra cluster running locally. + +In this section, you need to configure a Cassandra sink connector. 
+ +To run a Cassandra sink connector, you need to prepare a configuration file including the information that Pulsar connector runtime needs to know. + +For example, how Pulsar connector can find the Cassandra cluster, what is the keyspace and the table that Pulsar connector uses for writing Pulsar messages to, and so on. + +You can create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + + ``` + +* YAML + + ```yaml + + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + + ``` + +For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +### Create a Cassandra sink + +You can use the [Connector Admin CLI](io-cli.md) +to create a sink connector and perform other operations on them. + +Run the following command to create a Cassandra sink connector with sink type _cassandra_ and the config file _examples/cassandra-sink.yml_ created previously. + +#### Note +> The `sink-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. + +```bash + +bin/pulsar-admin sinks create \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink \ + --sink-type cassandra \ + --sink-config-file examples/cassandra-sink.yml \ + --inputs test_cassandra + +``` + +Once the command is executed, Pulsar creates the sink connector _cassandra-test-sink_. + +This sink connector runs +as a Pulsar Function and writes the messages produced in the topic _test_cassandra_ to the Cassandra table _pulsar_test_table_. + +### Inspect a Cassandra sink + +You can use the [Connector Admin CLI](io-cli.md) +to monitor a connector and perform other operations on it. + +* Get the information of a Cassandra sink. 
+ + ```bash + + bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + **Example output** + + ```json + + { + "tenant": "public", + "namespace": "default", + "name": "cassandra-test-sink", + "className": "org.apache.pulsar.io.cassandra.CassandraStringSink", + "inputSpecs": { + "test_cassandra": { + "isRegexPattern": false + } + }, + "configs": { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true, + "archive": "builtin://cassandra" + } + + ``` + +* Check the status of a Cassandra sink. + + ```bash + + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + **Example output** + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +### Verify a Cassandra sink + +1. Produce some messages to the input topic of the Cassandra sink _test_cassandra_. + + ```bash + + for i in {0..9}; do bin/pulsar-client produce -m "key-$i" -n 1 test_cassandra; done + + ``` + +2. Inspect the status of the Cassandra sink _test_cassandra_. + + ```bash + + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + You can see 10 messages are processed by the Cassandra sink _test_cassandra_. 
+ + **Example output** + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 10, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 10, + "lastReceivedTime" : 1551685489136, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +3. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + + docker exec -ti cassandra cqlsh localhost + + ``` + +4. Check the data of the Cassandra table _pulsar_test_table_. + + ```bash + + cqlsh> use pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> select * from pulsar_test_table; + + key | col + --------+-------- + key-5 | key-5 + key-0 | key-0 + key-9 | key-9 + key-2 | key-2 + key-1 | key-1 + key-3 | key-3 + key-6 | key-6 + key-7 | key-7 + key-4 | key-4 + key-8 | key-8 + + ``` + +### Delete a Cassandra Sink + +You can use the [Connector Admin CLI](io-cli.md) +to delete a connector and perform other operations on it. + +```bash + +bin/pulsar-admin sinks delete \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + +``` + +## Connect Pulsar to PostgreSQL + +This section demonstrates how to connect Pulsar to PostgreSQL. + +:::tip + +* Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). +* The JDBC sink connector pulls messages from Pulsar topics and persists the messages to ClickHouse, MariaDB, PostgreSQL, or SQlite. + +::: + +>For more information, see [JDBC sink connector](io-jdbc-sink.md). + + +### Setup a PostgreSQL cluster + +This example uses the PostgreSQL 12 docker image to start a single-node PostgreSQL cluster in Docker. + +1. Pull the PostgreSQL 12 image from Docker. + + ```bash + + $ docker pull postgres:12 + + ``` + +2. Start PostgreSQL. 
+ + ```bash + + $ docker run -d -it --rm \ + --name pulsar-postgres \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=password \ + -e POSTGRES_USER=postgres \ + postgres:12 + + ``` + + #### Tip + + Flag | Description | This example + ---|---|---| + `-d` | To start a container in detached mode. | / + `-it` | Keep STDIN open even if not attached and allocate a terminal. | / + `--rm` | Remove the container automatically when it exits. | / + `--name` | Assign a name to the container. | This example specifies _pulsar-postgres_ for the container. + `-p` | Publish the port of the container to the host. | This example publishes the port _5432_ of the container to the host. + `-e` | Set environment variables. | This example sets the following variables:
    - The password for the user is _password_.
    - The name for the user is _postgres_. + + :::tip + + For more information about Docker commands, see [Docker CLI](https://docs.docker.com/engine/reference/commandline/run/). + + ::: + +3. Check if PostgreSQL has been started successfully. + + ```bash + + $ docker logs -f pulsar-postgres + + ``` + + PostgreSQL has been started successfully if the following message appears. + + ```text + + 2020-05-11 20:09:24.492 UTC [1] LOG: starting PostgreSQL 12.2 (Debian 12.2-2.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit + 2020-05-11 20:09:24.492 UTC [1] LOG: listening on IPv4 address "0.0.0.0", port 5432 + 2020-05-11 20:09:24.492 UTC [1] LOG: listening on IPv6 address "::", port 5432 + 2020-05-11 20:09:24.499 UTC [1] LOG: listening on Unix socket "/var/run/postgresql/.s.PGSQL.5432" + 2020-05-11 20:09:24.523 UTC [55] LOG: database system was shut down at 2020-05-11 20:09:24 UTC + 2020-05-11 20:09:24.533 UTC [1] LOG: database system is ready to accept connections + + ``` + +4. Access to PostgreSQL. + + ```bash + + $ docker exec -it pulsar-postgres /bin/bash + + ``` + +5. Create a PostgreSQL table _pulsar_postgres_jdbc_sink_. + + ```bash + + $ psql -U postgres postgres + + postgres=# create table if not exists pulsar_postgres_jdbc_sink + ( + id serial PRIMARY KEY, + name VARCHAR(255) NOT NULL + ); + + ``` + +### Configure a JDBC sink + +Now we have a PostgreSQL running locally. + +In this section, you need to configure a JDBC sink connector. + +1. Add a configuration file. + + To run a JDBC sink connector, you need to prepare a YAML configuration file including the information that Pulsar connector runtime needs to know. + + For example, how Pulsar connector can find the PostgreSQL cluster, what is the JDBC URL and the table that Pulsar connector uses for writing messages to. + + Create a _pulsar-postgres-jdbc-sink.yaml_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. 
+ + ```yaml + + configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink" + tableName: "pulsar_postgres_jdbc_sink" + + ``` + +2. Create a schema. + + Create a _avro-schema_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. + + ```json + + { + "type": "AVRO", + "schema": "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}", + "properties": {} + } + + ``` + + :::tip + + For more information about AVRO, see [Apache Avro](https://avro.apache.org/docs/1.9.1/). + + ::: + +3. Upload a schema to a topic. + + This example uploads the _avro-schema_ schema to the _pulsar-postgres-jdbc-sink-topic_ topic. + + ```bash + + $ bin/pulsar-admin schemas upload pulsar-postgres-jdbc-sink-topic -f ./connectors/avro-schema + + ``` + +4. Check if the schema has been uploaded successfully. + + ```bash + + $ bin/pulsar-admin schemas get pulsar-postgres-jdbc-sink-topic + + ``` + + The schema has been uploaded successfully if the following message appears. + + ```json + + {"name":"pulsar-postgres-jdbc-sink-topic","schema":"{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}","type":"AVRO","properties":{}} + + ``` + +### Create a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to create a sink connector and perform other operations on it. + +This example creates a sink connector and specifies the desired information. 
+ +```bash + +$ bin/pulsar-admin sinks create \ +--archive ./connectors/pulsar-io-jdbc-postgres-@pulsar:version@.nar \ +--inputs pulsar-postgres-jdbc-sink-topic \ +--name pulsar-postgres-jdbc-sink \ +--sink-config-file ./connectors/pulsar-postgres-jdbc-sink.yaml \ +--parallelism 1 + +``` + +Once the command is executed, Pulsar creates a sink connector _pulsar-postgres-jdbc-sink_. + +This sink connector runs as a Pulsar Function and writes the messages produced in the topic _pulsar-postgres-jdbc-sink-topic_ to the PostgreSQL table _pulsar_postgres_jdbc_sink_. + + #### Tip + + Flag | Description | This example + ---|---|---| + `--archive` | The path to the archive file for the sink. | _pulsar-io-jdbc-postgres-@pulsar:version@.nar_ | + `--inputs` | The input topic(s) of the sink.

    Multiple topics can be specified as a comma-separated list.|| + `--name` | The name of the sink. | _pulsar-postgres-jdbc-sink_ | + `--sink-config-file` | The path to a YAML config file specifying the configuration of the sink. | _pulsar-postgres-jdbc-sink.yaml_ | + `--parallelism` | The parallelism factor of the sink.

    For example, the number of sink instances to run. | _1_ | + +:::tip + +For more information about `pulsar-admin sinks create options`, see [here](io-cli.md#sinks). + +::: + +The sink has been created successfully if the following message appears. + +```bash + +"Created successfully" + +``` + +### Inspect a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to monitor a connector and perform other operations on it. + +* List all running JDBC sink(s). + + ```bash + + $ bin/pulsar-admin sinks list \ + --tenant public \ + --namespace default + + ``` + + :::tip + + For more information about `pulsar-admin sinks list options`, see [here](io-cli.md/#list-1). + + ::: + + The result shows that only the _pulsar-postgres-jdbc-sink_ sink is running. + + ```json + + [ + "pulsar-postgres-jdbc-sink" + ] + + ``` + +* Get the information of a JDBC sink. + + ```bash + + $ bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name pulsar-postgres-jdbc-sink + + ``` + + :::tip + + For more information about `pulsar-admin sinks get options`, see [here](io-cli.md/#get-1). + + ::: + + The result shows the information of the sink connector, including tenant, namespace, topic and so on. 
+ + ```json + + { + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true + } + + ``` + +* Get the status of a JDBC sink. + + ```bash + + $ bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name pulsar-postgres-jdbc-sink + + ``` + + :::tip + + For more information about `pulsar-admin sinks status options`, see [here](io-cli.md/#status-1). + + ::: + + The result shows the current status of the sink connector, including the number of instances, running status, worker ID and so on. + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-192.168.2.52-8080" + } + } ] + } + + ``` + +### Stop a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to stop a connector and perform other operations on it. + +```bash + +$ bin/pulsar-admin sinks stop \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks stop options`, see [here](io-cli.md/#stop-1). + +::: + +The sink instance has been stopped successfully if the following message appears. 
+ +```bash + +"Stopped successfully" + +``` + +### Restart a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to restart a connector and perform other operations on it. + +```bash + +$ bin/pulsar-admin sinks restart \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks restart options`, see [here](io-cli.md/#restart-1). + +::: + +The sink instance has been started successfully if the following message appears. + +```bash + +"Started successfully" + +``` + +:::tip + +* Optionally, you can run a standalone sink connector using `pulsar-admin sinks localrun options`. +Note that `pulsar-admin sinks localrun options` **runs a sink connector locally**, while `pulsar-admin sinks start options` **starts a sink connector in a cluster**. +* For more information about `pulsar-admin sinks localrun options`, see [here](io-cli.md#localrun-1). + +::: + +### Update a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to update a connector and perform other operations on it. + +This example updates the parallelism of the _pulsar-postgres-jdbc-sink_ sink connector to 2. + +```bash + +$ bin/pulsar-admin sinks update \ +--name pulsar-postgres-jdbc-sink \ +--parallelism 2 + +``` + +:::tip + +For more information about `pulsar-admin sinks update options`, see [here](io-cli.md/#update-1). + +::: + +The sink connector has been updated successfully if the following message appears. + +```bash + +"Updated successfully" + +``` + +This example double-checks the information. + +```bash + +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +The result shows that the parallelism is 2. 
+ +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 2, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +### Delete a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to delete a connector and perform other operations on it. + +This example deletes the _pulsar-postgres-jdbc-sink_ sink connector. + +```bash + +$ bin/pulsar-admin sinks delete \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks delete options`, see [here](io-cli.md/#delete-1). + +::: + +The sink connector has been deleted successfully if the following message appears. + +```text + +"Deleted successfully" + +``` + +This example double-checks the status of the sink connector. + +```bash + +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +The result shows that the sink connector does not exist. 
+ +```text + +HTTP 404 Not Found + +Reason: Sink pulsar-postgres-jdbc-sink doesn't exist + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-rabbitmq-sink.md b/site2/website/versioned_docs/version-2.8.x/io-rabbitmq-sink.md new file mode 100644 index 0000000000000..d7fda99460dc9 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-rabbitmq-sink.md @@ -0,0 +1,85 @@ +--- +id: io-rabbitmq-sink +title: RabbitMQ sink connector +sidebar_label: "RabbitMQ sink connector" +original_id: io-rabbitmq-sink +--- + +The RabbitMQ sink connector pulls messages from Pulsar topics +and persist the messages to RabbitMQ queues. + + +## Configuration + +The configuration of the RabbitMQ sink connector has the following properties. + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name. | +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `exchangeName` | String|true | " " (empty string) | The exchange to publish messages. | +| `routingKey` |String|true | " " (empty string) |The routing key used to publish messages. | + + +### Example + +Before using the RabbitMQ sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "exchangeName": "test-exchange", + "routingKey": "test-key" + } + + ``` + +* YAML + + ```yaml + + configs: + host: "localhost" + port: 5672 + virtualHost: "/" + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + exchangeName: "test-exchange" + routingKey: "test-key" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-rabbitmq-source.md b/site2/website/versioned_docs/version-2.8.x/io-rabbitmq-source.md new file mode 100644 index 0000000000000..c2c31cc97d10d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-rabbitmq-source.md @@ -0,0 +1,85 @@ +--- +id: io-rabbitmq-source +title: RabbitMQ source connector +sidebar_label: "RabbitMQ source connector" +original_id: io-rabbitmq-source +--- + +The RabbitMQ source connector receives messages from RabbitMQ clusters +and writes messages to Pulsar topics. + +## Configuration + +The configuration of the RabbitMQ source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name. 
| +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `prefetchCount` | int|false | 0 | The maximum number of messages that the server delivers.

    0 means unlimited. | +| `prefetchGlobal` | boolean|false | false |Whether the setting should be applied to the entire channel rather than each consumer. | +| `passive` | boolean|false | false | Whether the rabbitmq consumer should create its own queue or bind to an existing one. | + +### Example + +Before using the RabbitMQ source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "prefetchCount": "0", + "prefetchGlobal": "false", + "passive": "false" + } + + ``` + +* YAML + + ```yaml + + configs: + host: "localhost" + port: 5672 + virtualHost: "/" + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + prefetchCount: 0 + prefetchGlobal: "false" + passive: "false" + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-redis-sink.md b/site2/website/versioned_docs/version-2.8.x/io-redis-sink.md new file mode 100644 index 0000000000000..793d74a5f2cb3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-redis-sink.md @@ -0,0 +1,74 @@ +--- +id: io-redis-sink +title: Redis sink connector +sidebar_label: "Redis sink connector" +original_id: io-redis-sink +--- + +The Redis sink connector pulls messages from Pulsar topics +and persists the messages to a Redis database. + + + +## Configuration + +The configuration of the Redis sink connector has the following properties. 
+ + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `redisHosts` |String|true|" " (empty string) | A comma-separated list of Redis hosts to connect to. | +| `redisPassword` |String|false|" " (empty string) | The password used to connect to Redis. | +| `redisDatabase` | int|true|0 | The Redis database to connect to. | +| `clientMode` |String| false|Standalone | The client mode when interacting with Redis cluster.

    Below are the available options:
  • Standalone
  • Cluster
  • | +| `autoReconnect` | boolean|false|true | Whether the Redis client automatically reconnect or not. | +| `requestQueue` | int|false|2147483647 | The maximum number of queued requests to Redis. | +| `tcpNoDelay` |boolean| false| false | Whether to enable TCP with no delay or not. | +| `keepAlive` | boolean|false | false |Whether to enable a keepalive to Redis or not. | +| `connectTimeout` |long| false|10000 | The time to wait before timing out when connecting in milliseconds. | +| `operationTimeout` | long|false|10000 | The time before an operation is marked as timed out in milliseconds . | +| `batchTimeMs` | int|false|1000 | The Redis operation time in milliseconds. | +| `batchSize` | int|false|200 | The batch size of writing to Redis database. | + + +### Example + +Before using the Redis sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "redisHosts": "localhost:6379", + "redisPassword": "fake@123", + "redisDatabase": "1", + "clientMode": "Standalone", + "operationTimeout": "2000", + "batchSize": "100", + "batchTimeMs": "1000", + "connectTimeout": "3000" + } + + ``` + +* YAML + + ```yaml + + { + redisHosts: "localhost:6379" + redisPassword: "fake@123" + redisDatabase: 1 + clientMode: "Standalone" + operationTimeout: 2000 + batchSize: 100 + batchTimeMs: 1000 + connectTimeout: 3000 + } + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-solr-sink.md b/site2/website/versioned_docs/version-2.8.x/io-solr-sink.md new file mode 100644 index 0000000000000..df2c3612c38eb --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-solr-sink.md @@ -0,0 +1,65 @@ +--- +id: io-solr-sink +title: Solr sink connector +sidebar_label: "Solr sink connector" +original_id: io-solr-sink +--- + +The Solr sink connector pulls messages from Pulsar topics +and persists the messages to Solr collections. 
+ + + +## Configuration + +The configuration of the Solr sink connector has the following properties. + + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `solrUrl` | String|true|" " (empty string) |
  • Comma-separated zookeeper hosts with chroot used in the SolrCloud mode.
    **Example**
    `localhost:2181,localhost:2182/chroot`

  • URL to connect to Solr used in standalone mode.
    **Example**
    `localhost:8983/solr`
  • | +| `solrMode` | String|true|SolrCloud| The client mode when interacting with the Solr cluster.

    Below are the available options:
  • Standalone
  • SolrCloud
  • | +| `solrCollection` |String|true| " " (empty string) | Solr collection name to which records need to be written. | +| `solrCommitWithinMs` |int| false|10 | The time, in milliseconds, within which Solr commits updates.| +| `username` |String|false| " " (empty string) | The username for basic authentication.

    **Note: `username` is case-sensitive.** | +| `password` | String|false| " " (empty string) | The password for basic authentication.

    **Note: `password` is case-sensitive.** | + + + +### Example + +Before using the Solr sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "solrUrl": "localhost:2181,localhost:2182/chroot", + "solrMode": "SolrCloud", + "solrCollection": "techproducts", + "solrCommitWithinMs": 100, + "username": "fakeuser", + "password": "fake@123" + } + + ``` + +* YAML + + ```yaml + + { + solrUrl: "localhost:2181,localhost:2182/chroot" + solrMode: "SolrCloud" + solrCollection: "techproducts" + solrCommitWithinMs: 100 + username: "fakeuser" + password: "fake@123" + } + + ``` + diff --git a/site2/website/versioned_docs/version-2.8.x/io-twitter-source.md b/site2/website/versioned_docs/version-2.8.x/io-twitter-source.md new file mode 100644 index 0000000000000..8de3504dd0fef --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-twitter-source.md @@ -0,0 +1,28 @@ +--- +id: io-twitter-source +title: Twitter Firehose source connector +sidebar_label: "Twitter Firehose source connector" +original_id: io-twitter-source +--- + +The Twitter Firehose source connector receives tweets from Twitter Firehose and +writes the tweets to Pulsar topics. + +## Configuration + +The configuration of the Twitter Firehose source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `consumerKey` | String|true | " " (empty string) | The twitter OAuth consumer key.

    For more information, see [Access tokens](https://developer.twitter.com/en/docs/basics/authentication/guides/access-tokens). | +| `consumerSecret` | String |true | " " (empty string) | The twitter OAuth consumer secret. | +| `token` | String|true | " " (empty string) | The twitter OAuth token. | +| `tokenSecret` | String|true | " " (empty string) | The twitter OAuth secret. | +| `guestimateTweetTime`|Boolean|false|false|Most firehose events have null createdAt time.

    If `guestimateTweetTime` set to true, the connector estimates the createdTime of each firehose event to be current time. +| `clientName` | String |false | openconnector-twitter-source| The twitter firehose client name. | +| `clientHosts` |String| false | Constants.STREAM_HOST | The twitter firehose hosts to which client connects. | +| `clientBufferSize` | int|false | 50000 | The buffer size for buffering tweets fetched from twitter firehose. | + +> For more information about OAuth credentials, see [Twitter developers portal](https://developer.twitter.com/en.html). diff --git a/site2/website/versioned_docs/version-2.8.x/io-twitter.md b/site2/website/versioned_docs/version-2.8.x/io-twitter.md new file mode 100644 index 0000000000000..3b2f6325453c3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-twitter.md @@ -0,0 +1,7 @@ +--- +id: io-twitter +title: Twitter Firehose Connector +sidebar_label: "Twitter Firehose Connector" +original_id: io-twitter +--- + diff --git a/site2/website/versioned_docs/version-2.8.x/io-use.md b/site2/website/versioned_docs/version-2.8.x/io-use.md new file mode 100644 index 0000000000000..da9ed746c4d37 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/io-use.md @@ -0,0 +1,1787 @@ +--- +id: io-use +title: How to use Pulsar connectors +sidebar_label: "Use" +original_id: io-use +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide describes how to use Pulsar connectors. + +## Install a connector + +Pulsar bundles several [builtin connectors](io-connectors.md) used to move data in and out of commonly used systems (such as database and messaging system). Optionally, you can create and use your desired non-builtin connectors. + +:::note + +When using a non-builtin connector, you need to specify the path of a archive file for the connector. 
+ +::: + +To set up a builtin connector, follow +the instructions [here](getting-started-standalone.md#installing-builtin-connectors). + +After the setup, the builtin connector is automatically discovered by Pulsar brokers (or function-workers), so no additional installation steps are required. + +## Configure a connector + +You can configure the following information: + +* [Configure a default storage location for a connector](#configure-a-default-storage-location-for-a-connector) + +* [Configure a connector with a YAML file](#configure-a-connector-with-a-yaml-file) + +### Configure a default storage location for a connector + +To configure a default folder for builtin connectors, set the `connectorsDirectory` parameter in the `./conf/functions_worker.yml` configuration file. + +**Example** + +Set the `./connectors` folder as the default storage location for builtin connectors. + +``` + +######################## +# Connectors +######################## + +connectorsDirectory: ./connectors + +``` + +### Configure a connector with a YAML file + +To configure a connector, you need to provide a YAML configuration file when creating a connector. + +The YAML configuration file tells Pulsar where to locate connectors and how to connect connectors with Pulsar topics. + +**Example 1** + +Below is a YAML configuration file of a Cassandra sink, which tells Pulsar: + +* Which Cassandra cluster to connect to + +* What is the `keyspace` and `columnFamily` to be used in Cassandra for collecting data + +* How to map Pulsar messages into Cassandra table key and columns + +```shell + +tenant: public +namespace: default +name: cassandra-test-sink +... +# cassandra specific config +configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + +``` + +**Example 2** + +Below is a YAML configuration file of a Kafka source. 
+ +```shell + +configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: "false" + +``` + +**Example 3** + +Below is a YAML configuration file of a PostgreSQL JDBC sink. + +```shell + +configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/test_jdbc" + tableName: "test_jdbc" + +``` + +## Get available connectors + +Before you start using connectors, you can perform the following operations: + +* [Reload connectors](#reload) + +* [Get a list of available connectors](#available) + +### `reload` + +If you add or delete a nar file in a connector folder, reload the available builtin connectors before using them. + +#### Source + +Use the `reload` subcommand. + +```shell + +$ pulsar-admin sources reload + +``` + +For more information, see [`here`](io-cli.md#reload). + +#### Sink + +Use the `reload` subcommand. + +```shell + +$ pulsar-admin sinks reload + +``` + +For more information, see [`here`](io-cli.md#reload-1). + +### `available` + +After reloading connectors (optional), you can get a list of available connectors. + +#### Source + +Use the `available-sources` subcommand. + +```shell + +$ pulsar-admin sources available-sources + +``` + +#### Sink + +Use the `available-sinks` subcommand. + +```shell + +$ pulsar-admin sinks available-sinks + +``` + +## Run a connector + +To run a connector, you can perform the following operations: + +* [Create a connector](#create) + +* [Start a connector](#start) + +* [Run a connector locally](#localrun) + +### `create` + +You can create a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Create a source connector. + +````mdx-code-block + + + + +Use the `create` subcommand. + +``` + +$ pulsar-admin sources create options + +``` + +For more information, see [here](io-cli.md#create).
+ + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/registerSource?version=@pulsar:version_number@} + + + + +* Create a source connector with a **local file**. + + ```java + + void createSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + |Name|Description + |---|--- + `sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#createSource-SourceConfig-java.lang.String-). + +* Create a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void createSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sourceConfig` | The source configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSourceWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#createSourceWithUrl-SourceConfig-java.lang.String-). + + + + +```` + +#### Sink + +Create a sink connector. + +````mdx-code-block + + + + +Use the `create` subcommand. + +``` + +$ pulsar-admin sinks create options + +``` + +For more information, see [here](io-cli.md#create-1). + + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/registerSink?version=@pulsar:version_number@} + + + + +* Create a sink connector with a **local file**. 
+ + ```java + + void createSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + |Name|Description + |---|--- + `sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#createSink-SinkConfig-java.lang.String-). + +* Create a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void createSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sinkConfig` | The sink configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSinkWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#createSinkWithUrl-SinkConfig-java.lang.String-). + + + + +```` + +### `start` + +You can start a connector using **Admin CLI** or **REST API**. + +#### Source + +Start a source connector. + +````mdx-code-block + + + + +Use the `start` subcommand. + +``` + +$ pulsar-admin sources start options + +``` + +For more information, see [here](io-cli.md#start). + + + + +* Start **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/start|operation/startSource?version=@pulsar:version_number@} + +* Start a **specified** source connector. 
+ + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/start|operation/startSource?version=@pulsar:version_number@} + + + + +```` + +#### Sink + +Start a sink connector. + +````mdx-code-block + + + + +Use the `start` subcommand. + +``` + +$ pulsar-admin sinks start options + +``` + +For more information, see [here](io-cli.md#start-1). + + + + +* Start **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/start|operation/startSink?version=@pulsar:version_number@} + +* Start a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/start|operation/startSink?version=@pulsar:version_number@} + + + + +```` + +### `localrun` + +You can run a connector locally rather than deploying it on a Pulsar cluster using **Admin CLI**. + +#### Source + +Run a source connector locally. + +````mdx-code-block + + + + +Use the `localrun` subcommand. + +``` + +$ pulsar-admin sources localrun options + +``` + +For more information, see [here](io-cli.md#localrun). + + + + +```` + +#### Sink + +Run a sink connector locally. + +````mdx-code-block + + + + +Use the `localrun` subcommand. + +``` + +$ pulsar-admin sinks localrun options + +``` + +For more information, see [here](io-cli.md#localrun-1). + + + + +```` + +## Monitor a connector + +To monitor a connector, you can perform the following operations: + +* [Get the information of a connector](#get) + +* [Get the list of all running connectors](#list) + +* [Get the current status of a connector](#status) + +### `get` + +You can get the information of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the information of a source connector. + +````mdx-code-block + + + + +Use the `get` subcommand.
+ +``` + +$ pulsar-admin sources get options + +``` + +For more information, see [here](io-cli.md#get). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/getSourceInfo?version=@pulsar:version_number@} + + + + +```java + +SourceConfig getSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + +``` + +**Example** + +This is a sourceConfig. + +```java + +{ + "tenant": "tenantName", + "namespace": "namespaceName", + "name": "sourceName", + "className": "className", + "topicName": "topicName", + "configs": {}, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} + +``` + +This is a sourceConfig example. + +``` + +{ + "tenant": "public", + "namespace": "default", + "name": "debezium-mysql-source", + "className": "org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource", + "topicName": "debezium-mysql-topic", + "configs": { + "database.user": "debezium", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.port": "3306", + "database.hostname": "localhost", + "database.password": "dbz", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.whitelist": "inventory", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "database.history.pulsar.topic": "history-topic2" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException.NotFoundException` | 
Cluster doesn't exist +`PulsarAdminException` | Unexpected error + +For more information, see [`getSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSource-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +#### Sink + +Get the information of a sink connector. + +````mdx-code-block + + + + +Use the `get` subcommand. + +``` + +$ pulsar-admin sinks get options + +``` + +For more information, see [here](io-cli.md#get-1). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/getSinkInfo?version=@pulsar:version_number@} + + + + +```java + +SinkConfig getSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + +``` + +**Example** + +This is a sinkConfig. + +```json + +{ +"tenant": "tenantName", +"namespace": "namespaceName", +"name": "sinkName", +"className": "className", +"inputSpecs": { +"topicName": { + "isRegexPattern": false +} +}, +"configs": {}, +"parallelism": 1, +"processingGuarantees": "ATLEAST_ONCE", +"retainOrdering": false, +"autoAck": true +} + +``` + +This is a sinkConfig example. 
+ +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +**Parameter description** + +Name| Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +For more information, see [`getSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSink-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +### `list` + +You can get the list of all running connectors using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the list of all running source connectors. + +````mdx-code-block + + + + +Use the `list` subcommand. + +``` + +$ pulsar-admin sources list options + +``` + +For more information, see [here](io-cli.md#list). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace|operation/listSources?version=@pulsar:version_number@} + + + + +```java + +List listSources(String tenant, + String namespace) + throws PulsarAdminException + +``` + +**Response example** + +```java + +["f1", "f2", "f3"] + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#listSources-java.lang.String-java.lang.String-). 
+ + + + +```` + +#### Sink + +Get the list of all running sink connectors. + +````mdx-code-block + + + + +Use the `list` subcommand. + +``` + +$ pulsar-admin sinks list options + +``` + +For more information, see [here](io-cli.md#list-1). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace|operation/listSinks?version=@pulsar:version_number@} + + + + +```java + +List listSinks(String tenant, + String namespace) + throws PulsarAdminException + +``` + +**Response example** + +```java + +["f1", "f2", "f3"] + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#listSinks-java.lang.String-java.lang.String-). + + + + +```` + +### `status` + +You can get the current status of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the current status of a source connector. + +````mdx-code-block + + + + +Use the `status` subcommand. + +``` + +$ pulsar-admin sources status options + +``` + +For more information, see [here](io-cli.md#status). + + + + +* Get the current status of **all** source connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/status|operation/getSourceStatus?version=@pulsar:version_number@} + +* Gets the current status of a **specified** source connector. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/status|operation/getSourceStatus?version=@pulsar:version_number@} + + + + +* Get the current status of **all** source connectors. 
+ + ```java + + SourceStatus getSourceStatus(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Source name + + **Exception** + + Name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSource-java.lang.String-java.lang.String-java.lang.String-). + +* Gets the current status of a **specified** source connector. + + ```java + + SourceStatus.SourceInstanceStatus.SourceInstanceStatusData getSourceStatus(String tenant, + String namespace, + String source, + int id) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Source name + `id` | Source instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSourceStatus-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Get the current status of a Pulsar sink connector. + +````mdx-code-block + + + + +Use the `status` subcommand. + +``` + +$ pulsar-admin sinks status options + +``` + +For more information, see [here](io-cli.md#status-1). + + + + +* Get the current status of **all** sink connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName/status|operation/getSinkStatus?version=@pulsar:version_number@} + +* Gets the current status of a **specified** sink connector. 
+ + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/status|operation/getSinkInstanceStatus?version=@pulsar:version_number@} + + + + +* Get the current status of **all** sink connectors. + + ```java + + SinkStatus getSinkStatus(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-). + +* Get the current status of a **specified** sink connector. + + ```java + + SinkStatus.SinkInstanceStatus.SinkInstanceStatusData getSinkStatus(String tenant, + String namespace, + String sink, + int id) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `id` | Sink instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatusWithInstanceID`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Update a connector + +### `update` + +You can update a running connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Update a running Pulsar source connector. + +````mdx-code-block + + + + +Use the `update` subcommand. + +``` + +$ pulsar-admin sources update options + +``` + +For more information, see [here](io-cli.md#update).
+ + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/updateSource?version=@pulsar:version_number@} + + + + +* Update a running source connector with a **local file**. + + ```java + + void updateSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#updateSource-SourceConfig-java.lang.String-). + +* Update a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void updateSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sourceConfig` | The source configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + +For more information, see [`createSourceWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#updateSourceWithUrl-SourceConfig-java.lang.String-). + + + + +```` + +#### Sink + +Update a running Pulsar sink connector. + +````mdx-code-block + + + + +Use the `update` subcommand. 
+ +``` + +$ pulsar-admin sinks update options + +``` + +For more information, see [here](io-cli.md#update-1). + + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/updateSink?version=@pulsar:version_number@} + + + + +* Update a running sink connector with a **local file**. + + ```java + + void updateSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSink-SinkConfig-java.lang.String-). + +* Update a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void updateSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sinkConfig` | The sink configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + |`PulsarAdminException.NotFoundException` | Cluster doesn't exist + |`PulsarAdminException` | Unexpected error + +For more information, see [`updateSinkWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSinkWithUrl-SinkConfig-java.lang.String-). 
+ + + + +```` + +## Stop a connector + +### `stop` + +You can stop a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Stop a source connector. + +````mdx-code-block + + + + +Use the `stop` subcommand. + +``` + +$ pulsar-admin sources stop options + +``` + +For more information, see [here](io-cli.md#stop). + + + + +* Stop **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/stop|operation/stopSource?version=@pulsar:version_number@} + +* Stop a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/stop|operation/stopSource?version=@pulsar:version_number@} + + + + +* Stop **all** source connectors. + + ```java + + void stopSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** source connector.
+ + ```java + + void stopSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Stop a sink connector. + +````mdx-code-block + + + + +Use the `stop` subcommand. + +``` + +$ pulsar-admin sinks stop options + +``` + +For more information, see [here](io-cli.md#stop-1). + + + + +* Stop **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/stop|operation/stopSink?version=@pulsar:version_number@} + +* Stop a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/stop|operation/stopSink?version=@pulsar:version_number@} + + + + +* Stop **all** sink connectors. + + ```java + + void stopSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** sink connector.
+ + ```java + + void stopSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Restart a connector + +### `restart` + +You can restart a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Restart a source connector. + +````mdx-code-block + + + + +Use the `restart` subcommand. + +``` + +$ pulsar-admin sources restart options + +``` + +For more information, see [here](io-cli.md#restart). + + + + +* Restart **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/restart|operation/restartSource?version=@pulsar:version_number@} + +* Restart a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/restart|operation/restartSource?version=@pulsar:version_number@} + + + + +* Restart **all** source connectors. 
+ + ```java + + void restartSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** source connector. + + ```java + + void restartSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Restart a sink connector. + +````mdx-code-block + + + + +Use the `restart` subcommand. + +``` + +$ pulsar-admin sinks restart options + +``` + +For more information, see [here](io-cli.md#restart-1). + + + + +* Restart **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/restart|operation/restartSink?version=@pulsar:version_number@} + +* Restart a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/restart|operation/restartSink?version=@pulsar:version_number@} + + + + +* Restart **all** sink connectors.
+ + ```java + + void restartSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** sink connector. + + ```java + + void restartSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `instanceId` | Sink instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Delete a connector + +### `delete` + +You can delete a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Delete a source connector. + +````mdx-code-block + + + + +Use the `delete` subcommand. + +``` + +$ pulsar-admin sources delete options + +``` + +For more information, see [here](io-cli.md#delete). + + + + +Delete a Pulsar source connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/deregisterSource?version=@pulsar:version_number@} + + + + +Delete a source connector.
+ +```java + +void deleteSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`source` | Source name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#deleteSource-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +#### Sink + +Delete a sink connector. + +````mdx-code-block + + + + +Use the `delete` subcommand. + +``` + +$ pulsar-admin sinks delete options + +``` + +For more information, see [here](io-cli.md#delete-1). + + + + +Delete a sink connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/deregisterSink?version=@pulsar:version_number@} + + + + +Delete a Pulsar sink connector. + +```java + +void deleteSink(String tenant, + String namespace, + String source) + throws PulsarAdminException + +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#deleteSink-java.lang.String-java.lang.String-java.lang.String-). 
+ + + + +```` diff --git a/site2/website/versioned_docs/version-2.8.x/kubernetes-helm.md b/site2/website/versioned_docs/version-2.8.x/kubernetes-helm.md new file mode 100644 index 0000000000000..ea92a0968cd7d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/kubernetes-helm.md @@ -0,0 +1,441 @@ +--- +id: kubernetes-helm +title: Get started in Kubernetes +sidebar_label: "Run Pulsar in Kubernetes" +original_id: kubernetes-helm +--- + +This section guides you through every step of installing and running Apache Pulsar with Helm on Kubernetes quickly, including the following sections: + +- Install the Apache Pulsar on Kubernetes using Helm +- Start and stop Apache Pulsar +- Create topics using `pulsar-admin` +- Produce and consume messages using Pulsar clients +- Monitor Apache Pulsar status with Prometheus and Grafana + +For deploying a Pulsar cluster for production usage, read the documentation on [how to configure and install a Pulsar Helm chart](helm-deploy.md). + +## Prerequisite + +- Kubernetes server 1.14.0+ +- kubectl 1.14.0+ +- Helm 3.0+ + +:::tip + +For the following steps, step 2 and step 3 are for **developers** and step 4 and step 5 are for **administrators**. + +::: + +## Step 0: Prepare a Kubernetes cluster + +Before installing a Pulsar Helm chart, you have to create a Kubernetes cluster. You can follow [the instructions](helm-prepare.md) to prepare a Kubernetes cluster. + +We use [Minikube](https://minikube.sigs.k8s.io/docs/start/) in this quick start guide. To prepare a Kubernetes cluster, follow these steps: + +1. Create a Kubernetes cluster on Minikube. + + ```bash + + minikube start --memory=8192 --cpus=4 --kubernetes-version= + + ``` + + The `` can be any [Kubernetes version supported by your Minikube installation](https://minikube.sigs.k8s.io/docs/reference/configuration/kubernetes/), such as `v1.16.1`. + +2. Set `kubectl` to use Minikube. + + ```bash + + kubectl config use-context minikube + + ``` + +3. 
To use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) with the local Kubernetes cluster on Minikube, enter the command below: + + ```bash + + minikube dashboard + + ``` + + The command automatically triggers opening a webpage in your browser. + +## Step 1: Install Pulsar Helm chart + +1. Add Pulsar charts repo. + + ```bash + + helm repo add apache https://pulsar.apache.org/charts + + ``` + + ```bash + + helm repo update + + ``` + +2. Clone the Pulsar Helm chart repository. + + ```bash + + git clone https://github.com/apache/pulsar-helm-chart + cd pulsar-helm-chart + + ``` + +3. Run the script `prepare_helm_release.sh` to create secrets required for installing the Apache Pulsar Helm chart. The username `pulsar` and password `pulsar` are used for logging into the Grafana dashboard and Pulsar Manager. + + ```bash + + ./scripts/pulsar/prepare_helm_release.sh \ + -n pulsar \ + -k pulsar-mini \ + -c + + ``` + +4. Use the Pulsar Helm chart to install a Pulsar cluster to Kubernetes. + + :::note + + You need to specify `--set initialize=true` when installing Pulsar the first time. This command installs and starts Apache Pulsar. + + ::: + + ```bash + + helm install \ + --values examples/values-minikube.yaml \ + --set initialize=true \ + --namespace pulsar \ + pulsar-mini apache/pulsar + + ``` + +5. Check the status of all pods. + + ```bash + + kubectl get pods -n pulsar + + ``` + + If all pods start up successfully, you can see that the `STATUS` is changed to `Running` or `Completed`. 
+ + **Output** + + ```bash + + NAME READY STATUS RESTARTS AGE + pulsar-mini-bookie-0 1/1 Running 0 9m27s + pulsar-mini-bookie-init-5gphs 0/1 Completed 0 9m27s + pulsar-mini-broker-0 1/1 Running 0 9m27s + pulsar-mini-grafana-6b7bcc64c7-4tkxd 1/1 Running 0 9m27s + pulsar-mini-prometheus-5fcf5dd84c-w8mgz 1/1 Running 0 9m27s + pulsar-mini-proxy-0 1/1 Running 0 9m27s + pulsar-mini-pulsar-init-t7cqt 0/1 Completed 0 9m27s + pulsar-mini-pulsar-manager-9bcbb4d9f-htpcs 1/1 Running 0 9m27s + pulsar-mini-toolset-0 1/1 Running 0 9m27s + pulsar-mini-zookeeper-0 1/1 Running 0 9m27s + + ``` + +6. Check the status of all services in the namespace `pulsar`. + + ```bash + + kubectl get services -n pulsar + + ``` + + **Output** + + ```bash + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + pulsar-mini-bookie ClusterIP None 3181/TCP,8000/TCP 11m + pulsar-mini-broker ClusterIP None 8080/TCP,6650/TCP 11m + pulsar-mini-grafana LoadBalancer 10.106.141.246 3000:31905/TCP 11m + pulsar-mini-prometheus ClusterIP None 9090/TCP 11m + pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 11m + pulsar-mini-pulsar-manager LoadBalancer 10.103.192.175 9527:30190/TCP 11m + pulsar-mini-toolset ClusterIP None 11m + pulsar-mini-zookeeper ClusterIP None 2888/TCP,3888/TCP,2181/TCP 11m + + ``` + +## Step 2: Use pulsar-admin to create Pulsar tenants/namespaces/topics + +`pulsar-admin` is the CLI (command-Line Interface) tool for Pulsar. In this step, you can use `pulsar-admin` to create resources, including tenants, namespaces, and topics. + +1. Enter the `toolset` container. + + ```bash + + kubectl exec -it -n pulsar pulsar-mini-toolset-0 -- /bin/bash + + ``` + +2. In the `toolset` container, create a tenant named `apache`. + + ```bash + + bin/pulsar-admin tenants create apache + + ``` + + Then you can list the tenants to see if the tenant is created successfully. + + ```bash + + bin/pulsar-admin tenants list + + ``` + + You should see a similar output as below. 
The tenant `apache` has been successfully created. + + ```bash + + "apache" + "public" + "pulsar" + + ``` + +3. In the `toolset` container, create a namespace named `pulsar` in the tenant `apache`. + + ```bash + + bin/pulsar-admin namespaces create apache/pulsar + + ``` + + Then you can list the namespaces of tenant `apache` to see if the namespace is created successfully. + + ```bash + + bin/pulsar-admin namespaces list apache + + ``` + + You should see a similar output as below. The namespace `apache/pulsar` has been successfully created. + + ```bash + + "apache/pulsar" + + ``` + +4. In the `toolset` container, create a topic `test-topic` with `4` partitions in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics create-partitioned-topic apache/pulsar/test-topic -p 4 + + ``` + +5. In the `toolset` container, list all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics list-partitioned-topics apache/pulsar + + ``` + + Then you can see all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + "persistent://apache/pulsar/test-topic" + + ``` + +## Step 3: Use Pulsar client to produce and consume messages + +You can use the Pulsar client to create producers and consumers to produce and consume messages. + +By default, the Pulsar Helm chart exposes the Pulsar cluster through a Kubernetes `LoadBalancer`. In Minikube, you can use the following command to check the proxy service. + +```bash + +kubectl get services -n pulsar | grep pulsar-mini-proxy + +``` + +You will see a similar output as below. + +```bash + +pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 28m + +``` + +This output tells what are the node ports that Pulsar cluster's binary port and HTTP port are mapped to. The port after `80:` is the HTTP port while the port after `6650:` is the binary port. 
+ +Then you can find the IP address and exposed ports of your Minikube server by running the following command. + +```bash + +minikube service pulsar-mini-proxy -n pulsar + +``` + +**Output** + +```bash + +|-----------|-------------------|-------------|-------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|-------------------------| +| pulsar | pulsar-mini-proxy | http/80 | http://172.17.0.4:32305 | +| | | pulsar/6650 | http://172.17.0.4:31816 | +|-----------|-------------------|-------------|-------------------------| +🏃 Starting tunnel for service pulsar-mini-proxy. +|-----------|-------------------|-------------|------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|------------------------| +| pulsar | pulsar-mini-proxy | | http://127.0.0.1:61853 | +| | | | http://127.0.0.1:61854 | +|-----------|-------------------|-------------|------------------------| + +``` + +At this point, you can get the service URLs to connect to your Pulsar client. Here are URL examples: + +``` + +webServiceUrl=http://127.0.0.1:61853/ +brokerServiceUrl=pulsar://127.0.0.1:61854/ + +``` + +Then you can proceed with the following steps: + +1. Download the Apache Pulsar tarball from the [downloads page](https://pulsar.apache.org/download/). + +2. Decompress the tarball based on your download file. + + ```bash + + tar -xf .tar.gz + + ``` + +3. Expose `PULSAR_HOME`. + + (1) Enter the directory of the decompressed download file. + + (2) Expose `PULSAR_HOME` as the environment variable. + + ```bash + + export PULSAR_HOME=$(pwd) + + ``` + +4. Configure the Pulsar client. + + In the `${PULSAR_HOME}/conf/client.conf` file, replace `webServiceUrl` and `brokerServiceUrl` with the service URLs you get from the above steps. + +5. Create a subscription to consume messages from `apache/pulsar/test-topic`. 
+ + ```bash + + bin/pulsar-client consume -s sub apache/pulsar/test-topic -n 0 + + ``` + +6. Open a new terminal. In the new terminal, create a producer and send 10 messages to the `test-topic` topic. + + ```bash + + bin/pulsar-client produce apache/pulsar/test-topic -m "---------hello apache pulsar-------" -n 10 + + ``` + +7. Verify the results. + + - From the producer side + + **Output** + + The messages have been produced successfully. + + ```bash + + 18:15:15.489 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 10 messages successfully produced + + ``` + + - From the consumer side + + **Output** + + At the same time, you can receive the messages as below. + + ```bash + + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + + ``` + +## Step 4: Use Pulsar Manager to manage the cluster + +[Pulsar Manager](administration-pulsar-manager.md) is a web-based GUI management tool for managing and monitoring Pulsar. + +1. By default, the `Pulsar Manager` is exposed as a separate `LoadBalancer`. You can open the Pulsar Manager UI using the following command: + + ```bash + + minikube service -n pulsar pulsar-mini-pulsar-manager + + ``` + +2. The Pulsar Manager UI will be open in your browser. You can use the username `pulsar` and password `pulsar` to log into Pulsar Manager. + +3. In Pulsar Manager UI, you can create an environment. 
+ + - Click `New Environment` button in the top-left corner. + - Type `pulsar-mini` for the field `Environment Name` in the popup window. + - Type `http://pulsar-mini-broker:8080` for the field `Service URL` in the popup window. + - Click `Confirm` button in the popup window. + +4. After successfully creating an environment, you are redirected to the `tenants` page of that environment. Then you can create `tenants`, `namespaces` and `topics` using the Pulsar Manager. + +## Step 5: Use Prometheus and Grafana to monitor cluster + +Grafana is an open-source visualization tool, which can be used for visualizing time series data into dashboards. + +1. By default, the Grafana is exposed as a separate `LoadBalancer`. You can open the Grafana UI using the following command: + + ```bash + + minikube service pulsar-mini-grafana -n pulsar + + ``` + +2. The Grafana UI is open in your browser. You can use the username `pulsar` and password `pulsar` to log into the Grafana Dashboard. + +3. You can view dashboards for different components of a Pulsar cluster. diff --git a/site2/website/versioned_docs/version-2.8.x/performance-pulsar-perf.md b/site2/website/versioned_docs/version-2.8.x/performance-pulsar-perf.md new file mode 100644 index 0000000000000..7b7f312bbb3ca --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/performance-pulsar-perf.md @@ -0,0 +1,227 @@ +--- +id: performance-pulsar-perf +title: Pulsar Perf +sidebar_label: "Pulsar Perf" +original_id: performance-pulsar-perf +--- + +The Pulsar Perf is a built-in performance test tool for Apache Pulsar. You can use the Pulsar Perf to test message writing or reading performance. For detailed information about performance tuning, see [here](https://streamnative.io/en/blog/tech/2021-01-14-pulsar-architecture-performance-tuning). + +## Produce messages + +This example shows how the Pulsar Perf produces messages with default options. 
For all configuration options available for the `pulsar-perf produce` command, see [configuration options](#configuration-options-for-pulsar-perf-produce). + +``` + +bin/pulsar-perf produce my-topic + +``` + +After the command is executed, the test data is continuously output on the Console. + +**Output** + +``` + +19:53:31.459 [pulsar-perf-producer-exec-1-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Created 1 producers +19:53:31.482 [pulsar-timer-5-1] WARN com.scurrilous.circe.checksum.Crc32cIntChecksum - Failed to load Circe JNI library. Falling back to Java based CRC32c provider +19:53:40.861 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 93.7 msg/s --- 0.7 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.575 ms - med: 3.460 - 95pct: 4.790 - 99pct: 5.308 - 99.9pct: 5.834 - 99.99pct: 6.609 - Max: 6.609 +19:53:50.909 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.437 ms - med: 3.328 - 95pct: 4.656 - 99pct: 5.071 - 99.9pct: 5.519 - 99.99pct: 5.588 - Max: 5.588 +19:54:00.926 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.376 ms - med: 3.276 - 95pct: 4.520 - 99pct: 4.939 - 99.9pct: 5.440 - 99.99pct: 5.490 - Max: 5.490 +19:54:10.940 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.298 ms - med: 3.220 - 95pct: 4.474 - 99pct: 4.926 - 99.9pct: 5.645 - 99.99pct: 5.654 - Max: 5.654 +19:54:20.956 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.1 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.308 ms - med: 3.199 - 95pct: 4.532 - 99pct: 4.871 - 99.9pct: 5.291 - 99.99pct: 5.323 - Max: 5.323 +19:54:30.972 [main] INFO 
org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.249 ms - med: 3.144 - 95pct: 4.437 - 99pct: 4.970 - 99.9pct: 5.329 - 99.99pct: 5.414 - Max: 5.414 +19:54:40.987 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.435 ms - med: 3.361 - 95pct: 4.772 - 99pct: 5.150 - 99.9pct: 5.373 - 99.99pct: 5.837 - Max: 5.837 +^C19:54:44.325 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Aggregated throughput stats --- 7286 records sent --- 99.140 msg/s --- 0.775 Mbit/s +19:54:44.336 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Aggregated latency stats --- Latency: mean: 3.383 ms - med: 3.293 - 95pct: 4.610 - 99pct: 5.059 - 99.9pct: 5.588 - 99.99pct: 5.837 - 99.999pct: 6.609 - Max: 6.609 + +``` + +From the above test data, you can get the throughput statistics and the write latency statistics. The aggregated statistics is printed when the Pulsar Perf is stopped. You can press **Ctrl**+**C** to stop the Pulsar Perf. After the Pulsar Perf is stopped, the [HdrHistogram](http://hdrhistogram.github.io/HdrHistogram/) formatted test result appears under your directory. The document looks like `perf-producer-1589370810837.hgrm`. You can also check the test result through [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html). For details about how to check the test result through [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html), see [HdrHistogram Plotter](#hdrhistogram-plotter). + +### Configuration options for `pulsar-perf produce` + +You can get all options by executing the `bin/pulsar-perf produce -h` command. Therefore, you can modify these options as required. + +The following table lists configuration options available for the `pulsar-perf produce` command. 
+ +| Option | Description | Default value| +|----|----|----| +| access-mode | Set the producer access mode. Valid values are `Shared`, `Exclusive` and `WaitForExclusive`. | Shared | +| admin-url | Set the Pulsar admin URL. | N/A | +| auth-params | Set the authentication parameters, whose format is determined by the implementation of the `configure` method in the authentication plugin class, such as "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}". | N/A | +| auth_plugin | Set the authentication plugin class name. | N/A | +| listener-name | Set the listener name for the broker. | N/A | +| batch-max-bytes | Set the maximum number of bytes for each batch. | 4194304 | +| batch-max-messages | Set the maximum number of messages for each batch. | 1000 | +| batch-time-window | Set a window for a batch of messages. | 1 ms | +| busy-wait | Enable or disable Busy-Wait on the Pulsar client. | false | +| chunking | Configure whether to split the message and publish in chunks if message size is larger than allowed max size. | false | +| compression | Compress the message payload. | N/A | +| conf-file | Set the configuration file. | N/A | +| delay | Mark messages with a given delay. | 0s | +| encryption-key-name | Set the name of the public key used to encrypt the payload. | N/A | +| encryption-key-value-file | Set the file which contains the public key used to encrypt the payload. | N/A | +| exit-on-failure | Configure whether to exit from the process on publish failure. | false | +| format-class | Set the custom formatter class name. | org.apache.pulsar.testclient.DefaultMessageFormatter | +| format-payload | Configure whether to format %i as a message index in the stream from producer and/or %t as the timestamp nanoseconds. | false | +| help | Configure the help message. | false | +| max-connections | Set the maximum number of TCP connections to a single broker. | 100 | +| max-outstanding | Set the maximum number of outstanding messages. 
| 1000 | +| max-outstanding-across-partitions | Set the maximum number of outstanding messages across partitions. | 50000 | +| message-key-generation-mode | Set the generation mode of message key. Valid options are `autoIncrement`, `random`. | N/A | +| num-io-threads | Set the number of threads to be used for handling connections to brokers. | 1 | +| num-messages | Set the number of messages to be published in total. If it is set to 0, it keeps publishing messages. | 0 | +| num-producers | Set the number of producers for each topic. | 1 | +| num-test-threads | Set the number of test threads. | 1 | +| num-topic | Set the number of topics. | 1 | +| partitions | Configure whether to create partitioned topics with the given number of partitions. | N/A | +| payload-delimiter | Set the delimiter used to split lines when using payload from a file. | \n | +| payload-file | Use the payload from an UTF-8 encoded text file and a payload is randomly selected when messages are published. | N/A | +| producer-name | Set the producer name. | N/A | +| rate | Set the publish rate of messages across topics. | 100 | +| send-timeout | Set the sendTimeout. | 0 | +| separator | Set the separator between the topic and topic number. | - | +| service-url | Set the Pulsar service URL. | | +| size | Set the message size. | 1024 bytes | +| stats-interval-seconds | Set the statistics interval. If it is set to 0, statistics is disabled. | 0 | +| test-duration | Set the test duration. If it is set to 0, it keeps publishing tests. | 0s | +| trust-cert-file | Set the path for the trusted TLS certificate file. | | | +| warmup-time | Set the warm-up time. | 1s | +| tls-allow-insecure | Set the allowed insecure TLS connection. | N/A | + +## Consume messages + +This example shows how the Pulsar Perf consumes messages with default options. + +``` + +bin/pulsar-perf consume my-topic + +``` + +After the command is executed, the test data is continuously output on the Console. 
+ +**Output** + +``` + +20:35:37.071 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Start receiving from 1 consumers on 1 topics +20:35:41.150 [pulsar-client-io-1-9] WARN com.scurrilous.circe.checksum.Crc32cIntChecksum - Failed to load Circe JNI library. Falling back to Java based CRC32c provider +20:35:47.092 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 59.572 msg/s -- 0.465 Mbit/s --- Latency: mean: 11.298 ms - med: 10 - 95pct: 15 - 99pct: 98 - 99.9pct: 137 - 99.99pct: 152 - Max: 152 +20:35:57.104 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.958 msg/s -- 0.781 Mbit/s --- Latency: mean: 9.176 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 18 - Max: 18 +20:36:07.115 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 100.006 msg/s -- 0.781 Mbit/s --- Latency: mean: 9.316 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +20:36:17.125 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 100.085 msg/s -- 0.782 Mbit/s --- Latency: mean: 9.327 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +20:36:27.136 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.900 msg/s -- 0.780 Mbit/s --- Latency: mean: 9.404 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +20:36:37.147 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.985 msg/s -- 0.781 Mbit/s --- Latency: mean: 8.998 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +^C20:36:42.755 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceConsumer - Aggregated throughput stats --- 6051 records received --- 92.125 msg/s --- 0.720 Mbit/s +20:36:42.759 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceConsumer - Aggregated latency stats --- Latency: mean: 9.422 ms - med: 9 - 
95pct: 15 - 99pct: 16 - 99.9pct: 98 - 99.99pct: 137 - 99.999pct: 152 - Max: 152 + +``` + +From the output test data, you can get the throughput statistics and the end-to-end latency statistics. The aggregated statistics is printed after the Pulsar Perf is stopped. You can press **Ctrl**+**C** to stop the Pulsar Perf. + +### Configuration options for `pulsar-perf consume` + +You can get all options by executing the `bin/pulsar-perf consume -h` command. Therefore, you can modify these options as required. + +The following table lists configuration options available for the `pulsar-perf consume` command. + +| Option | Description | Default value | +|----|----|----| +| acks-delay-millis | Set the acknowledgment grouping delay in milliseconds. | 100 ms | +| auth-params | Set the authentication parameters, whose format is determined by the implementation of the `configure` method in the authentication plugin class, such as "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}". | N/A | +| auth_plugin | Set the authentication plugin class name. | N/A | +| auto_ack_chunk_q_full | Configure whether to automatically ack for the oldest message in receiver queue if the queue is full. | false | +| listener-name | Set the listener name for the broker. | N/A | +| batch-index-ack | Enable or disable the batch index acknowledgment. | false | +| busy-wait | Enable or disable Busy-Wait on the Pulsar client. | false | +| conf-file | Set the configuration file. | N/A | +| encryption-key-name | Set the name of the public key used to encrypt the payload. | N/A | +| encryption-key-value-file | Set the file which contains the public key used to encrypt the payload. | N/A | +| help | Configure the help message. | false | +| expire_time_incomplete_chunked_messages | Set the expiration time for incomplete chunk messages (in milliseconds). | 0 | +| max-connections | Set the maximum number of TCP connections to a single broker. | 100 | +| max_chunked_msg | Set the max pending chunk messages. 
| 0 | +| num-consumers | Set the number of consumers for each topic. | 1 | +| num-io-threads |Set the number of threads to be used for handling connections to brokers. | 1 | +| num-subscriptions | Set the number of subscriptions (per topic). | 1 | +| num-topic | Set the number of topics. | 1 | +| pool-messages | Configure whether to use the pooled message. | true | +| rate | Simulate a slow message consumer (rate in msg/s). | 0.0 | +| receiver-queue-size | Set the size of the receiver queue. | 1000 | +| receiver-queue-size-across-partitions | Set the max total size of the receiver queue across partitions. | 50000 | +| replicated | Configure whether the subscription status should be replicated. | false | +| service-url | Set the Pulsar service URL. | | +| stats-interval-seconds | Set the statistics interval. If it is set to 0, statistics is disabled. | 0 | +| subscriber-name | Set the subscriber name prefix. | sub | +| subscription-position | Set the subscription position. Valid values are `Latest`, `Earliest`.| Latest | +| subscription-type | Set the subscription type.
  • Exclusive
  • Shared
  • Failover
  • Key_Shared
  • | Exclusive | +| test-duration | Set the test duration (in seconds). If the value is 0 or smaller than 0, it keeps consuming messages. | 0 | +| tls-allow-insecure | Set the allowed insecure TLS connection. | N/A | +| trust-cert-file | Set the path for the trusted TLS certificate file. | | | + +## Configurations + +By default, the Pulsar Perf uses `conf/client.conf` as the default configuration and uses `conf/log4j2.yaml` as the default Log4j configuration. If you want to connect to other Pulsar clusters, you can update the `brokerServiceUrl` in the client configuration. + +You can use the following commands to change the configuration file and the Log4j configuration file. + +``` + +export PULSAR_CLIENT_CONF= +export PULSAR_LOG_CONF= + +``` + +In addition, you can use the following command to configure the JVM configuration through environment variables: + +``` + +export PULSAR_EXTRA_OPTS='-Xms4g -Xmx4g -XX:MaxDirectMemorySize=4g' + +``` + +## HdrHistogram Plotter + +The [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html) is a visualization tool for checking Pulsar Perf test results, which makes it easier to observe the test results. + +To check test results through the HdrHistogram Plotter, follow these steps: + +1. Clone the HdrHistogram repository from GitHub to the local. + + ``` + + git clone https://github.com/HdrHistogram/HdrHistogram.git + + ``` + +2. Switch to the HdrHistogram folder. + + ``` + + cd HdrHistogram + + ``` + +3. Install the HdrHistogram Plotter. + + ``` + + mvn clean install -DskipTests + + ``` + +4. Transform the file generated by the Pulsar Perf. + + ``` + + ./HistogramLogProcessor -i -o + + ``` + +5. You will get two output files. Upload the output file with the filename extension of .hgrm to the [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html). + +6. Check the test result through the Graphical User Interface of the HdrHistogram Plotter, as shown below.
+ + ![](/assets/perf-produce.png) diff --git a/site2/website/versioned_docs/version-2.8.x/reference-cli-tools.md b/site2/website/versioned_docs/version-2.8.x/reference-cli-tools.md new file mode 100644 index 0000000000000..17422db701a00 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/reference-cli-tools.md @@ -0,0 +1,959 @@ +--- +id: reference-cli-tools +title: Pulsar command-line tools +sidebar_label: "Pulsar CLI tools" +original_id: reference-cli-tools +--- + +Pulsar offers several command-line tools that you can use for managing Pulsar installations, performance testing, using command-line producers and consumers, and more. + +All Pulsar command-line tools can be run from the `bin` directory of your [installed Pulsar package](getting-started-standalone.md). The following tools are currently documented: + +* [`pulsar`](#pulsar) +* [`pulsar-client`](#pulsar-client) +* [`pulsar-daemon`](#pulsar-daemon) +* [`pulsar-perf`](#pulsar-perf) +* [`bookkeeper`](#bookkeeper) +* [`broker-tool`](#broker-tool) + +> ### Getting help +> You can get help for any CLI tool, command, or subcommand using the `--help` flag, or `-h` for short. Here's an example: + +> ```shell +> +> $ bin/pulsar broker --help +> +> +> ``` + + +## `pulsar` + +The pulsar tool is used to start Pulsar components, such as bookies and ZooKeeper, in the foreground. + +These processes can also be started in the background, using nohup, using the pulsar-daemon tool, which has the same command interface as pulsar. + +Usage: + +```bash + +$ pulsar command + +``` + +Commands: +* `bookie` +* `broker` +* `compact-topic` +* `discovery` +* `configuration-store` +* `initialize-cluster-metadata` +* `proxy` +* `standalone` +* `websocket` +* `zookeeper` +* `zookeeper-shell` + +Example: + +```bash + +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar broker + +``` + +The table below lists the environment variables that you can use to configure the `pulsar` tool. 
+ +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|`conf/log4j2.yaml`| +|`PULSAR_BROKER_CONF`|Configuration file for broker|`conf/broker.conf`| +|`PULSAR_BOOKKEEPER_CONF`|Configuration file for bookie|`conf/bookkeeper.conf`| +|`PULSAR_ZK_CONF`|Configuration file for zookeeper|`conf/zookeeper.conf`| +|`PULSAR_CONFIGURATION_STORE_CONF`|Configuration file for the configuration store|`conf/global_zookeeper.conf`| +|`PULSAR_DISCOVERY_CONF`|Configuration file for discovery service|`conf/discovery.conf`| +|`PULSAR_WEBSOCKET_CONF`|Configuration file for websocket proxy|`conf/websocket.conf`| +|`PULSAR_STANDALONE_CONF`|Configuration file for standalone|`conf/standalone.conf`| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the jvm|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| +|`PULSAR_PID_DIR`|Folder where the pulsar server PID file should be stored|| +|`PULSAR_STOP_TIMEOUT`|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| + + + +### `bookie` + +Starts up a bookie server + +Usage: + +```bash + +$ pulsar bookie options + +``` + +Options + +|Option|Description|Default| +|---|---|---| +|`-readOnly`|Force start a read-only bookie server|false| +|`-withAutoRecovery`|Start auto-recover service bookie server|false| + + +Example + +```bash + +$ PULSAR_BOOKKEEPER_CONF=/path/to/bookkeeper.conf pulsar bookie \ + -readOnly \ + -withAutoRecovery + +``` + +### `broker` + +Starts up a Pulsar broker + +Usage + +```bash + +$ pulsar broker options + +``` + +Options + +|Option|Description|Default| +|---|---|---| +|`-bc` , `--bookie-conf`|Configuration file for BookKeeper|| +|`-rb` , `--run-bookie`|Run a BookKeeper bookie on the same host as the Pulsar broker|false| +|`-ra` , `--run-bookie-autorecovery`|Run a BookKeeper autorecovery daemon on the same host as the Pulsar broker|false| + +Example + +```bash + +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar
broker + +``` + +### `compact-topic` + +Run compaction against a Pulsar topic (in a new process) + +Usage + +```bash + +$ pulsar compact-topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-t` , `--topic`|The Pulsar topic that you would like to compact|| + +Example + +```bash + +$ pulsar compact-topic --topic topic-to-compact + +``` + +### `discovery` + +Run a discovery server + +Usage + +```bash + +$ pulsar discovery + +``` + +Example + +```bash + +$ PULSAR_DISCOVERY_CONF=/path/to/discovery.conf pulsar discovery + +``` + +### `configuration-store` + +Starts up the Pulsar configuration store + +Usage + +```bash + +$ pulsar configuration-store + +``` + +Example + +```bash + +$ PULSAR_CONFIGURATION_STORE_CONF=/path/to/configuration_store.conf pulsar configuration-store + +``` + +### `initialize-cluster-metadata` + +One-time cluster metadata initialization + +Usage + +```bash + +$ pulsar initialize-cluster-metadata options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-ub` , `--broker-service-url`|The broker service URL for the new cluster|| +|`-tb` , `--broker-service-url-tls`|The broker service URL for the new cluster with TLS encryption|| +|`-c` , `--cluster`|Cluster name|| +|`-cs` , `--configuration-store`|The configuration store quorum connection string|| +|`--existing-bk-metadata-service-uri`|The metadata service URI of the existing BookKeeper cluster that you want to use|| +|`-h` , `--help`|Help message|false| +|`--initial-num-stream-storage-containers`|The number of storage containers of BookKeeper stream storage|16| +|`--initial-num-transaction-coordinators`|The number of transaction coordinators assigned in a cluster|16| +|`-uw` , `--web-service-url`|The web service URL for the new cluster|| +|`-tw` , `--web-service-url-tls`|The web service URL for the new cluster with TLS encryption|| +|`-zk` , `--zookeeper`|The local ZooKeeper quorum connection string|| +|`--zookeeper-session-timeout-ms`|The local ZooKeeper 
session timeout. The time unit is milliseconds (ms)|30000| + + +### `proxy` + +Manages the Pulsar proxy + +Usage + +```bash + +$ pulsar proxy options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--configuration-store`|Configuration store connection string|| +|`-zk` , `--zookeeper-servers`|Local ZooKeeper connection string|| + +Example + +```bash + +$ PULSAR_PROXY_CONF=/path/to/proxy.conf pulsar proxy \ + --zookeeper-servers zk-0,zk-1,zk-2 \ + --configuration-store zk-0,zk-1,zk-2 + +``` + +### `standalone` + +Run a broker service with local bookies and local ZooKeeper + +Usage + +```bash + +$ pulsar standalone options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-a` , `--advertised-address`|The standalone broker advertised address|| +|`--bookkeeper-dir`|Local bookies’ base data directory|data/standalone/bookkeeper| +|`--bookkeeper-port`|Local bookies’ base port|3181| +|`--no-broker`|Only start ZooKeeper and BookKeeper services, not the broker|false| +|`--num-bookies`|The number of local bookies|1| +|`--only-broker`|Only start the Pulsar broker service (not ZooKeeper or BookKeeper)|| +|`--wipe-data`|Clean up previous ZooKeeper/BookKeeper data|| +|`--zookeeper-dir`|Local ZooKeeper’s data directory|data/standalone/zookeeper| +|`--zookeeper-port` |Local ZooKeeper’s port|2181| + +Example + +```bash + +$ PULSAR_STANDALONE_CONF=/path/to/standalone.conf pulsar standalone + +``` + +### `websocket` + +Usage + +```bash + +$ pulsar websocket + +``` + +Example + +```bash + +$ PULSAR_WEBSOCKET_CONF=/path/to/websocket.conf pulsar websocket + +``` + +### `zookeeper` + +Starts up a ZooKeeper cluster + +Usage + +```bash + +$ pulsar zookeeper + +``` + +Example + +```bash + +$ PULSAR_ZK_CONF=/path/to/zookeeper.conf pulsar zookeeper + +``` + +### `zookeeper-shell` + +Connects to a running ZooKeeper cluster using the ZooKeeper shell + +Usage + +```bash + +$ pulsar zookeeper-shell options + +``` + +Options + +|Flag|Description|Default| 
+|---|---|---| +|`-c`, `--conf`|Configuration file for ZooKeeper|| +|`-server`|Configuration zk address, eg: `127.0.0.1:2181`|| + + + +## `pulsar-client` + +The pulsar-client tool + +Usage + +```bash + +$ pulsar-client command + +``` + +Commands +* `produce` +* `consume` + + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{\"key1\":\"val1\",\"key2\":\"val2\"}"|{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}| +|`--auth-plugin`|Authentication plugin class name|org.apache.pulsar.client.impl.auth.AuthenticationSasl| +|`--listener-name`|Listener name for the broker|| +|`--proxy-protocol`|Proxy protocol to select type of routing at proxy|| +|`--proxy-url`|Proxy-server URL to which to connect|| +|`--url`|Broker URL to which to connect|pulsar://localhost:6650/
    ws://localhost:8080 | +| `-v`, `--version` | Get the version of the Pulsar client || +|`-h`, `--help`|Show this help|| + + +### `produce` +Send a message or messages to a specific broker and topic + +Usage + +```bash + +$ pulsar-client produce topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-f`, `--files`|Comma-separated file paths to send; either -m or -f must be specified|[]| +|`-m`, `--messages`|Comma-separated string of messages to send; either -m or -f must be specified|[]| +|`-n`, `--num-produce`|The number of times to send the message(s); the count of messages/files * num-produce should be below 1000|1| +|`-r`, `--rate`|Rate (in messages per second) at which to produce; a value 0 means to produce messages as fast as possible|0.0| +|`-db`, `--disable-batching`|Disable batch sending of messages.
    **Note:** This flag is only available in 2.8.2 and later versions. |false| +|`-c`, `--chunking`|Split the message and publish in chunks if the message size is larger than the allowed max size|false| +|`-s`, `--separator`|Character to split messages string with.|","| +|`-k`, `--key`|Message key to add|key=value string, like k1=v1,k2=v2.| +|`-p`, `--properties`|Properties to add. If you want to add multiple properties, use the comma as the separator, e.g. `k1=v1,k2=v2`.| | +|`-ekn`, `--encryption-key-name`|The public key name to encrypt payload.| | +|`-ekv`, `--encryption-key-value`|The URI of public key to encrypt payload. For example, `file:///path/to/public.key` or `data:application/x-pem-file;base64,*****`.| | + + +### `consume` +Consume messages from a specific broker and topic + +Usage + +```bash + +$ pulsar-client consume topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--hex`|Display binary messages in hexadecimal format.|false| +|`-n`, `--num-messages`|Number of messages to consume, 0 means to consume forever.|1| +|`-r`, `--rate`|Rate (in messages per second) at which to consume; a value 0 means to consume messages as fast as possible|0.0| +|`--regex`|Indicate the topic name is a regex pattern|false| +|`-s`, `--subscription-name`|Subscription name|| +|`-t`, `--subscription-type`|The type of the subscription. Possible values: Exclusive, Shared, Failover, Key_Shared.|Exclusive| +|`-p`, `--subscription-position`|The position of the subscription. Possible values: Latest, Earliest.|Latest| +|`-m`, `--subscription-mode`|Subscription mode. 
Possible values: Durable, NonDurable.|Durable| +|`-q`, `--queue-size`|The size of consumer's receiver queue.|0| +|`-mc`, `--max_chunked_msg`|Max pending chunk messages.|0| +|`-ac`, `--auto_ack_chunk_q_full`|Auto ack for the oldest message in consumer's receiver queue if the queue is full.|false| +|`--hide-content`|Do not print the message to the console.|false| +|`-st`, `--schema-type`|Set the schema type. Use `auto_consume` to dump AVRO and other structured data types. Possible values: bytes, auto_consume.|bytes| +|`-ekv`, `--encryption-key-value`|The URI of private key to decrypt payload. For example, `file:///path/to/private.key` or `data:application/x-pem-file;base64,*****`.| | +|`-pm`, `--pool-messages`|Use the pooled message.|true| + +## `pulsar-daemon` +A wrapper around the pulsar tool that’s used to start and stop processes, such as ZooKeeper, bookies, and Pulsar brokers, in the background using nohup. + +pulsar-daemon has a similar interface to the pulsar command but adds start and stop commands for various services. For a listing of those services, run pulsar-daemon to see the help output or see the documentation for the pulsar command. + +Usage + +```bash + +$ pulsar-daemon command + +``` + +Commands +* `start` +* `stop` + + +### `start` +Start a service in the background using nohup. + +Usage + +```bash + +$ pulsar-daemon start service + +``` + +### `stop` +Stop a service that’s already been started using start. + +Usage + +```bash + +$ pulsar-daemon stop service options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-force`|Stop the service forcefully if not stopped by normal shutdown.|false| + + + +## `pulsar-perf` +A tool for performance testing a Pulsar broker. 
+ +Usage + +```bash + +$ pulsar-perf command + +``` + +Commands +* `consume` +* `produce` +* `read` +* `websocket-producer` +* `managed-ledger` +* `monitor-brokers` +* `simulation-client` +* `simulation-controller` +* `help` + +Environment variables + +The table below lists the environment variables that you can use to configure the pulsar-perf tool. + +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|conf/log4j2.yaml| +|`PULSAR_CLIENT_CONF`|Configuration file for the client|conf/client.conf| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the JVM|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| + + +### `consume` +Run a consumer + +Usage + +``` + +$ pulsar-perf consume options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth_plugin`|Authentication plugin class name|| +|`-ac`, `--auto_ack_chunk_q_full`|Auto ack for the oldest message in consumer's receiver queue if the queue full|false| +|`--listener-name`|Listener name for the broker|| +|`--acks-delay-millis`|Acknowledgements grouping delay in millis|100| +|`--batch-index-ack`|Enable or disable the batch index acknowledgment|false| +|`-bw`, `--busy-wait`|Enable or disable Busy-Wait on the Pulsar client|false| +|`-v`, `--encryption-key-value-file`|The file which contains the private key to decrypt payload|| +|`-h`, `--help`|Help message|false| +|`--conf-file`|Configuration file|| +|`-e`, `--expire_time_incomplete_chunked_messages`|The expiration time for incomplete chunk messages (in milliseconds)|0| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-mc`, `--max_chunked_msg`|Max pending chunk messages|0| +|`-n`, `--num-consumers`|Number of consumers (per topic)|1| 
+|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-ns`, `--num-subscriptions`|Number of subscriptions (per topic)|1| +|`-t`, `--num-topics`|The number of topics|1| +|`-pm`, `--pool-messages`|Use the pooled message|true| +|`-r`, `--rate`|Simulate a slow message consumer (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-p`, `--receiver-queue-size-across-partitions`|Max total size of the receiver queue across partitions|50000| +|`--replicated`|Whether the subscription status should be replicated|false| +|`-u`, `--service-url`|Pulsar service URL|| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled|0| +|`-s`, `--subscriber-name`|Subscriber name prefix.
    **Note:** This flag is deprecated in 2.8.2 and later versions. |sub| +|`-ss`, `--subscriptions`|A list of subscriptions to consume on (e.g. sub1,sub2)|sub| +|`-st`, `--subscription-type`|Subscriber type. Possible values are Exclusive, Shared, Failover, Key_Shared.|Exclusive| +|`-sp`, `--subscription-position`|Subscriber position. Possible values are Latest, Earliest.|Latest| +|`-time`, `--test-duration`|Test duration (in seconds). If the value is 0 or smaller than 0, it keeps consuming messages|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + + +### `produce` +Run a producer + +Usage + +```bash + +$ pulsar-perf produce options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-am`, `--access-mode`|Producer access mode. Valid values are `Shared`, `Exclusive` and `WaitForExclusive`|Shared| +|`-au`, `--admin-url`|Pulsar admin URL|| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth_plugin`|Authentication plugin class name|| +|`--listener-name`|Listener name for the broker|| +|`-b`, `--batch-time-window`|Batch messages in a window of the specified number of milliseconds|1| +|`-bb`, `--batch-max-bytes`|Maximum number of bytes per batch|4194304| +|`-bm`, `--batch-max-messages`|Maximum number of messages per batch|1000| +|`-bw`, `--busy-wait`|Enable or disable Busy-Wait on the Pulsar client|false| +|`-ch`, `--chunking`|Split the message and publish in chunks if the message size is larger than allowed max size|false| +|`-d`, `--delay`|Mark messages with a given delay in seconds|0s| +|`-z`, `--compression`|Compress messages’ payload. 
Possible values are NONE, LZ4, ZLIB, ZSTD or SNAPPY.|| +|`--conf-file`|Configuration file|| +|`-k`, `--encryption-key-name`|The public key name to encrypt payload|| +|`-v`, `--encryption-key-value-file`|The file which contains the public key to encrypt payload|| +|`-ef`, `--exit-on-failure`|Exit from the process on publish failure|false| +|`-fc`, `--format-class`|Custom Formatter class name|org.apache.pulsar.testclient.DefaultMessageFormatter| +|`-fp`, `--format-payload`|Format %i as a message index in the stream from producer and/or %t as the timestamp nanoseconds|false| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-o`, `--max-outstanding`|Max number of outstanding messages|1000| +|`-p`, `--max-outstanding-across-partitions`|Max number of outstanding messages across partitions|50000| +|`-mk`, `--message-key-generation-mode`|The generation mode of message key. Valid options are `autoIncrement`, `random`|| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-m`, `--num-messages`|Number of messages to publish in total. If the value is 0 or smaller than 0, it keeps publishing messages.|0| +|`-n`, `--num-producers`|The number of producers (per topic)|1| +|`-threads`, `--num-test-threads`|Number of test threads|1| +|`-t`, `--num-topic`|The number of topics|1| +|`-np`, `--partitions`|Create partitioned topics with the given number of partitions. 
Setting this value to 0 means not trying to create a topic|| +|`-f`, `--payload-file`|Use payload from an UTF-8 encoded text file and a payload will be randomly selected when publishing messages|| +|`-e`, `--payload-delimiter`|The delimiter used to split lines when using payload from a file|\n| +|`-pn`, `--producer-name`|Producer Name|| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`--send-timeout`|Set the sendTimeout|0| +|`--separator`|Separator between the topic and topic number|-| +|`-u`, `--service-url`|Pulsar service URL|| +|`-s`, `--size`|Message size (in bytes)|1024| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0| +|`-time`, `--test-duration`|Test duration (in seconds). If the value is 0 or smaller than 0, it keeps publishing messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--warmup-time`|Warm-up time in seconds|1| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + + +### `read` +Run a topic reader + +Usage + +```bash + +$ pulsar-perf read options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. 
For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth_plugin`|Authentication plugin class name|| +|`--listener-name`|Listener name for the broker|| +|`--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-t`, `--num-topics`|The number of topics|1| +|`-r`, `--rate`|Simulate a slow message reader (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-u`, `--service-url`|Pulsar service URL|| +|`-m`, `--start-message-id`|Start message id. This can be either 'earliest', 'latest' or a specific message id by using 'lid:eid'|earliest| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0| +|`-time`, `--test-duration`|Test duration (in seconds). If the value is 0 or smaller than 0, it keeps consuming messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--use-tls`|Use TLS encryption on the connection|false| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + +### `websocket-producer` +Run a websocket producer + +Usage + +```bash + +$ pulsar-perf websocket-producer options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth_plugin`|Authentication plugin class name|| +|`--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-m`, `--num-messages`|Number of messages to publish in total. 
If the value is 0 or smaller than 0, it keeps publishing messages|0| +|`-t`, `--num-topic`|The number of topics|1| +|`-f`, `--payload-file`|Use payload from a file instead of empty buffer|| +|`-u`, `--proxy-url`|Pulsar Proxy URL, e.g., "ws://localhost:8080/"|| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration (in seconds). If the value is 0 or smaller than 0, it keeps publishing messages|0| + + +### `managed-ledger` +Write directly on managed-ledgers + +Usage + +```bash + +$ pulsar-perf managed-ledger options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-a`, `--ack-quorum`|Ledger ack quorum|1| +|`-dt`, `--digest-type`|BookKeeper digest type. Possible Values: [CRC32, MAC, CRC32C, DUMMY]|CRC32C| +|`-e`, `--ensemble-size`|Ledger ensemble size|1| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single bookie|1| +|`-o`, `--max-outstanding`|Max number of outstanding requests|1000| +|`-m`, `--num-messages`|Number of messages to publish in total. If the value is 0 or smaller than 0, it keeps publishing messages|0| +|`-t`, `--num-topic`|Number of managed ledgers|1| +|`-r`, `--rate`|Write rate msg/s across managed ledgers|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration (in seconds). 
If the value is 0 or smaller than 0, it keeps publishing messages|0| +|`--threads`|Number of threads writing|1| +|`-w`, `--write-quorum`|Ledger write quorum|1| +|`-zk`, `--zookeeperServers`|ZooKeeper connection string|| + + +### `monitor-brokers` +Continuously receive broker data and/or load reports + +Usage + +```bash + +$ pulsar-perf monitor-brokers options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--connect-string`|A connection string for one or more ZooKeeper servers|| +|`-h`, `--help`|Help message|false| + + +### `simulation-client` +Run a simulation server acting as a Pulsar client. Uses the client configuration specified in `conf/client.conf`. + +Usage + +```bash + +$ pulsar-perf simulation-client options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--port`|Port to listen on for controller|0| +|`--service-url`|Pulsar Service URL|| +|`-h`, `--help`|Help message|false| + +### `simulation-controller` +Run a simulation controller to give commands to servers + +Usage + +```bash + +$ pulsar-perf simulation-controller options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--client-port`|The port that the clients are listening on|0| +|`--clients`|Comma-separated list of client hostnames|| +|`--cluster`|The cluster to test on|| +|`-h`, `--help`|Help message|false| + + +### `help` +This help message + +Usage + +```bash + +$ pulsar-perf help + +``` + +## `bookkeeper` +A tool for managing BookKeeper. + +Usage + +```bash + +$ bookkeeper command + +``` + +Commands +* `autorecovery` +* `bookie` +* `localbookie` +* `upgrade` +* `shell` + + +Environment variables + +The table below lists the environment variables that you can use to configure the bookkeeper tool. 
+ +|Variable|Description|Default| +|---|---|---| +|BOOKIE_LOG_CONF|Log4j configuration file|conf/log4j2.yaml| +|BOOKIE_CONF|BookKeeper configuration file|conf/bk_server.conf| +|BOOKIE_EXTRA_OPTS|Extra options to be passed to the JVM|| +|BOOKIE_EXTRA_CLASSPATH|Extra paths for BookKeeper's classpath|| +|ENTRY_FORMATTER_CLASS|The Java class used to format entries|| +|BOOKIE_PID_DIR|Folder where the BookKeeper server PID file should be stored|| +|BOOKIE_STOP_TIMEOUT|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| + + +### `auto-recovery` +Runs an auto-recovery service + +Usage + +```bash + +$ bookkeeper autorecovery options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery|| + + +### `bookie` +Starts up a BookKeeper server (aka bookie) + +Usage + +```bash + +$ bookkeeper bookie options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the bookie|| +|`-readOnly`|Force start a read-only bookie server|false| +|`-withAutoRecovery`|Start the bookie server with the auto-recovery service|false| + + +### `localbookie` +Runs a test ensemble of N bookies locally + +Usage + +```bash + +$ bookkeeper localbookie N + +``` + +### `upgrade` +Upgrade the bookie’s filesystem + +Usage + +```bash + +$ bookkeeper upgrade options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the bookie|| +|`-u`, `--upgrade`|Upgrade the bookie’s directories|| + + +### `shell` +Run shell for admin commands. To see a full listing of those commands, run bookkeeper shell without an argument. + +Usage + +```bash + +$ bookkeeper shell + +``` + +Example + +```bash + +$ bookkeeper shell bookiesanity + +``` + +## `broker-tool` + +The `broker-tool` is used for operations on a specific broker. 
+ +Usage + +```bash + +$ broker-tool command + +``` + +Commands +* `load-report` +* `help` + +Example +Two ways to get more information about a command as below: + +```bash + +$ broker-tool help command +$ broker-tool command --help + +``` + +### `load-report` + +Collect the load report of a specific broker. +The command is run on a broker, and used for troubleshooting why broker can’t collect right load report. + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--interval`| Interval to collect load report, in milliseconds || +|`-h`, `--help`| Display help information || + diff --git a/site2/website/versioned_docs/version-2.8.x/reference-configuration.md b/site2/website/versioned_docs/version-2.8.x/reference-configuration.md new file mode 100644 index 0000000000000..25bb33296ca35 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/reference-configuration.md @@ -0,0 +1,828 @@ +--- +id: reference-configuration +title: Pulsar configuration +sidebar_label: "Pulsar configuration" +original_id: reference-configuration +--- + + + + +You can manage Pulsar configuration by configuration files in the [`conf`](https://github.com/apache/pulsar/tree/master/conf) directory of a Pulsar [installation](getting-started-standalone.md). + +- [BookKeeper](#bookkeeper) +- [Broker](#broker) +- [Client](#client) +- [Service discovery](#service-discovery) +- [Log4j](#log4j) +- [Log4j shell](#log4j-shell) +- [Standalone](#standalone) +- [WebSocket](#websocket) +- [Pulsar proxy](#pulsar-proxy) +- [ZooKeeper](#zookeeper) + +## BookKeeper + +BookKeeper is a replicated log storage system that Pulsar uses for durable storage of all messages. + + +|Name|Description|Default| +|---|---|---| +|bookiePort|The port on which the bookie server listens.|3181| +|allowLoopback|Whether the bookie is allowed to use a loopback interface as its primary interface (that is the interface used to establish its identity). 
By default, loopback interfaces are not allowed to work as the primary interface. Using a loopback interface as the primary interface usually indicates a configuration error. For example, it’s fairly common in some VPS setups to not configure a hostname or to have the hostname resolve to `127.0.0.1`. If this is the case, then all bookies in the cluster will establish their identities as `127.0.0.1:3181` and only one will be able to join the cluster. For VPSs configured like this, you should explicitly set the listening interface.|false| +|listeningInterface|The network interface on which the bookie listens. By default, the bookie listens on all interfaces.|eth0| +|advertisedAddress|Configure a specific hostname or IP address that the bookie should use to advertise itself to clients. By default, the bookie advertises either its own IP address or hostname according to the `listeningInterface` and `useHostNameAsBookieID` settings.|N/A| +|allowMultipleDirsUnderSameDiskPartition|Configure the bookie to enable/disable multiple ledger/index/journal directories in the same filesystem disk partition.|false| +|minUsableSizeForIndexFileCreation|The minimum safe usable size available in index directory for bookie to create index files while replaying journal at the time of bookie starts in Readonly Mode (in bytes).|1073741824| +|journalDirectory|The directory where BookKeeper outputs its write-ahead log (WAL).|data/bookkeeper/journal| +|journalDirectories|Directories that BookKeeper outputs its write ahead log. Multiple directories are available, being separated by `,`. For example: `journalDirectories=/tmp/bk-journal1,/tmp/bk-journal2`. If `journalDirectories` is set, the bookies skip `journalDirectory` and use this setting directory.|/tmp/bk-journal| +|ledgerDirectories|The directory where BookKeeper outputs ledger snapshots. This could define multiple directories to store snapshots separated by `,`, for example `ledgerDirectories=/tmp/bk1-data,/tmp/bk2-data`. 
Ideally, ledger dirs and the journal dir are each on a different device, which reduces the contention between random I/O and sequential write. It is possible to run with a single disk, but performance will be significantly lower.|data/bookkeeper/ledgers| +|ledgerManagerType|The type of ledger manager used to manage how ledgers are stored, managed, and garbage collected. See [BookKeeper Internals](http://bookkeeper.apache.org/docs/latest/getting-started/concepts) for more info.|hierarchical| +|zkLedgersRootPath|The root ZooKeeper path used to store ledger metadata. This parameter is used by the ZooKeeper-based ledger manager as a root znode to store all ledgers.|/ledgers| +|ledgerStorageClass|Ledger storage implementation class|org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage| +|entryLogFilePreallocationEnabled|Enable or disable entry logger preallocation|true| +|logSizeLimit|Max file size of the entry logger, in bytes. A new entry log file will be created when the old one reaches the file size limitation.|2147483648| +|minorCompactionThreshold|Threshold of minor compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a minor compaction. If set to less than zero, the minor compaction is disabled.|0.2| +|minorCompactionInterval|Time interval to run minor compaction, in seconds. If set to less than zero, the minor compaction is disabled. Note: should be greater than gcWaitTime. |3600| +|majorCompactionThreshold|The threshold of major compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a major compaction. Those entry log files whose remaining size percentage is still higher than the threshold will never be compacted. If set to less than zero, the major compaction is disabled.|0.5| +|majorCompactionInterval|The time interval to run major compaction, in seconds. If set to less than zero, the major compaction is disabled. 
Note: should be greater than gcWaitTime. |86400| +|readOnlyModeEnabled|If `readOnlyModeEnabled=true`, then on all full ledger disks, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shutdown.|true| +|forceReadOnlyBookie|Whether the bookie is force started in read only mode.|false| +|persistBookieStatusEnabled|Persist the bookie status locally on the disks. So the bookies can keep their status upon restarts.|false| +|compactionMaxOutstandingRequests|Sets the maximum number of entries that can be compacted without flushing. When compacting, the entries are written to the entrylog and the new offsets are cached in memory. Once the entrylog is flushed the index is updated with the new offsets. This parameter controls the number of entries added to the entrylog before a flush is forced. A higher value for this parameter means more memory will be used for offsets. Each offset consists of 3 longs. This parameter should not be modified unless you’re fully aware of the consequences.|100000| +|compactionRate|The rate at which compaction will read entries, in adds per second.|1000| +|isThrottleByBytes|Throttle compaction by bytes or by entries.|false| +|compactionRateByEntries|The rate at which compaction will read entries, in adds per second.|1000| +|compactionRateByBytes|Set the rate at which compaction reads entries. The unit is bytes added per second.|1000000| +|journalMaxSizeMB|Max file size of journal file, in megabytes. A new journal file will be created when the old one reaches the file size limitation.|2048| +|journalMaxBackups|The max number of old journal files to keep. 
Keeping a number of old journal files would help data recovery in special cases.|5| +|journalPreAllocSizeMB|How much space to pre-allocate at a time in the journal.|16| +|journalWriteBufferSizeKB|The size of the write buffers used for the journal.|64| +|journalRemoveFromPageCache|Whether pages should be removed from the page cache after force write.|true| +|journalAdaptiveGroupWrites|Whether to group journal force writes, which optimizes group commit for higher throughput.|true| +|journalMaxGroupWaitMSec|The maximum latency to impose on a journal write to achieve grouping.|1| +|journalAlignmentSize|All the journal writes and commits should be aligned to given size|4096| +|journalBufferedWritesThreshold|Maximum writes to buffer to achieve grouping|524288| +|journalFlushWhenQueueEmpty|If we should flush the journal when journal queue is empty|false| +|numJournalCallbackThreads|The number of threads that should handle journal callbacks|8| +|openLedgerRereplicationGracePeriod | The grace period, in milliseconds, that the replication worker waits before fencing and replicating a ledger fragment that's still being written to upon bookie failure. | 30000 | +|rereplicationEntryBatchSize|The number of max entries to keep in fragment for re-replication|100| +|autoRecoveryDaemonEnabled|Whether the bookie itself can start auto-recovery service.|true| +|lostBookieRecoveryDelay|How long to wait, in seconds, before starting auto recovery of a lost bookie.|0| +|gcWaitTime|How long the interval to trigger next garbage collection, in milliseconds. Since garbage collection is running in background, too frequent gc will hurt performance. It is better to give a higher number of gc interval if there is enough disk capacity.|900000| +|gcOverreplicatedLedgerWaitTime|How long the interval to trigger next garbage collection of overreplicated ledgers, in milliseconds. 
This should not be run very frequently since we read the metadata for all the ledgers on the bookie from zk.|86400000| +|flushInterval|How long the interval to flush ledger index pages to disk, in milliseconds. Flushing index files will introduce much random disk I/O. If separating journal dir and ledger dirs each on different devices, flushing would not affect performance. But if putting journal dir and ledger dirs on same device, performance degrade significantly on too frequent flushing. You can consider increment flush interval to get better performance, but you need to pay more time on bookie server restart after failure.|60000| +|bookieDeathWatchInterval|Interval to watch whether bookie is dead or not, in milliseconds|1000| +|allowStorageExpansion|Allow the bookie storage to expand. Newly added ledger and index dirs must be empty.|false| +|zkServers|A list of one of more servers on which zookeeper is running. The server list can be comma separated values, for example: zkServers=zk1:2181,zk2:2181,zk3:2181.|localhost:2181| +|zkTimeout|ZooKeeper client session timeout in milliseconds Bookie server will exit if it received SESSION_EXPIRED because it was partitioned off from ZooKeeper for more than the session timeout JVM garbage collection, disk I/O will cause SESSION_EXPIRED. Increment this value could help avoiding this issue|30000| +|zkRetryBackoffStartMs|The start time that the Zookeeper client backoff retries in milliseconds.|1000| +|zkRetryBackoffMaxMs|The maximum time that the Zookeeper client backoff retries in milliseconds.|10000| +|zkEnableSecurity|Set ACLs on every node written on ZooKeeper, allowing users to read and write BookKeeper metadata stored on ZooKeeper. In order to make ACLs work you need to setup ZooKeeper JAAS authentication. All the bookies and Client need to share the same user, and this is usually done using Kerberos authentication. 
See ZooKeeper documentation.|false| +|httpServerEnabled|The flag enables/disables starting the admin http server.|false| +|httpServerPort|The HTTP server port to listen on. By default, the value is `8080`. If you want to keep it consistent with the Prometheus stats provider, you can set it to `8000`.|8080| +|httpServerClass|The http server class.|org.apache.bookkeeper.http.vertx.VertxHttpServer| +|serverTcpNoDelay|This setting is used to enable/disable Nagle’s algorithm, which is a means of improving the efficiency of TCP/IP networks by reducing the number of packets that need to be sent over the network. If you are sending many small messages, such that more than one can fit in a single IP packet, setting server.tcpnodelay to false to enable Nagle algorithm can provide better performance.|true| +|serverSockKeepalive|This setting is used to send keep-alive messages on connection-oriented sockets.|true| +|serverTcpLinger|The socket linger timeout on close. When enabled, a close or shutdown will not return until all queued messages for the socket have been successfully sent or the linger timeout has been reached. Otherwise, the call returns immediately and the closing is done in the background.|0| +|byteBufAllocatorSizeMax|The maximum buf size of the received ByteBuf allocator.|1048576| +|nettyMaxFrameSizeBytes|The maximum netty frame size in bytes. Any message received larger than this will be rejected.|5253120| +|openFileLimit|Max number of ledger index files that could be opened in bookie server. If the number of ledger index files reaches this limitation, bookie server starts to swap some ledgers from memory to disk. Too frequent swap will affect performance. You can tune this number to gain performance according to your requirements.|0| +|pageSize|Size of an index page in ledger cache, in bytes. A larger index page can improve performance writing page to disk, which is efficient when you have small number of ledgers and these ledgers have similar number of entries. 
If you have large number of ledgers and each ledger has fewer entries, smaller index page would improve memory usage.|8192| +|pageLimit|How many index pages provided in ledger cache If number of index pages reaches this limitation, bookie server starts to swap some ledgers from memory to disk. You can increment this value when you found swap became more frequent. But make sure pageLimit*pageSize should not more than JVM max memory limitation, otherwise you would got OutOfMemoryException. In general, incrementing pageLimit, using smaller index page would gain better performance in lager number of ledgers with fewer entries case If pageLimit is -1, bookie server will use 1/3 of JVM memory to compute the limitation of number of index pages.|0| +|readOnlyModeEnabled|If all ledger directories configured are full, then support only read requests for clients. If “readOnlyModeEnabled=true” then on all ledger disks full, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shutdown. By default this will be disabled.|true| +|diskUsageThreshold|For each ledger dir, maximum disk space which can be used. Default is 0.95f. i.e. 95% of disk can be used at most after which nothing will be written to that partition. If all ledger dir partitions are full, then bookie will turn to readonly mode if ‘readOnlyModeEnabled=true’ is set, else it will shutdown. Valid values should be in between 0 and 1 (exclusive).|0.95| +|diskCheckInterval|Disk check interval in milli seconds, interval to check the ledger dirs usage.|10000| +|auditorPeriodicCheckInterval|Interval at which the auditor will do a check of all ledgers in the cluster. By default this runs once a week. The interval is set in seconds. To disable the periodic check completely, set this to 0. 
Note that periodic checking will put extra load on the cluster, so it should not be run more frequently than once a day.|604800| +|sortedLedgerStorageEnabled|Whether sorted-ledger storage is enabled.|true| +|auditorPeriodicBookieCheckInterval|The interval between auditor bookie checks. The auditor bookie check checks ledger metadata to see which bookies should contain entries for each ledger. If a bookie which should contain entries is unavailable, then the ledger containing that entry is marked for recovery. Setting this to 0 disables the periodic check. Bookie checks will still run when a bookie fails. The interval is specified in seconds.|86400| +|numAddWorkerThreads|The number of threads that should handle write requests. If zero, the writes would be handled by netty threads directly.|0| +|numReadWorkerThreads|The number of threads that should handle read requests. If zero, the reads would be handled by netty threads directly.|8| +|numHighPriorityWorkerThreads|The number of threads that should be used for high priority requests (i.e. recovery reads and adds, and fencing).|8| +|maxPendingReadRequestsPerThread|If read workers threads are enabled, limit the number of pending requests, to avoid the executor queue to grow indefinitely.|2500| +|maxPendingAddRequestsPerThread|The limited number of pending requests, which is used to avoid the executor queue to grow indefinitely when add workers threads are enabled.|10000| +|isForceGCAllowWhenNoSpace|Whether force compaction is allowed when the disk is full or almost full. Forcing GC could get some space back, but could also fill up the disk space more quickly. This is because new log files are created before GC, while old garbage log files are deleted after GC.|false| +|verifyMetadataOnGC|True if the bookie should double check `readMetadata` prior to GC.|false| +|flushEntrylogBytes|Entry log flush interval in bytes. Flushing in smaller chunks but more frequently reduces spikes in disk I/O. 
Flushing too frequently may also affect performance negatively.|268435456| +|readBufferSizeBytes|The number of bytes we should use as capacity for BufferedReadChannel.|4096| +|writeBufferSizeBytes|The number of bytes used as capacity for the write buffer|65536| +|useHostNameAsBookieID|Whether the bookie should use its hostname to register with the coordination service (e.g.: zookeeper service). When false, bookie will use its ip address for the registration.|false| +|bookieId | If you want to custom a bookie ID or use a dynamic network address for the bookie, you can set the `bookieId`.

    Bookie advertises itself using the `bookieId` rather than the `BookieSocketAddress` (`hostname:port` or `IP:port`). If you set the `bookieId`, then the `useHostNameAsBookieID` does not take effect.

    The `bookieId` is a non-empty string that can contain ASCII digits and letters ([a-zA-Z0-9]), colons, dashes, and dots.

    For more information about `bookieId`, see [here](http://bookkeeper.apache.org/bps/BP-41-bookieid/).|N/A| +|allowEphemeralPorts|Whether the bookie is allowed to use an ephemeral port (port 0) as its server port. By default, an ephemeral port is not allowed. Using an ephemeral port as the service port usually indicates a configuration error. However, in unit tests, using an ephemeral port will address port conflict problems and allow running tests in parallel.|false| +|enableLocalTransport|Whether the bookie is allowed to listen for the BookKeeper clients executed on the local JVM.|false| +|disableServerSocketBind|Whether the bookie is allowed to disable bind on network interfaces. This bookie will be available only to BookKeeper clients executed on the local JVM.|false| +|skipListArenaChunkSize|The number of bytes that we should use as chunk allocation for `org.apache.bookkeeper.bookie.SkipListArena`.|4194304| +|skipListArenaMaxAllocSize|The maximum size that we should allocate from the skiplist arena. Allocations larger than this should be allocated directly by the VM to avoid fragmentation.|131072| +|bookieAuthProviderFactoryClass|The factory class name of the bookie authentication provider. If this is null, then there is no authentication.|null| +|statsProviderClass||org.apache.bookkeeper.stats.prometheus.PrometheusMetricsProvider| +|prometheusStatsHttpPort||8000| +|dbStorage_writeCacheMaxSizeMb|Size of Write Cache. Memory is allocated from JVM direct memory. Write cache is used to buffer entries before flushing into the entry log. For good performance, it should be big enough to hold a substantial amount of entries in the flush interval.|25% of direct memory| +|dbStorage_readAheadCacheMaxSizeMb|Size of Read cache. Memory is allocated from JVM direct memory. This read cache is pre-filled doing read-ahead whenever a cache miss happens. 
By default, it is allocated to 25% of the available direct memory.|N/A| +|dbStorage_readAheadCacheBatchSize|How many entries to pre-fill in cache after a read cache miss|1000| +|dbStorage_rocksDB_blockCacheSize|Size of RocksDB block-cache. For best performance, this cache should be big enough to hold a significant portion of the index database which can reach ~2GB in some cases. By default, it uses 10% of direct memory.|N/A| +|dbStorage_rocksDB_writeBufferSizeMB||64| +|dbStorage_rocksDB_sstSizeInMB||64| +|dbStorage_rocksDB_blockSize||65536| +|dbStorage_rocksDB_bloomFilterBitsPerKey||10| +|dbStorage_rocksDB_numLevels||-1| +|dbStorage_rocksDB_numFilesInLevel0||4| +|dbStorage_rocksDB_maxSizeInLevel1MB||256| + +## Broker + +Pulsar brokers are responsible for handling incoming messages from producers, dispatching messages to consumers, replicating data between clusters, and more. + +|Name|Description|Default| +|---|---|---| +|advertisedListeners|Specify multiple advertised listeners for the broker.

    The format is `<listener_name>:pulsar://<host>:<port>`.

    If there are multiple listeners, separate them with commas.

    **Note**: do not use this configuration with `advertisedAddress` and `brokerServicePort`. If the value of this configuration is empty, the broker uses `advertisedAddress` and `brokerServicePort`|/| +|internalListenerName|Specify the internal listener name for the broker.

    **Note**: the listener name must be contained in `advertisedListeners`.

    If the value of this configuration is empty, the broker uses the first listener as the internal listener.|/| +|authenticateOriginalAuthData| If this flag is set to `true`, the broker authenticates the original Auth data; else it just accepts the originalPrincipal and authorizes it (if required). |false| +|enablePersistentTopics| Whether persistent topics are enabled on the broker |true| +|enableNonPersistentTopics| Whether non-persistent topics are enabled on the broker |true| +|functionsWorkerEnabled| Whether the Pulsar Functions worker service is enabled in the broker |false| +|exposePublisherStats|Whether to enable topic level metrics.|true| +|statsUpdateFrequencyInSecs||60| +|statsUpdateInitialDelayInSecs||60| +|zookeeperServers| Zookeeper quorum connection string || +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| Broker data port |6650| +|brokerServicePortTls| Broker data port for TLS |6651| +|webServicePort| Port to use to server HTTP request |8080| +|webServicePortTls| Port to use to server HTTPS request |8443| +|webSocketServiceEnabled| Enable the WebSocket API service in broker |false| +|webSocketNumIoThreads|The number of IO threads in Pulsar Client used in WebSocket proxy.|8| +|webSocketConnectionsPerBroker|The number of connections per Broker in Pulsar Client used in WebSocket proxy.|8| +|webSocketSessionIdleTimeoutMillis|Time in milliseconds that idle WebSocket session times out.|300000| +|webSocketMaxTextFrameSize|The maximum size of a text message during parsing in WebSocket proxy.|1048576| +|exposeTopicLevelMetricsInPrometheus|Whether to enable topic level metrics.|true| +|exposeConsumerLevelMetricsInPrometheus|Whether to enable consumer level metrics.|false| +|jvmGCMetricsLoggerClassName|Classname of Pluggable JVM GC metrics logger that can log GC specific metrics.|N/A| +|bindAddress| Hostname or IP 
address the service binds on, default is 0.0.0.0. |0.0.0.0| +|advertisedAddress| Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. || +|clusterName| Name of the cluster to which this broker belongs to || +|maxTenants|The maximum number of tenants that can be created in each Pulsar cluster. When the number of tenants reaches the threshold, the broker rejects the request of creating a new tenant. The default value 0 disables the check. |0| +| maxNamespacesPerTenant | The maximum number of namespaces that can be created in each tenant. When the number of namespaces reaches this threshold, the broker rejects the request of creating a new tenant. The default value 0 disables the check. |0| +|brokerDeduplicationEnabled| Sets the default behavior for message deduplication in the broker. If enabled, the broker will reject messages that were already stored in the topic. This setting can be overridden on a per-namespace basis. |false| +|brokerDeduplicationMaxNumberOfProducers| The maximum number of producers for which information will be stored for deduplication purposes. |10000| +|brokerDeduplicationEntriesInterval| The number of entries after which a deduplication informational snapshot is taken. A larger interval will lead to fewer snapshots being taken, though this would also lengthen the topic recovery time (the time required for entries published after the snapshot to be replayed). |1000| +|brokerDeduplicationSnapshotIntervalSeconds| The time period after which a deduplication informational snapshot is taken. It runs simultaneously with `brokerDeduplicationEntriesInterval`. |120| +|brokerDeduplicationProducerInactivityTimeoutMinutes| The time of inactivity (in minutes) after which the broker will discard deduplication information related to a disconnected producer. 
|360| +|dispatchThrottlingRatePerReplicatorInMsg| The default messages per second dispatch throttling-limit for every replicator in replication. The value of `0` means disabling replication message dispatch-throttling| 0 | +|dispatchThrottlingRatePerReplicatorInByte| The default bytes per second dispatch throttling-limit for every replicator in replication. The value of `0` means disabling replication message-byte dispatch-throttling| 0 | +|zooKeeperSessionTimeoutMillis| Zookeeper session timeout in milliseconds |30000| +|brokerShutdownTimeoutMs| Time to wait for broker graceful shutdown. After this time elapses, the process will be killed |60000| +|skipBrokerShutdownOnOOM| Flag to skip broker shutdown when broker handles Out of memory error. |false| +|backlogQuotaCheckEnabled| Enable backlog quota check. Enforces action on topic when the quota is reached |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the quota |60| +|backlogQuotaDefaultLimitGB| The default per-topic backlog quota limit. Being less than 0 means no limitation. By default, it is -1. | -1 | +|backlogQuotaDefaultRetentionPolicy|The defaulted backlog quota retention policy. By Default, it is `producer_request_hold`.
  • 'producer_request_hold' Policy which holds producer's send request until the resource becomes available (or holding times out)
  • 'producer_exception' Policy which throws `javax.jms.ResourceAllocationException` to the producer
  • 'consumer_backlog_eviction' Policy which evicts the oldest message from the slowest consumer's backlog
  • |producer_request_hold| +|allowAutoTopicCreation| Enable topic auto creation if a new producer or consumer connected |true| +|allowAutoTopicCreationType| The type of topic that is allowed to be automatically created.(partitioned/non-partitioned) |non-partitioned| +|allowAutoSubscriptionCreation| Enable subscription auto creation if a new consumer connected |true| +|defaultNumPartitions| The number of partitioned topics that is allowed to be automatically created if `allowAutoTopicCreationType` is partitioned |1| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. If topics are not consumed for some while, these inactive topics might be cleaned up. Deleting inactive topics is enabled by default. The default period is 1 minute.
    **Note:** When `brokerDeleteInactiveTopicsEnabled` is set to `true`, you need to ensure that `allowAutoTopicCreation` is also set to `true`. |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics |60| +| brokerDeleteInactiveTopicsMode | Set the mode to delete inactive topics.
  • `delete_when_no_subscriptions`: delete the topic which has no subscriptions or active producers.
  • `delete_when_subscriptions_caught_up`: delete the topic whose subscriptions have no backlogs and which has no active producers or consumers.
  • | `delete_when_no_subscriptions` | +| brokerDeleteInactiveTopicsMaxInactiveDurationSeconds | Set the maximum duration for inactive topics. If it is not specified, the `brokerDeleteInactiveTopicsFrequencySeconds` parameter is adopted. | N/A | +|forceDeleteTenantAllowed| Enable you to delete a tenant forcefully. |false| +|forceDeleteNamespaceAllowed| Enable you to delete a namespace forcefully. |false| +|messageExpiryCheckIntervalInMinutes| The frequency of proactively checking and purging expired messages. |5| +|brokerServiceCompactionMonitorIntervalInSeconds| Interval between checks to determine whether topics with compaction policies need compaction. |60| +|brokerServiceCompactionThresholdInBytes|If the estimated backlog size is greater than this threshold, compaction is triggered.

    Setting this threshold to 0 disables the compaction check.|N/A| +|delayedDeliveryEnabled| Whether to enable the delayed delivery for messages. If disabled, messages will be immediately delivered and there will be no tracking overhead.|true| +|delayedDeliveryTickTimeMillis|Control the tick time for retrying on delayed delivery, which affects the accuracy of the delivery time compared to the scheduled time. By default, it is 1 second.|1000| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +|clientLibraryVersionCheckEnabled| Enable check for minimum allowed client library version |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| Path for the file used to determine the rotation status for the broker when responding to service discovery health checks || +|preferLaterVersions| If true, (and ModularLoadManagerImpl is being used), the load manager will attempt to use only brokers running the latest software version (to minimize impact to bundles) |false| +|maxNumPartitionsPerPartitionedTopic|Max number of partitions per partitioned topic. Use 0 or negative number to disable the check|0| +| maxSubscriptionsPerTopic | Maximum number of subscriptions allowed to subscribe to a topic. Once this limit reaches, the broker rejects new subscriptions until the number of subscriptions decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxProducersPerTopic | Maximum number of producers allowed to connect to a topic. Once this limit reaches, the broker rejects new producers until the number of connected producers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerTopic | Maximum number of consumers allowed to connect to a topic. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. 
When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerSubscription | Maximum number of consumers allowed to connect to a subscription. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +|tlsEnabled|Deprecated - Use `webServicePortTls` and `brokerServicePortTls` instead. |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate file. This cert is used to verify that any certs presented by connecting clients are signed by a certificate authority. If this verification fails, then the certs are untrusted and the connections are dropped. || +|tlsAllowInsecureConnection| Accept untrusted TLS certificate from client. If it is set to `true`, a client with a cert which cannot be verified with the 'tlsTrustCertsFilePath' cert will be allowed to connect to the server, though the cert will not be used for client authentication. |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.3```, ```TLSv1.2``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. 
Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +|tlsEnabledWithKeyStore| Enable TLS with KeyStore type configuration in broker |false| +|tlsProvider| TLS Provider for KeyStore type || +|tlsKeyStoreType| LS KeyStore type configuration in broker: JKS, PKCS12 |JKS| +|tlsKeyStore| TLS KeyStore path in broker || +|tlsKeyStorePassword| TLS KeyStore password for broker || +|brokerClientTlsEnabledWithKeyStore| Whether internal client use KeyStore type to authenticate with Pulsar brokers |false| +|brokerClientSslProvider| The TLS Provider used by internal client to authenticate with other Pulsar brokers || +|brokerClientTlsTrustStoreType| TLS TrustStore type configuration for internal client: JKS, PKCS12, used by the internal client to authenticate with Pulsar brokers |JKS| +|brokerClientTlsTrustStore| TLS TrustStore path for internal client, used by the internal client to authenticate with Pulsar brokers || +|brokerClientTlsTrustStorePassword| TLS TrustStore password for internal client, used by the internal client to authenticate with Pulsar brokers || +|brokerClientTlsCiphers| Specify the tls cipher the internal client will use to negotiate during TLS Handshake. (a comma-separated list of ciphers) e.g. [TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256]|| +|brokerClientTlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS handshake. (a comma-separated list of protocol names). e.g. `TLSv1.3`, `TLSv1.2` || +|ttlDurationDefaultInSeconds|The default Time to Live (TTL) for namespaces if the TTL is not configured at namespace policies. When the value is set to `0`, TTL is disabled. By default, TTL is disabled. |0| +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. 
The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicAlg| Configure the algorithm to be used to validate auth tokens. This can be any of the asymettric algorithms supported by Java JWT (https://github.com/jwtk/jjwt#signature-algorithms-keys) |RS256| +|tokenAuthClaim| Specify which of the token's claims will be used as the authentication "principal" or "role". The default "sub" claim will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud", that will be used to get the audience from token. If not set, audience will not be verified. || +|tokenAudience| The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token, need contains this. || +|maxUnackedMessagesPerConsumer| Max number of unacknowledged messages allowed to receive messages by a consumer on a shared subscription. Broker will stop sending messages to consumer once, this limit reaches until consumer starts acknowledging messages back. Using a value of 0, is disabling unackeMessage limit check and consumer can receive messages without any restriction |50000| +|maxUnackedMessagesPerSubscription| Max number of unacknowledged messages allowed per shared subscription. Broker will stop dispatching messages to all consumers of the subscription once this limit reaches until consumer starts acknowledging messages back and unack count reaches to limit/2. Using a value of 0, is disabling unackedMessage-limit check and dispatcher can dispatch messages without any restriction |200000| +|subscriptionRedeliveryTrackerEnabled| Enable subscription message redelivery tracker |true| +|subscriptionExpirationTimeMinutes | How long to delete inactive subscriptions from last consuming.

    Setting this configuration to a value **greater than 0** deletes inactive subscriptions automatically.
    Setting this configuration to **0** does not delete inactive subscriptions automatically.

    Since this configuration takes effect on all topics, if there is even one topic whose subscriptions should not be deleted automatically, you need to set it to 0.
    Instead, you can set a subscription expiration time for each **namespace** using the [`pulsar-admin namespaces set-subscription-expiration-time options` command](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-subscription-expiration-time-em-). | 0 | +|maxConcurrentLookupRequest| Max number of concurrent lookup request broker allows to throttle heavy incoming lookup traffic |50000| +|maxConcurrentTopicLoadRequest| Max number of concurrent topic loading request broker allows to control number of zk-operations |5000| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is comma separated list of class names || +| authenticationRefreshCheckSeconds | Interval of time for checking for expired authentication credentials | 60 | +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics || +|brokerClientAuthenticationPlugin| Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters || +|brokerClientAuthenticationParameters||| +|athenzDomainNames| Supported Athenz provider domain names(comma separated) for authentication || +|exposePreciseBacklogInPrometheus| Enable expose the precise backlog stats, set false to use published counter and consumed counter to calculate, this would be more efficient but may be inaccurate. |false| +|schemaRegistryStorageClassName|The schema storage implementation used by this broker.|org.apache.pulsar.broker.service.schema.BookkeeperSchemaStorageFactory| +|isSchemaValidationEnforced|Enforce schema validation on following cases: if a producer without a schema attempts to produce to a topic with schema, the producer will be failed to connect. PLEASE be carefully on using this, since non-java clients don't support schema. 
If this setting is enabled, then non-java clients fail to produce.|false| +| isAllowAutoUpdateSchemaEnabled | Allow schema to be auto updated at broker level. User can override this by 'is_allow_auto_update_schema' of namespace policy.
    **Note:** This configuration is only available in 2.8.3 and later versions. |true| +| topicFencingTimeoutSeconds | If a topic remains fenced for a certain time period (in seconds), it is closed forcefully. If set to 0 or a negative number, the fenced topic is not closed. | 0 | +|offloadersDirectory|The directory for all the offloader implementations.|./offloaders| +|bookkeeperMetadataServiceUri| Metadata service uri that bookkeeper is used for loading corresponding metadata driver and resolving its metadata service location. This value can be fetched using `bookkeeper shell whatisinstanceid` command in BookKeeper cluster. For example: zk+hierarchical://localhost:2181/ledgers. The metadata service uri list can also be semicolon separated values like below: zk+hierarchical://zk1:2181;zk2:2181;zk3:2181/ledgers || +|bookkeeperClientAuthenticationPlugin| Authentication plugin to use when connecting to bookies || +|bookkeeperClientAuthenticationParametersName| BookKeeper auth plugin implementation specifics parameters name and values || +|bookkeeperClientAuthenticationParameters||| +|bookkeeperClientNumWorkerThreads| Number of BookKeeper client worker threads. Default is Runtime.getRuntime().availableProcessors() || +|bookkeeperClientTimeoutInSeconds| Timeout for BK add / read operations |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time Using a value of 0, is disabling the speculative reads |0| +|bookkeeperNumberOfChannelsPerBookie| Number of channels per bookie |16| +|bookkeeperClientHealthCheckEnabled| Enable bookies health check. Bookies that have more than the configured number of failure within the interval will be quarantined for some time. 
During this period, new ledgers won’t be created on these bookies |true| +|bookkeeperClientHealthCheckIntervalSeconds||60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval||5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds ||1800| +|bookkeeperClientRackawarePolicyEnabled| Enable rack-aware bookie selection policy. BK will chose bookies from different racks when forming a new bookie ensemble |true| +|bookkeeperClientRegionawarePolicyEnabled| Enable region-aware bookie selection policy. BK will chose bookies from different regions and racks when forming a new bookie ensemble. If enabled, the value of bookkeeperClientRackawarePolicyEnabled is ignored |false| +|bookkeeperClientMinNumRacksPerWriteQuorum| Minimum number of racks per write quorum. BK rack-aware bookie selection policy will try to get bookies from at least 'bookkeeperClientMinNumRacksPerWriteQuorum' racks for a write quorum. |2| +|bookkeeperClientEnforceMinNumRacksPerWriteQuorum| Enforces rack-aware bookie selection policy to pick bookies from 'bookkeeperClientMinNumRacksPerWriteQuorum' racks for a writeQuorum. If BK can't find bookie then it would throw BKNotEnoughBookiesException instead of picking random one. |false| +|bookkeeperClientReorderReadSequenceEnabled| Enable/disable reordering read sequence on reading entries. |false| +|bookkeeperClientIsolationGroups| Enable bookie isolation by specifying a list of bookie groups to choose from. Any bookie outside the specified groups will not be used by the broker || +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. || +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. 
|| +|bookkeeperClientGetBookieInfoIntervalSeconds| Set the interval to periodically check bookie info |86400| +|bookkeeperClientGetBookieInfoRetryIntervalSeconds| Set the interval to retry a failed bookie info lookup |60| +|bookkeeperEnableStickyReads | Enable/disable having read operations for a ledger to be sticky to a single bookie. If this flag is enabled, the client will use one single bookie (by preference) to read all entries for a ledger. | true | +|managedLedgerDefaultEnsembleSize| Number of bookies to use when creating a ledger |2| +|managedLedgerDefaultWriteQuorum| Number of copies to store for each message |2| +|managedLedgerDefaultAckQuorum| Number of guaranteed copies (acks to wait before write is complete) |2| +|managedLedgerCacheSizeMB| Amount of memory to use for caching data payload in managed ledger. This memory is allocated from JVM direct memory and it’s shared across all the topics running in the same broker. By default, uses 1/5th of available direct memory || +|managedLedgerCacheCopyEntries| Whether we should make a copy of the entry payloads when inserting in cache| false| +|managedLedgerCacheEvictionWatermark| Threshold to which bring down the cache level when eviction is triggered |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. | 1000| +|managedLedgerDefaultMarkDeleteRateLimit| Rate limit the amount of writes per second generated by consumer acking the messages |1.0| +|managedLedgerMaxEntriesPerLedger| The max number of entries to append to a ledger before triggering a rollover. 
A ledger rollover is triggered after the min rollover time has passed and one of the following conditions is true:
    • The max rollover time has been reached
    • The max entries have been written to the ledger
    • The max ledger size has been written to the ledger
    |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| Minimum time between ledger rollover for a topic |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| Maximum time before forcing a ledger rollover for a topic |240| +|managedLedgerCursorMaxEntriesPerLedger| Max number of entries to append to a cursor ledger |50000| +|managedLedgerCursorRolloverTimeInSeconds| Max time before triggering a rollover on a cursor ledger |14400| +|managedLedgerMaxUnackedRangesToPersist| Max number of “acknowledgment holes” that are going to be persistently stored. When acknowledging out of order, a consumer will leave holes that are supposed to be quickly filled by acking all the messages. The information of which messages are acknowledged is persisted by compressing in “ranges” of messages that were acknowledged. After the max number of ranges is reached, the information will only be tracked in memory and messages will be redelivered in case of crashes. |1000| +|autoSkipNonRecoverableData| Skip reading non-recoverable/unreadable data-ledger under managed-ledger’s list.It helps when data-ledgers gets corrupted at bookkeeper and managed-cursor is stuck at that ledger. |false| +|loadBalancerEnabled| Enable load balancer |true| +|loadBalancerPlacementStrategy| Strategy to assign a new bundle weightedRandomSelection || +|loadBalancerReportUpdateThresholdPercentage| Percentage of change to trigger load report update |10| +|loadBalancerReportUpdateMaxIntervalMinutes| maximum interval to update load report |15| +|loadBalancerHostUsageCheckIntervalMinutes| Frequency of report to collect |1| +|loadBalancerSheddingIntervalMinutes| Load shedding interval. 
Broker periodically checks whether some traffic should be offloaded from some over-loaded broker to other under-loaded brokers |30| +|loadBalancerSheddingGracePeriodMinutes| Prevent the same topics from being shed and moved to another broker more than once within this timeframe |30| +|loadBalancerBrokerMaxTopics| Usage threshold to allocate max number of topics to broker |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| Usage threshold to determine a broker as under-loaded |1| +|loadBalancerBrokerOverloadedThresholdPercentage| Usage threshold to determine a broker as over-loaded |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| Interval to update namespace bundle resource quota |15| +|loadBalancerBrokerComfortLoadLevelPercentage| Usage threshold to determine a broker is having just right level of load |65| +|loadBalancerAutoBundleSplitEnabled| enable/disable namespace bundle auto split |false| +|loadBalancerNamespaceBundleMaxTopics| maximum topics in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxSessions| maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxMsgRate| maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered |100| +|loadBalancerNamespaceMaximumBundles| maximum number of bundles in a namespace |128| +|replicationMetricsEnabled| Enable replication metrics |true| +|replicationConnectionsPerBroker| Max number of connections to open for each broker in a remote cluster. More connections host-to-host lead to better throughput over high-latency links.
|16| +|replicationProducerQueueSize| Replicator producer queue size |1000| +|replicatorPrefix| Replicator prefix used for replicator producer name and cursor name pulsar.repl|| +|replicationTlsEnabled| Enable TLS when talking with other clusters to replicate messages |false| +|brokerServicePurgeInactiveFrequencyInSeconds|Deprecated. Use `brokerDeleteInactiveTopicsFrequencySeconds`.|60| +|transactionCoordinatorEnabled|Whether to enable transaction coordinator in broker.|true| +|transactionMetadataStoreProviderClassName| |org.apache.pulsar.transaction.coordinator.impl.InMemTransactionMetadataStoreProvider| +|defaultRetentionTimeInMinutes| Default message retention time |0| +|defaultRetentionSizeInMB| Default retention size |0| +|keepAliveIntervalSeconds| How often to check whether the connections are still alive |30| +|bootstrapNamespaces| The bootstrap name. | N/A | +|loadManagerClassName| Name of load manager to use |org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl| +|supportedNamespaceBundleSplitAlgorithms| Supported algorithms name for namespace bundle split |[range_equally_divide,topic_count_equally_divide]| +|defaultNamespaceBundleSplitAlgorithm| Default algorithm name for namespace bundle split |range_equally_divide| +|managedLedgerOffloadDriver| The directory for all the offloader implementations `offloadersDirectory=./offloaders`. Driver to use to offload old data to long term storage (Possible values: S3, aws-s3, google-cloud-storage). When using google-cloud-storage, Make sure both Google Cloud Storage and Google Cloud Storage JSON API are enabled for the project (check from Developers Console -> Api&auth -> APIs). 
|| +|managedLedgerOffloadMaxThreads| Maximum number of thread pool threads for ledger offloading |2| +|managedLedgerOffloadPrefetchRounds|The maximum prefetch rounds for ledger reading for offloading.|1| +|managedLedgerUnackedRangesOpenCacheSetEnabled| Use Open Range-Set to cache unacknowledged messages |true| +|managedLedgerOffloadDeletionLagMs|Delay between a ledger being successfully offloaded to long term storage and the ledger being deleted from bookkeeper | 14400000| +|managedLedgerOffloadAutoTriggerSizeThresholdBytes|The number of bytes before triggering automatic offload to long term storage |-1 (disabled)| +|s3ManagedLedgerOffloadRegion| For Amazon S3 ledger offload, AWS region || +|s3ManagedLedgerOffloadBucket| For Amazon S3 ledger offload, Bucket to place offloaded ledger into || +|s3ManagedLedgerOffloadServiceEndpoint| For Amazon S3 ledger offload, Alternative endpoint to connect to (useful for testing) || +|s3ManagedLedgerOffloadMaxBlockSizeInBytes| For Amazon S3 ledger offload, Max block size in bytes. (64MB by default, 5MB minimum) |67108864| +|s3ManagedLedgerOffloadReadBufferSizeInBytes| For Amazon S3 ledger offload, Read buffer size in bytes (1MB by default) |1048576| +|gcsManagedLedgerOffloadRegion|For Google Cloud Storage ledger offload, region where offload bucket is located. Go to this page for more details: https://cloud.google.com/storage/docs/bucket-locations .|N/A| +|gcsManagedLedgerOffloadBucket|For Google Cloud Storage ledger offload, Bucket to place offloaded ledger into.|N/A| +|gcsManagedLedgerOffloadMaxBlockSizeInBytes|For Google Cloud Storage ledger offload, the maximum block size in bytes. (64MB by default, 5MB minimum)|67108864| +|gcsManagedLedgerOffloadReadBufferSizeInBytes|For Google Cloud Storage ledger offload, Read buffer size in bytes. (1MB by default)|1048576| +|gcsManagedLedgerOffloadServiceAccountKeyFile|For Google Cloud Storage, path to json file containing service account credentials. 
For more details, see the "Service Accounts" section of https://support.google.com/googleapi/answer/6158849 .|N/A| +|fileSystemProfilePath|For File System Storage, file system profile path.|../conf/filesystem_offload_core_site.xml| +|fileSystemURI|For File System Storage, file system uri.|N/A| +|s3ManagedLedgerOffloadRole| For Amazon S3 ledger offload, provide a role to assume before writing to s3 || +|s3ManagedLedgerOffloadRoleSessionName| For Amazon S3 ledger offload, provide a role session name when using a role |pulsar-s3-offload| +| acknowledgmentAtBatchIndexLevelEnabled | Enable or disable the batch index acknowledgement. | false | +|enableReplicatedSubscriptions|Whether to enable tracking of replicated subscriptions state across clusters.|true| +|replicatedSubscriptionsSnapshotFrequencyMillis|The frequency of snapshots for replicated subscriptions tracking.|1000| +|replicatedSubscriptionsSnapshotTimeoutSeconds|The timeout for building a consistent snapshot for tracking replicated subscriptions state.|30| +|replicatedSubscriptionsSnapshotMaxCachedPerSubscription|The maximum number of snapshot to be cached per subscription.|10| +|maxMessagePublishBufferSizeInMB|The maximum memory size for a broker to handle messages that are sent by producers. If the processing message size exceeds this value, the broker stops reading data from the connection. The processing messages refer to the messages that are sent to the broker but the broker has not sent response to the client. Usually the messages are waiting to be written to bookies. It is shared across all the topics running in the same broker. The value `-1` disables the memory limitation. By default, it is 50% of direct memory.|N/A| +|messagePublishBufferCheckIntervalInMillis|Interval between checks to see if message publish buffer size exceeds the maximum. 
Use `0` or negative number to disable the max publish buffer limiting.|100| +|retentionCheckIntervalInSeconds|Check between intervals to see if consumed ledgers need to be trimmed. Use 0 or negative number to disable the check.|120| +| maxMessageSize | Set the maximum size of a message. | 5242880 | +| preciseTopicPublishRateLimiterEnable | Enable precise topic publish rate limiting. | false | +| lazyCursorRecovery | Whether to recover cursors lazily when trying to recover a managed ledger backing a persistent topic. It can improve write availability of topics. The caveat is now when recovered ledger is ready to write we're not sure if all old consumers' last mark delete position(ack position) can be recovered or not. So user can make the trade off or have custom logic in application to checkpoint consumer state.| false | +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +| maxTopicsPerNamespace | The maximum number of persistent topics that can be created in the namespace. When the number of topics reaches this threshold, the broker rejects the request of creating a new topic, including the auto-created topics by the producer or consumer, until the number of connected consumers decreases. The default value 0 disables the check. | 0 | +|subscriptionTypesEnabled| Enable all subscription types, which are exclusive, shared, failover, and key_shared. | Exclusive, Shared, Failover, Key_Shared | +|narExtractionDirectory | The extraction directory of the nar package.
    Available for Protocol Handler, Additional Servlets, Offloaders, Broker Interceptor. | System.getProperty("java.io.tmpdir") | +| managedLedgerInfoCompressionType | Compression type of `ManagedLedgerInfo`.
    **Note:** This configuration is only available in 2.8.1 and later versions.
    Available values are `NONE`, `LZ4`, `ZLIB`, `ZSTD`, and `SNAPPY`.
    If the value is `NONE` or invalid, the `managedLedgerInfo` is not compressed.
    **Note** that after enabling this configuration, if you want to downgrade the broker, you need to change the value to `NONE` and ensure all ledger metadata are saved without compression before starting to downgrade. | NONE | + +#### Configuration override for clients internal to broker + +In 2.8.4 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the broker's Pulsar Clients and Pulsar Admin Clients. These configurations are applied after the hard-coded configuration and before the broker client configurations named above.| +|bookkeeper_| Configure the broker's BookKeeper clients used by managed ledgers and the BookkeeperPackagesStorage bookkeeper client. Takes precedence over most other configuration values.| + +:::note + +When running the function worker within the broker, these prefixed configurations do not apply to any of those clients. You must configure those clients using the `functions_worker.yml` file. + +::: + +## Client + +You can use the [`pulsar-client`](reference-cli-tools.md#pulsar-client) CLI tool to publish messages to and consume messages from Pulsar topics. You can use this tool in place of a client library. + +|Name|Description|Default| +|---|---|---| +|webServiceUrl| The web URL for the cluster. |http://localhost:8080/| +|brokerServiceUrl| The Pulsar protocol URL for the cluster. |pulsar://localhost:6650/| +|authPlugin| The authentication plugin. || +|authParams| The authentication parameters for the cluster, as a comma-separated string. || +|useTls| Whether to enforce the TLS authentication in the cluster. |false| +| tlsAllowInsecureConnection | Allow TLS connections to servers whose certificate cannot be verified to have been signed by a trusted certificate authority. | false | +| tlsEnableHostnameVerification | Whether the server hostname must match the common name of the certificate that is used by the server.
| false | +|tlsTrustCertsFilePath||| +| useKeyStoreTls | Enable TLS with KeyStore type configuration in the broker. | false | +| tlsTrustStoreType | TLS TrustStore type configuration.
  • JKS
  • PKCS12
  • |JKS| +| tlsTrustStore | TLS TrustStore path. | | +| tlsTrustStorePassword | TLS TrustStore password. | | + + +## Service discovery + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| ZooKeeper quorum connection string (comma-separated) || +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|zookeeperSessionTimeoutMs| ZooKeeper session timeout |30000| +|servicePort| Port to use to serve binary-proto requests |6650| +|servicePortTls| Port to use to serve binary-proto-tls requests |6651| +|webServicePort| Port that the discovery service listens on |8080| +|webServicePortTls| Port to use to serve HTTPS requests |8443| +|bindOnLocalhost| Control whether to bind directly on localhost rather than on normal hostname |false| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is a list of class names (comma-separated) || +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics (comma-separated) || +|tlsEnabled| Enable TLS |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || + + + +## Log4j + +You can set the log level and configuration in the [log4j2.yaml](https://github.com/apache/pulsar/blob/d557e0aa286866363bc6261dec87790c055db1b0/conf/log4j2.yaml#L155) file. The following logging configuration parameters are available.
+ +|Name|Default| +|---|---| +|pulsar.root.logger| WARN,CONSOLE| +|pulsar.log.dir| logs| +|pulsar.log.file| pulsar.log| +|log4j.rootLogger| ${pulsar.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ISO8601} - %-5p - [%t:%C{1}@%L] - %m%n| +|log4j.appender.ROLLINGFILE| org.apache.log4j.DailyRollingFileAppender| +|log4j.appender.ROLLINGFILE.Threshold| DEBUG| +|log4j.appender.ROLLINGFILE.File| ${pulsar.log.dir}/${pulsar.log.file}| +|log4j.appender.ROLLINGFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.ROLLINGFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n| +|log4j.appender.TRACEFILE| org.apache.log4j.FileAppender| +|log4j.appender.TRACEFILE.Threshold| TRACE| +|log4j.appender.TRACEFILE.File| pulsar-trace.log| +|log4j.appender.TRACEFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.TRACEFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L][%x] - %m%n| + +:::note + +'topic' in log4j2.appender is configurable. +- If you want to append all logs to a single topic, set the same topic name. +- If you want to append logs to different topics, you can set different topic names. 
+ +::: + +## Log4j shell + +|Name|Default| +|---|---| +|bookkeeper.root.logger| ERROR,CONSOLE| +|log4j.rootLogger| ${bookkeeper.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ABSOLUTE} %-5p %m%n| +|log4j.logger.org.apache.zookeeper| ERROR| +|log4j.logger.org.apache.bookkeeper| ERROR| +|log4j.logger.org.apache.bookkeeper.bookie.BookieShell| INFO| + + +## Standalone + +|Name|Description|Default| +|---|---|---| +|authenticateOriginalAuthData| If this flag is set to `true`, the broker authenticates the original Auth data; else it just accepts the originalPrincipal and authorizes it (if required). |false| +|zookeeperServers| The quorum connection string for local ZooKeeper || +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| The port on which the standalone broker listens for connections |6650| +|webServicePort| The port used by the standalone broker for HTTP requests |8080| +|bindAddress| The hostname or IP address on which the standalone service binds |0.0.0.0| +|advertisedAddress| The hostname or IP address that the standalone service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. || +| numAcceptorThreads | Number of threads to use for Netty Acceptor | 1 | +| numIOThreads | Number of threads to use for Netty IO | 2 * Runtime.getRuntime().availableProcessors() | +| numHttpServerThreads | Number of threads to use for HTTP requests processing | 2 * Runtime.getRuntime().availableProcessors()| +|isRunningStandalone|This flag controls features that are meant to be used when running in standalone mode.|N/A| +|clusterName| The name of the cluster that this broker belongs to. 
|standalone| +| failureDomainsEnabled | Enable cluster's failure-domain which can distribute brokers into logical region. | false | +|zooKeeperSessionTimeoutMillis| The ZooKeeper session timeout, in milliseconds. |30000| +|zooKeeperOperationTimeoutSeconds|ZooKeeper operation timeout in seconds.|30| +|brokerShutdownTimeoutMs| The time to wait for graceful broker shutdown. After this time elapses, the process will be killed. |60000| +|skipBrokerShutdownOnOOM| Flag to skip broker shutdown when broker handles Out of memory error. |false| +|backlogQuotaCheckEnabled| Enable the backlog quota check, which enforces a specified action when the quota is reached. |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the backlog quota. |60| +|backlogQuotaDefaultLimitGB| The default per-topic backlog quota limit. Being less than 0 means no limitation. By default, it is -1. |-1| +|ttlDurationDefaultInSeconds|The default Time to Live (TTL) for namespaces if the TTL is not configured at namespace policies. When the value is set to `0`, TTL is disabled. By default, TTL is disabled. |0| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. If topics are not consumed for some while, these inactive topics might be cleaned up. Deleting inactive topics is enabled by default. The default period is 1 minute.
    **Note:** When `brokerDeleteInactiveTopicsEnabled` is set to `true`, you need to ensure that `allowAutoTopicCreation` is also set to `true`.|true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics, in seconds. |60| +| maxPendingPublishRequestsPerConnection | Maximum pending publish requests per connection to avoid keeping a large number of pending requests in memory | 1000| +|messageExpiryCheckIntervalInMinutes| How often to proactively check and purge expired messages. |5| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding the cursor and dispatching messages when the active consumer is changed. |1000| +| subscriptionExpirationTimeMinutes | How long to delete inactive subscriptions from last consumption. When it is set to 0, inactive subscriptions are not deleted automatically | 0 | +| subscriptionRedeliveryTrackerEnabled | Enable subscription message redelivery tracker to send redelivery count to consumer. | true | +|subscriptionKeySharedEnable|Whether to enable the Key_Shared subscription.|true| +| subscriptionKeySharedUseConsistentHashing | In Key_Shared subscription type, with default AUTO_SPLIT mode, use splitting ranges or consistent hashing to reassign keys to new consumers. | false | +| subscriptionKeySharedConsistentHashingReplicaPoints | In Key_Shared subscription type, the number of points in the consistent-hashing ring. The greater the number, the more equal the assignment of keys to consumers. | 100 | +| subscriptionExpiryCheckIntervalInMinutes | How frequently to proactively check and purge expired subscriptions |5 | +| brokerDeduplicationEnabled | Set the default behavior for message deduplication in the broker. This can be overridden per-namespace. If it is enabled, the broker rejects messages that are already stored in the topic.
| false | +| brokerDeduplicationMaxNumberOfProducers | Maximum number of producer information that it's going to be persisted for deduplication purposes | 10000 | +| brokerDeduplicationEntriesInterval | Number of entries after which a deduplication information snapshot is taken. A greater interval leads to less snapshots being taken though it would increase the topic recovery time, when the entries published after the snapshot need to be replayed. | 1000 | +| brokerDeduplicationProducerInactivityTimeoutMinutes | The time of inactivity (in minutes) after which the broker discards deduplication information related to a disconnected producer. | 360 | +| defaultNumberOfNamespaceBundles | When a namespace is created without specifying the number of bundles, this value is used as the default setting.| 4 | +|clientLibraryVersionCheckEnabled| Enable checks for minimum allowed client library version. |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| The path for the file used to determine the rotation status for the broker when responding to service discovery health checks |/usr/local/apache/htdocs| +|maxUnackedMessagesPerConsumer| The maximum number of unacknowledged messages allowed to be received by consumers on a shared subscription. The broker will stop sending messages to a consumer once this limit is reached or until the consumer begins acknowledging messages. A value of 0 disables the unacked message limit check and thus allows consumers to receive messages without any restrictions. |50000| +|maxUnackedMessagesPerSubscription| The same as above, except per subscription rather than per consumer. |200000| +| maxUnackedMessagesPerBroker | Maximum number of unacknowledged messages allowed per broker. 
Once this limit reaches, the broker stops dispatching messages to all shared subscriptions which has a higher number of unacknowledged messages until subscriptions start acknowledging messages back and unacknowledged messages count reaches to limit/2. When the value is set to 0, unacknowledged message limit check is disabled and broker does not block dispatchers. | 0 | +| maxUnackedMessagesPerSubscriptionOnBrokerBlocked | Once the broker reaches maxUnackedMessagesPerBroker limit, it blocks subscriptions which have higher unacknowledged messages than this percentage limit and subscription does not receive any new messages until that subscription acknowledges messages back. | 0.16 | +| unblockStuckSubscriptionEnabled|Broker periodically checks if subscription is stuck and unblock if flag is enabled.|false| +|zookeeperSessionExpiredPolicy|There are two policies when ZooKeeper session expired happens, "shutdown" and "reconnect". If it is set to "shutdown" policy, when ZooKeeper session expired happens, the broker is shutdown. If it is set to "reconnect" policy, the broker tries to reconnect to ZooKeeper server and re-register metadata to ZooKeeper. Note: the "reconnect" policy is an experiment feature.|shutdown| +| topicPublisherThrottlingTickTimeMillis | Tick time to schedule task that checks topic publish rate limiting across all topics. A lower value can improve accuracy while throttling publish but it uses more CPU to perform frequent check. (Disable publish throttling with value 0) | 10| +| brokerPublisherThrottlingTickTimeMillis | Tick time to schedule task that checks broker publish rate limiting across all topics. A lower value can improve accuracy while throttling publish but it uses more CPU to perform frequent check. When the value is set to 0, publish throttling is disabled. |50 | +| brokerPublisherThrottlingMaxMessageRate | Maximum rate (in 1 second) of messages allowed to publish for a broker if the message rate limiting is enabled. 
When the value is set to 0, message rate limiting is disabled. | 0| +| brokerPublisherThrottlingMaxByteRate | Maximum rate (in 1 second) of bytes allowed to publish for a broker if the byte rate limiting is enabled. When the value is set to 0, the byte rate limiting is disabled. | 0 | +|subscribeThrottlingRatePerConsumer|Too many subscribe requests from a consumer can cause broker rewinding consumer cursors and loading data from bookies, hence causing high network bandwidth usage. When the positive value is set, broker will throttle the subscribe requests for one consumer. Otherwise, the throttling will be disabled. By default, throttling is disabled.|0| +|subscribeRatePeriodPerConsumerInSecond|Rate period for {subscribeThrottlingRatePerConsumer}. By default, it is 30s.|30| +| dispatchThrottlingRatePerTopicInMsg | Default messages (per second) dispatch throttling-limit for every topic. When the value is set to 0, default message dispatch throttling-limit is disabled. |0 | +| dispatchThrottlingRatePerTopicInByte | Default byte (per second) dispatch throttling-limit for every topic. When the value is set to 0, default byte dispatch throttling-limit is disabled. | 0| +| dispatchThrottlingRateRelativeToPublishRate | Enable dispatch rate-limiting relative to publish rate. | false | +|dispatchThrottlingRatePerSubscriptionInMsg|The defaulted number of message dispatching throttling-limit for a subscription. The value of 0 disables message dispatch-throttling.|0| +|dispatchThrottlingRatePerSubscriptionInByte|The default number of message-bytes dispatching throttling-limit for a subscription. The value of 0 disables message-byte dispatch-throttling.|0| +| dispatchThrottlingOnNonBacklogConsumerEnabled | Enable dispatch-throttling for both caught up consumers as well as consumers who have backlogs. | true | +|dispatcherMaxReadBatchSize|The maximum number of entries to read from BookKeeper. 
By default, it is 100 entries.|100| +|dispatcherMaxReadSizeBytes|The maximum size in bytes of entries to read from BookKeeper. By default, it is 5MB.|5242880| +|dispatcherMinReadBatchSize|The minimum number of entries to read from BookKeeper. By default, it is 1 entry. When an error occurs while reading entries from BookKeeper, the broker backs off the batch size to this minimum number.|1| +|dispatcherMaxRoundRobinBatchSize|The maximum number of entries to dispatch for a shared subscription. By default, it is 20 entries.|20| +| preciseDispatcherFlowControl | Precise dispatcher flow control according to history message number of each entry. | false | +| streamingDispatch | Whether to use streaming read dispatcher. It can be useful when there's a huge backlog to drain and instead of read with micro batch we can streamline the read from bookkeeper to make the most of consumer capacity till we hit bookkeeper read limit or consumer process limit, then we can use consumer flow control to tune the speed. This feature is currently in preview and can be changed in subsequent release. | false | +| maxConcurrentLookupRequest | Maximum number of concurrent lookup requests that the broker allows to throttle heavy incoming lookup traffic. | 50000 | +| maxConcurrentTopicLoadRequest | Maximum number of concurrent topic loading requests that the broker allows to control the number of zk-operations. | 5000 | +| maxConcurrentNonPersistentMessagePerConnection | Maximum number of concurrent non-persistent messages that can be processed per connection. | 1000 | +| numWorkerThreadsForNonPersistentTopic | Number of worker threads to serve non-persistent topics. | 8 | +| enablePersistentTopics | Enable broker to load persistent topics. | true | +| enableNonPersistentTopics | Enable broker to load non-persistent topics. | true | +| maxSubscriptionsPerTopic | Maximum number of subscriptions allowed to subscribe to a topic.
Once this limit reaches, the broker rejects new subscriptions until the number of subscriptions decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxProducersPerTopic | Maximum number of producers allowed to connect to a topic. Once this limit reaches, the broker rejects new producers until the number of connected producers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerTopic | Maximum number of consumers allowed to connect to a topic. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerSubscription | Maximum number of consumers allowed to connect to a subscription. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxNumPartitionsPerPartitionedTopic | Maximum number of partitions per partitioned topic. When the value is set to a negative number or is set to 0, the check is disabled. | 0 | +| tlsCertRefreshCheckDurationSec | TLS certificate refresh duration in seconds. When the value is set to 0, check the TLS certificate on every new connection. | 300 | +| tlsCertificateFilePath | Path for the TLS certificate file. | | +| tlsKeyFilePath | Path for the TLS private key file. | | +| tlsTrustCertsFilePath | Path for the trusted TLS certificate file.| | +| tlsAllowInsecureConnection | Accept untrusted TLS certificate from the client. If it is set to true, a client with a certificate which cannot be verified with the 'tlsTrustCertsFilePath' certificate is allowed to connect to the server, though the certificate is not be used for client authentication. | false | +| tlsProtocols | Specify the TLS protocols the broker uses to negotiate during TLS handshake. 
| | +| tlsCiphers | Specify the TLS cipher the broker uses to negotiate during TLS Handshake. | | +| tlsRequireTrustedClientCertOnConnect | Trusted client certificates are required for to connect TLS. Reject the Connection if the client certificate is not trusted. In effect, this requires that all connecting clients perform TLS client authentication. | false | +| tlsEnabledWithKeyStore | Enable TLS with KeyStore type configuration in broker. | false | +| tlsProvider | TLS Provider for KeyStore type. | | +| tlsKeyStoreType | TLS KeyStore type configuration in the broker.
  • JKS
  • PKCS12
  • |JKS| +| tlsKeyStore | TLS KeyStore path in the broker. | | +| tlsKeyStorePassword | TLS KeyStore password for the broker. | | +| tlsTrustStoreType | TLS TrustStore type configuration in the broker
  • JKS
  • PKCS12
  • |JKS| +| tlsTrustStore | TLS TrustStore path in the broker. | | +| tlsTrustStorePassword | TLS TrustStore password for the broker. | | +| brokerClientTlsEnabledWithKeyStore | Configure whether the internal client uses the KeyStore type to authenticate with Pulsar brokers. | false | +| brokerClientSslProvider | The TLS Provider used by the internal client to authenticate with other Pulsar brokers. | | +| brokerClientTlsTrustStoreType | TLS TrustStore type configuration for the internal client to authenticate with Pulsar brokers.
  • JKS
  • PKCS12
  • | JKS | +| brokerClientTlsTrustStore | TLS TrustStore path for the internal client to authenticate with Pulsar brokers. | | +| brokerClientTlsTrustStorePassword | TLS TrustStore password for the internal client to authenticate with Pulsar brokers. | | +| brokerClientTlsCiphers | Specify the TLS cipher that the internal client uses to negotiate during TLS Handshake. | | +| brokerClientTlsProtocols | Specify the TLS protocols that the broker uses to negotiate during TLS handshake. | | +| systemTopicEnabled | Enable/Disable system topics. | false | +| topicLevelPoliciesEnabled | Enable or disable topic level policies. Topic level policies depends on the system topic. Please enable the system topic first. | false | +| topicFencingTimeoutSeconds | If a topic remains fenced for a certain time period (in seconds), it is closed forcefully. If set to 0 or a negative number, the fenced topic is not closed. | 0 | +| proxyRoles | Role names that are treated as "proxy roles". If the broker sees a request with role as proxyRoles, it demands to see a valid original principal. | | +|authenticationEnabled| Enable authentication for the broker. |false| +|authenticationProviders| A comma-separated list of class names for authentication providers. |false| +|authorizationEnabled| Enforce authorization in brokers. |false| +| authorizationProvider | Authorization provider fully qualified class-name. | org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider | +| authorizationAllowWildcardsMatching | Allow wildcard matching in authorization. Wildcard matching is applicable only when the wildcard-character (*) presents at the **first** or **last** position. | false | +|superUserRoles| Role names that are treated as “superusers.” Superusers are authorized to perform all admin tasks. | | +|brokerClientAuthenticationPlugin| The authentication settings of the broker itself. Used when the broker connects to other brokers either in the same cluster or from other clusters. 
| | +|brokerClientAuthenticationParameters| The parameters that go along with the plugin specified using brokerClientAuthenticationPlugin. | | +|athenzDomainNames| Supported Athenz authentication provider domain names as a comma-separated list. | | +| anonymousUserRole | When this parameter is not empty, unauthenticated users perform as anonymousUserRole. | | +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud". It is used to get the audience from token. If it is not set, the audience is not verified. || +| tokenAudience | The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token need contains this parameter.| | +|saslJaasClientAllowedIds|This is a regexp, which limits the range of possible ids which can connect to the Broker using SASL. By default, it is set to `SaslConstants.JAAS_CLIENT_ALLOWED_IDS_DEFAULT`, which is ".*pulsar.*", so only clients whose id contains 'pulsar' are allowed to connect.|N/A| +|saslJaasBrokerSectionName|Service Principal, for login context name. 
By default, it is set to `SaslConstants.JAAS_DEFAULT_BROKER_SECTION_NAME`, which is "Broker".|N/A| +|httpMaxRequestSize|If the value is larger than 0, it rejects all HTTP requests with bodies larged than the configured limit.|-1| +|exposePreciseBacklogInPrometheus| Enable expose the precise backlog stats, set false to use published counter and consumed counter to calculate, this would be more efficient but may be inaccurate. |false| +|bookkeeperMetadataServiceUri|Metadata service uri is what BookKeeper used for loading corresponding metadata driver and resolving its metadata service location. This value can be fetched using `bookkeeper shell whatisinstanceid` command in BookKeeper cluster. For example: `zk+hierarchical://localhost:2181/ledgers`. The metadata service uri list can also be semicolon separated values like: `zk+hierarchical://zk1:2181;zk2:2181;zk3:2181/ledgers`.|N/A| +|bookkeeperClientAuthenticationPlugin| Authentication plugin to be used when connecting to bookies (BookKeeper servers). || +|bookkeeperClientAuthenticationParametersName| BookKeeper authentication plugin implementation parameters and values. || +|bookkeeperClientAuthenticationParameters| Parameters associated with the bookkeeperClientAuthenticationParametersName || +|bookkeeperClientNumWorkerThreads| Number of BookKeeper client worker threads. Default is Runtime.getRuntime().availableProcessors() || +|bookkeeperClientTimeoutInSeconds| Timeout for BookKeeper add and read operations. |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. A value of 0 disables speculative reads. |0| +|bookkeeperUseV2WireProtocol|Use older Bookkeeper wire protocol with bookie.|true| +|bookkeeperClientHealthCheckEnabled| Enable bookie health checks. |true| +|bookkeeperClientHealthCheckIntervalSeconds| The time interval, in seconds, at which health checks are performed. New ledgers are not created during health checks. 
|60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval| Error threshold for health checks. |5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds| If bookies have more than the allowed number of failures within the time interval specified by bookkeeperClientHealthCheckIntervalSeconds |1800| +|bookkeeperClientGetBookieInfoIntervalSeconds|Specify options for the GetBookieInfo check. This setting helps ensure the list of bookies that are up to date on the brokers.|86400| +|bookkeeperClientGetBookieInfoRetryIntervalSeconds|Specify options for the GetBookieInfo check. This setting helps ensure the list of bookies that are up to date on the brokers.|60| +|bookkeeperClientRackawarePolicyEnabled| |true| +|bookkeeperClientRegionawarePolicyEnabled| |false| +|bookkeeperClientMinNumRacksPerWriteQuorum| |2| +|bookkeeperClientMinNumRacksPerWriteQuorum| |false| +|bookkeeperClientReorderReadSequenceEnabled| |false| +|bookkeeperClientIsolationGroups||| +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. || +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. || +| bookkeeperTLSProviderFactoryClass | Set the client security provider factory class name. | org.apache.bookkeeper.tls.TLSContextFactory | +| bookkeeperTLSClientAuthentication | Enable TLS authentication with bookie. | false | +| bookkeeperTLSKeyFileType | Supported type: PEM, JKS, PKCS12. | PEM | +| bookkeeperTLSTrustCertTypes | Supported type: PEM, JKS, PKCS12. | PEM | +| bookkeeperTLSKeyStorePasswordPath | Path to file containing keystore password, if the client keystore is password protected. | | +| bookkeeperTLSTrustStorePasswordPath | Path to file containing truststore password, if the client truststore is password protected. 
| | +| bookkeeperTLSKeyFilePath | Path for the TLS private key file. | | +| bookkeeperTLSCertificateFilePath | Path for the TLS certificate file. | | +| bookkeeperTLSTrustCertsFilePath | Path for the trusted TLS certificate file. | | +| bookkeeperDiskWeightBasedPlacementEnabled | Enable/Disable disk weight based placement. | false | +| bookkeeperExplicitLacIntervalInMills | Set the interval to check the need for sending an explicit LAC. When the value is set to 0, no explicit LAC is sent. | 0 | +| bookkeeperClientExposeStatsToPrometheus | Expose BookKeeper client managed ledger stats to Prometheus. | false | +|managedLedgerDefaultEnsembleSize| |1| +|managedLedgerDefaultWriteQuorum| |1| +|managedLedgerDefaultAckQuorum| |1| +| managedLedgerDigestType | Default type of checksum to use when writing to BookKeeper. | CRC32C | +| managedLedgerNumSchedulerThreads | Number of threads to be used for managed ledger scheduled tasks. | 8 | +|managedLedgerCacheSizeMB| |N/A| +|managedLedgerCacheCopyEntries| Whether to copy the entry payloads when inserting in cache.| false| +|managedLedgerCacheEvictionWatermark| |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. 
| 1000| +|managedLedgerUnackedRangesOpenCacheSetEnabled| Use Open Range-Set to cache unacknowledged messages |true| +|managedLedgerDefaultMarkDeleteRateLimit| |0.1| +|managedLedgerMaxEntriesPerLedger| |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| |240| +|managedLedgerCursorMaxEntriesPerLedger| |50000| +|managedLedgerCursorRolloverTimeInSeconds| |14400| +| managedLedgerMaxSizePerLedgerMbytes | Maximum ledger size before triggering a rollover for a topic. | 2048 | +| managedLedgerMaxUnackedRangesToPersist | Maximum number of "acknowledgment holes" that are going to be persistently stored. When acknowledging out of order, a consumer leaves holes that are supposed to be quickly filled by acknowledging all the messages. The information of which messages are acknowledged is persisted by compressing in "ranges" of messages that were acknowledged. After the max number of ranges is reached, the information is only tracked in memory and messages are redelivered in case of crashes. | 10000 | +| managedLedgerMaxUnackedRangesToPersistInZooKeeper | Maximum number of "acknowledgment holes" that can be stored in Zookeeper. If the number of unacknowledged message range is higher than this limit, the broker persists unacknowledged ranges into bookkeeper to avoid additional data overhead into Zookeeper. | 1000 | +|autoSkipNonRecoverableData| |false| +| managedLedgerMetadataOperationsTimeoutSeconds | Operation timeout while updating managed-ledger metadata. | 60 | +| managedLedgerReadEntryTimeoutSeconds | Read entries timeout when the broker tries to read messages from BookKeeper. | 0 | +| managedLedgerAddEntryTimeoutSeconds | Add entry timeout when the broker tries to publish message to BookKeeper. | 0 | +| managedLedgerNewEntriesCheckDelayInMillis | New entries check delay for the cursor under the managed ledger. If no new messages in the topic, the cursor tries to check again after the delay time. 
For consumption latency sensitive scenario, you can set the value to a smaller value or 0. Of course, a smaller value may degrade consumption throughput.|10 ms| +| managedLedgerPrometheusStatsLatencyRolloverSeconds | Managed ledger prometheus stats latency rollover seconds. | 60 | +| managedLedgerTraceTaskExecution | Whether to trace managed ledger task execution time. | true | +|managedLedgerNewEntriesCheckDelayInMillis|New entries check delay for the cursor under the managed ledger. If no new messages in the topic, the cursor will try to check again after the delay time. For consumption latency sensitive scenario, it can be set to a smaller value or 0. A smaller value degrades consumption throughput. By default, it is 10ms.|10| +|loadBalancerEnabled| |false| +|loadBalancerPlacementStrategy| |weightedRandomSelection| +|loadBalancerReportUpdateThresholdPercentage| |10| +|loadBalancerReportUpdateMaxIntervalMinutes| |15| +|loadBalancerHostUsageCheckIntervalMinutes| |1| +|loadBalancerSheddingIntervalMinutes| |30| +|loadBalancerSheddingGracePeriodMinutes| |30| +|loadBalancerBrokerMaxTopics| |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| |1| +|loadBalancerBrokerOverloadedThresholdPercentage| |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| |15| +|loadBalancerBrokerComfortLoadLevelPercentage| |65| +|loadBalancerAutoBundleSplitEnabled| |false| +| loadBalancerAutoUnloadSplitBundlesEnabled | Enable/Disable automatic unloading of split bundles. | true | +|loadBalancerNamespaceBundleMaxTopics| |1000| +|loadBalancerNamespaceBundleMaxSessions| |1000| +|loadBalancerNamespaceBundleMaxMsgRate| |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| |100| +|loadBalancerNamespaceMaximumBundles| |128| +| loadBalancerBrokerThresholdShedderPercentage | The broker resource usage threshold. When the broker resource usage is greater than the pulsar cluster average resource usage, the threshold shedder is triggered to offload bundles from the broker. 
It only takes effect in the ThresholdShedder strategy. | 10 | +| loadBalancerHistoryResourcePercentage | The history usage when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 0.9 | +| loadBalancerBandwithInResourceWeight | The BandWithIn usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerBandwithOutResourceWeight | The BandWithOut usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerCPUResourceWeight | The CPU usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerMemoryResourceWeight | The heap memory usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerDirectMemoryResourceWeight | The direct memory usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerBundleUnloadMinThroughputThreshold | Bundle unload minimum throughput threshold. Avoid bundle unload frequently. It only takes effect in the ThresholdShedder strategy. | 10 | +|replicationMetricsEnabled| |true| +|replicationConnectionsPerBroker| |16| +|replicationProducerQueueSize| |1000| +| replicationPolicyCheckDurationSeconds | Duration to check replication policy to avoid replicator inconsistency due to missing ZooKeeper watch. When the value is set to 0, disable checking replication policy. | 600 | +|defaultRetentionTimeInMinutes| |0| +|defaultRetentionSizeInMB| |0| +|keepAliveIntervalSeconds| |30| +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +|bookieId | If you want to custom a bookie ID or use a dynamic network address for a bookie, you can set the `bookieId`.

    Bookie advertises itself using the `bookieId` rather than the `BookieSocketAddress` (`hostname:port` or `IP:port`).

    The `bookieId` is a non-empty string that can contain ASCII digits and letters ([a-zA-Z0-9]), colons, dashes, and dots.

    For more information about `bookieId`, see [here](http://bookkeeper.apache.org/bps/BP-41-bookieid/).|/| +| maxTopicsPerNamespace | The maximum number of persistent topics that can be created in the namespace. When the number of topics reaches this threshold, the broker rejects the request of creating a new topic, including the auto-created topics by the producer or consumer, until the number of topics decreases. The default value 0 disables the check. | 0 | +| isAllowAutoUpdateSchemaEnabled | Allow schema to be auto updated at broker level. User can override this by 'is_allow_auto_update_schema' of namespace policy.
    **Note:** This configuration is only available in 2.8.3 and later versions. |true| + +## WebSocket + +|Name|Description|Default| +|---|---|---| +|configurationStoreServers ||| +|zooKeeperSessionTimeoutMillis| |30000| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|serviceUrl||| +|serviceUrlTls||| +|brokerServiceUrl||| +|brokerServiceUrlTls||| +|webServicePort||8080| +|webServicePortTls||8443| +|bindAddress||0.0.0.0| +|clusterName ||| +|authenticationEnabled||false| +|authenticationProviders||| +|authorizationEnabled||false| +|superUserRoles ||| +|brokerClientAuthenticationPlugin||| +|brokerClientAuthenticationParameters||| +|tlsEnabled||false| +|tlsAllowInsecureConnection||false| +|tlsCertificateFilePath||| +|tlsKeyFilePath ||| +|tlsTrustCertsFilePath||| + +#### Configuration Override For Clients Internal to WebSocket + +In 2.8.4 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the broker's Pulsar Clients. These configurations are applied after hard-coded configuration and before the brokerClient configurations named above.| + +## Pulsar proxy + +The [Pulsar proxy](concepts-architecture-overview.md#pulsar-proxy) can be configured in the `conf/proxy.conf` file. + + +|Name|Description|Default| +|---|---|---| +|forwardAuthorizationCredentials| Forward client authorization credentials to Broker for re-authorization, and make sure authentication is enabled for this to take effect. |false| +|zookeeperServers| The ZooKeeper quorum connection string (as a comma-separated list) || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +| brokerServiceURL | The service URL pointing to the broker cluster. Must begin with `pulsar://`. | | +| brokerServiceURLTLS | The TLS service URL pointing to the broker cluster. Must begin with `pulsar+ssl://`.
| | +| brokerWebServiceURL | The Web service URL pointing to the broker cluster | | +| brokerWebServiceURLTLS | The TLS Web service URL pointing to the broker cluster | | +| functionWorkerWebServiceURL | The Web service URL pointing to the function worker cluster. It is only configured when you setup function workers in a separate cluster. | | +| functionWorkerWebServiceURLTLS | The TLS Web service URL pointing to the function worker cluster. It is only configured when you setup function workers in a separate cluster. | | +|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds) |30000| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|advertisedAddress|Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostname()` is used.|N/A| +|servicePort| The port to use for server binary Protobuf requests |6650| +|servicePortTls| The port to use to server binary Protobuf TLS requests |6651| +|statusFilePath| Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks || +| proxyLogLevel | Proxy log level
  • 0: Do not log any TCP channel information.
  • 1: Parse and log any TCP channel information and command information without message body.
  • 2: Parse and log channel information, command information and message body.
  • | 0 | +|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false| +|authenticateMetricsEndpoint| Whether the '/metrics' endpoint requires authentication. Defaults to true. 'authenticationEnabled' must also be set for this to take effect. |true| +|authenticationProviders| Authentication provider name list (a comma-separated list of class names) || +|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false| +|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider| +| anonymousUserRole | When this parameter is not empty, unauthenticated users perform as anonymousUserRole. | | +|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers || +|superUserRoles| Role names that are treated as “super-users,” meaning that they will be able to perform all admin || +|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy will reject requests beyond that. |10000| +|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy will error out requests beyond that. |50000| +|tlsEnabledInProxy| Deprecated - use `servicePortTls` and `webServicePortTls` instead. |false| +|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers. |false| +| tlsCertRefreshCheckDurationSec | TLS certificate refresh duration in seconds. If the value is set 0, check TLS certificate every new connection. 
| 300 | +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file || +|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false| +|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate isn’t trusted. |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.3```, ```TLSv1.2``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +| httpReverseProxyConfigs | HTTP directs to redirect to non-pulsar services | | +| httpOutputBufferSize | HTTP output buffer size. The amount of data that will be buffered for HTTP requests before it is flushed to the channel. A larger buffer size may result in higher HTTP throughput though it may take longer for the client to see data. If using HTTP streaming via the reverse proxy, this should be set to the minimum value (1) so that clients see the data as soon as possible. | 32768 | +| httpNumThreads | Number of threads to use for HTTP requests processing| 2 * Runtime.getRuntime().availableProcessors() | +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. 
Note: key file must be DER-encoded.|| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud". It is used to get the audience from token. If it is not set, the audience is not verified. || +| tokenAudience | The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token need contains this parameter.| | +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +| numIOThreads | Number of threads used for Netty IO.
    **Note:** This configuration is only available in 2.8.4 and later versions. | 2 * Runtime.getRuntime().availableProcessors() | +| numAcceptorThreads | Number of threads used for Netty Acceptor.
    **Note:** This configuration is only available in 2.8.4 and later versions.| 1 | + +#### Configuration Override For Clients Internal to Proxy + +In 2.8.4 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the proxy's Pulsar Clients. These configurations are applied after hard-coded configuration and before the brokerClient configurations named above.| + +## ZooKeeper + +ZooKeeper handles a broad range of essential configuration- and coordination-related tasks for Pulsar. The default configuration file for ZooKeeper is in the `conf/zookeeper.conf` file in your Pulsar installation. The following parameters are available: + + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5| +|dataDir| The location where ZooKeeper will store in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server will listen for connections. |2181| +|admin.enableServer|Whether the ZooKeeper AdminServer is enabled.|true| +|admin.serverPort|The port at which the admin listens.|9990| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest).
|3| +|autopurge.purgeInterval| The time interval, in hours, by which the ZooKeeper database purge task is triggered. Setting to a non-zero number will enable auto purge; setting to 0 will disable. Read this guide before enabling auto purge. |1| +|forceSync|Requires updates to be synced to media of the transaction log before finishing processing the update. If this option is set to 'no', ZooKeeper will not require updates to be synced to the media. WARNING: it's not recommended to run a production ZK cluster with `forceSync` disabled.|yes| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + + + +In addition to the parameters in the table above, configuring ZooKeeper for Pulsar involves adding +a `server.N` line to the `conf/zookeeper.conf` file for each node in the ZooKeeper cluster, where `N` is the number of the ZooKeeper node. Here's an example for a three-node ZooKeeper cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +> We strongly recommend consulting the [ZooKeeper Administrator's Guide](https://zookeeper.apache.org/doc/current/zookeeperAdmin.html) for a more thorough and comprehensive introduction to ZooKeeper configuration diff --git a/site2/website/versioned_docs/version-2.8.x/reference-connector-admin.md b/site2/website/versioned_docs/version-2.8.x/reference-connector-admin.md new file mode 100644 index 0000000000000..7b73ae80750cd --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/reference-connector-admin.md @@ -0,0 +1,11 @@ +--- +id: reference-connector-admin +title: Connector Admin CLI +sidebar_label: "Connector Admin CLI" +original_id: reference-connector-admin +--- + +> **Important** +> +> For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more information, see [Pulsar admin 
doc](https://pulsar.apache.org/tools/pulsar-admin/). +> \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/reference-metrics.md b/site2/website/versioned_docs/version-2.8.x/reference-metrics.md new file mode 100644 index 0000000000000..b114cc264f99c --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/reference-metrics.md @@ -0,0 +1,567 @@ +--- +id: reference-metrics +title: Pulsar Metrics +sidebar_label: "Pulsar Metrics" +original_id: reference-metrics +--- + + + +Pulsar exposes the following metrics in Prometheus format. You can monitor your clusters with those metrics. + +* [ZooKeeper](#zookeeper) +* [BookKeeper](#bookkeeper) +* [Broker](#broker) +* [Pulsar Functions](#pulsar-functions) +* [Proxy](#proxy) +* [Pulsar SQL Worker](#pulsar-sql-worker) +* [Pulsar transaction](#pulsar-transaction) + +The following types of metrics are available: + +- [Counter](https://prometheus.io/docs/concepts/metric_types/#counter): a cumulative metric that represents a single monotonically increasing counter. The value increases by default. You can reset the value to zero or restart your cluster. +- [Gauge](https://prometheus.io/docs/concepts/metric_types/#gauge): a metric that represents a single numerical value that can arbitrarily go up and down. +- [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram): a histogram samples observations (usually things like request durations or response sizes) and counts them in configurable buckets. The `_bucket` suffix is the number of observations within a histogram bucket, configured with parameter `{le=""}`. The `_count` suffix is the number of observations, shown as a time series and behaves like a counter. The `_sum` suffix is the sum of observed values, also shown as a time series and behaves like a counter. These suffixes are together denoted by `_*` in this doc. 
+- [Summary](https://prometheus.io/docs/concepts/metric_types/#summary): similar to a histogram, a summary samples observations (usually things like request durations and response sizes). While it also provides a total count of observations and a sum of all observed values, it calculates configurable quantiles over a sliding time window. + +## ZooKeeper + +The ZooKeeper metrics are exposed under "/metrics" at port `8000`. You can use a different port by configuring the `metricsProvider.httpPort` in conf/zookeeper.conf. + +ZooKeeper provides a New Metrics System since 3.6.0. For more detailed metrics, refer to the [ZooKeeper Monitor Guide](https://zookeeper.apache.org/doc/r3.7.0/zookeeperMonitor.html). + +## BookKeeper + +The BookKeeper metrics are exposed under "/metrics" at port `8000`. You can change the port by updating `prometheusStatsHttpPort` +in the `bookkeeper.conf` configuration file. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| bookie_SERVER_STATUS | Gauge | The server status for bookie server.
    • 1: the bookie is running in writable mode.
    • 0: the bookie is running in readonly mode.
    | +| bookkeeper_server_ADD_ENTRY_count | Counter | The total number of ADD_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_count | Counter | The total number of READ_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_WRITE_BYTES | Counter | The total number of bytes written to the bookie. | +| bookie_READ_BYTES | Counter | The total number of bytes read from the bookie. | +| bookkeeper_server_ADD_ENTRY_REQUEST | Summary | The summary of request latency of ADD_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_REQUEST | Summary | The summary of request latency of READ_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | + +### Journal metrics + +| Name | Type | Description | +|---|---|---| +| bookie_journal_JOURNAL_SYNC_count | Counter | The total number of journal fsync operations happening at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_journal_JOURNAL_QUEUE_SIZE | Gauge | The total number of requests pending in the journal queue. | +| bookie_journal_JOURNAL_FORCE_WRITE_QUEUE_SIZE | Gauge | The total number of force write (fsync) requests pending in the force-write queue. | +| bookie_journal_JOURNAL_CB_QUEUE_SIZE | Gauge | The total number of callbacks pending in the callback queue. | +| bookie_journal_JOURNAL_ADD_ENTRY | Summary | The summary of request latency of adding entries to the journal. | +| bookie_journal_JOURNAL_SYNC | Summary | The summary of fsync latency of syncing data to the journal disk. | + +### Storage metrics + +| Name | Type | Description | +|---|---|---| +| bookie_ledgers_count | Gauge | The total number of ledgers stored in the bookie. 
| +| bookie_entries_count | Gauge | The total number of entries stored in the bookie. | +| bookie_write_cache_size | Gauge | The bookie write cache size (in bytes). | +| bookie_read_cache_size | Gauge | The bookie read cache size (in bytes). | +| bookie_DELETED_LEDGER_COUNT | Counter | The total number of ledgers deleted since the bookie has started. | +| bookie_ledger_writable_dirs | Gauge | The number of writable directories in the bookie. | + +## Broker + +The broker metrics are exposed under "/metrics" at port `8080`. You can change the port by updating `webServicePort` to a different port +in the `broker.conf` configuration file. + +All the metrics exposed by a broker are labelled with `cluster=${pulsar_cluster}`. The name of Pulsar cluster is the value of `${pulsar_cluster}`, which you have configured in the `broker.conf` file. + +The following metrics are available for broker: + +- [ZooKeeper](#zookeeper) + - [Server metrics](#server-metrics) + - [Request metrics](#request-metrics) +- [BookKeeper](#bookkeeper) + - [Server metrics](#server-metrics-1) + - [Journal metrics](#journal-metrics) + - [Storage metrics](#storage-metrics) +- [Broker](#broker) + - [Namespace metrics](#namespace-metrics) + - [Replication metrics](#replication-metrics) + - [Topic metrics](#topic-metrics) + - [Replication metrics](#replication-metrics-1) + - [ManagedLedgerCache metrics](#managedledgercache-metrics) + - [ManagedLedger metrics](#managedledger-metrics) + - [LoadBalancing metrics](#loadbalancing-metrics) + - [BundleUnloading metrics](#bundleunloading-metrics) + - [BundleSplit metrics](#bundlesplit-metrics) + - [Subscription metrics](#subscription-metrics) + - [Consumer metrics](#consumer-metrics) + - [Managed ledger bookie client metrics](#managed-ledger-bookie-client-metrics) + - [Token metrics](#token-metrics) + - [Authentication metrics](#authentication-metrics) + - [Connection metrics](#connection-metrics) + - [Jetty metrics](#jetty-metrics) +- [Pulsar 
Functions](#pulsar-functions) +- [Proxy](#proxy) +- [Pulsar SQL Worker](#pulsar-sql-worker) +- [Pulsar transaction](#pulsar-transaction) + +### Namespace metrics + +> Namespace metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `false`. + +All the namespace metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_topics_count | Gauge | The number of Pulsar topics of the namespace owned by this broker. | +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the namespace served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the namespace connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the namespace connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the namespace coming into this broker (messages/second). | +| pulsar_rate_out | Gauge | The total message rate of the namespace going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the namespace coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the namespace going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this namespace owned by this broker (bytes). | +| pulsar_storage_logical_size | Gauge | The storage size (without replicas) of the topics in this namespace owned by this broker (bytes).
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this namespace owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this namespace offloaded to the tiered storage (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this namespace (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this namespace (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a namespace that the storage write latency is smaller with a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a namespace that the entry size is smaller with a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | + +#### Replication metrics + +If a namespace is configured to be replicated among multiple Pulsar clusters, the corresponding replication metrics is also exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics are also labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the namespace replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the namespace replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the namespace replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the namespace replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the namespace replicating to remote cluster (messages). | + +### Topic metrics + +> Topic metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `true`. + +All the topic metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. + +| Name | Type | Description | +|---|---|---| +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the topic served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the topic connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the topic connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the topic coming into this broker (messages/second). 
| +| pulsar_rate_out | Gauge | The total message rate of the topic going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the topic coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the topic going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this topic owned by this broker (bytes). | +| pulsar_storage_logical_size | Gauge | The storage size (without replicas) of the topics in this namespace owned by this broker (bytes).
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this topic owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this topic offloaded to the tiered storage (bytes). | +| pulsar_storage_backlog_quota_limit | Gauge | The total amount of the data in this topic that limit the backlog quota (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this topic (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this topic (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a topic that the storage write latency is smaller with a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a topic that the entry size is smaller with a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | +| pulsar_in_bytes_total | Counter | The total number of bytes received for this topic | +| pulsar_in_messages_total | Counter | The total number of messages received for this topic | +| pulsar_out_bytes_total | Counter | The total number of bytes read from this topic | +| pulsar_out_messages_total | Counter | The total number of messages read from this topic | +| pulsar_compaction_removed_event_count | Gauge | The removed event count of compaction.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_compaction_succeed_count | Gauge | The succeed count of compaction.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_compaction_failed_count | Gauge | The failed count of compaction.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_compaction_duration_time_in_mills | Gauge | The duration time of compaction.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_compaction_read_throughput | Gauge | The read throughput of compaction.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_compaction_write_throughput | Gauge | The write throughput of compaction.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_compaction_latency_le_* | Histogram | The compaction latency with given quantile.
    **Note:** This metric is only available in 2.8.1 and later versions.
    Available thresholds:
    • pulsar_compaction_latency_le_0_5: <= 0.5ms
    • pulsar_compaction_latency_le_1: <= 1ms
    • pulsar_compaction_latency_le_5: <= 5ms
    • pulsar_compaction_latency_le_10: <= 10ms
    • pulsar_compaction_latency_le_20: <= 20ms
    • pulsar_compaction_latency_le_50: <= 50ms
    • pulsar_compaction_latency_le_100: <= 100ms
    • pulsar_compaction_latency_le_200: <= 200ms
    • pulsar_compaction_latency_le_1000: <= 1s
    • pulsar_compaction_latency_le_overflow: > 1s
    | +| pulsar_compaction_compacted_entries_count | Gauge | The compacted entries count.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_compaction_compacted_entries_size |Gauge | The compacted entries size.
    **Note:** This metric is only available in 2.8.1 and later versions. | + +#### Replication metrics + +If a namespace that a topic belongs to is configured to be replicated among multiple Pulsar clusters, the corresponding replication metrics is also exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics are labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the topic replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the topic replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the topic replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the topic replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the topic replicating to remote cluster (messages). | + +### ManagedLedgerCache metrics +All the ManagedLedgerCache metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_ml_cache_evictions | Gauge | The number of cache evictions during the last minute. | +| pulsar_ml_cache_hits_rate | Gauge | The number of cache hits per second. 
| +| pulsar_ml_cache_hits_throughput | Gauge | The amount of data is retrieved from the cache in byte/s | +| pulsar_ml_cache_misses_rate | Gauge | The number of cache misses per second | +| pulsar_ml_cache_misses_throughput | Gauge | The amount of data is retrieved from the cache in byte/s | +| pulsar_ml_cache_pool_active_allocations | Gauge | The number of currently active allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_huge | Gauge | The number of currently active huge allocation in direct arena | +| pulsar_ml_cache_pool_active_allocations_normal | Gauge | The number of currently active normal allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_small | Gauge | The number of currently active small allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_tiny | Gauge | The number of currently active tiny allocations in direct arena | +| pulsar_ml_cache_pool_allocated | Gauge | The total allocated memory of chunk lists in direct arena | +| pulsar_ml_cache_pool_used | Gauge | The total used memory of chunk lists in direct arena | +| pulsar_ml_cache_used_size | Gauge | The size in byte used to store the entries payloads | +| pulsar_ml_count | Gauge | The number of currently opened managed ledgers | + +### ManagedLedger metrics +All the managedLedger metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- namespace: namespace=${pulsar_namespace}. ${pulsar_namespace} is the namespace name. +- quantile: quantile=${quantile}. Quantile is only for `Histogram` type metric, and represents the threshold for given Buckets. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_ml_AddEntryBytesRate | Gauge | The bytes/s rate of messages added | +| pulsar_ml_AddEntryWithReplicasBytesRate | Gauge | The bytes/s rate of messages added with replicas.
    **Note:** This metric is only available in 2.8.1 and later versions. | +| pulsar_ml_AddEntryErrors | Gauge | The number of addEntry requests that failed | +| pulsar_ml_AddEntryLatencyBuckets | Histogram | The latency of adding a ledger entry with a given quantile (threshold), including time spent on waiting in queue on the broker side
    Available quantile:
    • quantile="0.0_0.5" is AddEntryLatency between (0.0ms, 0.5ms]
    • quantile="0.5_1.0" is AddEntryLatency between (0.5ms, 1.0ms]
    • quantile="1.0_5.0" is AddEntryLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is AddEntryLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is AddEntryLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is AddEntryLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is AddEntryLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is AddEntryLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is AddEntryLatency between (200ms, 1s]
    | +| pulsar_ml_AddEntryLatencyBuckets_OVERFLOW | Gauge | The number of times the AddEntryLatency is longer than 1 second | +| pulsar_ml_AddEntryMessagesRate | Gauge | The msg/s rate of messages added | +| pulsar_ml_AddEntrySucceed | Gauge | The number of addEntry requests that succeeded | +| pulsar_ml_EntrySizeBuckets | Histogram | The added entry size of a ledger with a given quantile.
    Available quantile:
    • quantile="0.0_128.0" is EntrySize between (0byte, 128byte]
    • quantile="128.0_512.0" is EntrySize between (128byte, 512byte]
    • quantile="512.0_1024.0" is EntrySize between (512byte, 1KB]
    • quantile="1024.0_2048.0" is EntrySize between (1KB, 2KB]
    • quantile="2048.0_4096.0" is EntrySize between (2KB, 4KB]
    • quantile="4096.0_16384.0" is EntrySize between (4KB, 16KB]
    • quantile="16384.0_102400.0" is EntrySize between (16KB, 100KB]
    • quantile="102400.0_1232896.0" is EntrySize between (100KB, 1MB]
    | +| pulsar_ml_EntrySizeBuckets_OVERFLOW |Gauge | The number of times the EntrySize is larger than 1MB | +| pulsar_ml_LedgerSwitchLatencyBuckets | Histogram | The ledger switch latency with a given quantile.
    Available quantile:
    • quantile="0.0_0.5" is LedgerSwitchLatency between (0ms, 0.5ms]
    • quantile="0.5_1.0" is LedgerSwitchLatency between (0.5ms, 1ms]
    • quantile="1.0_5.0" is LedgerSwitchLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is LedgerSwitchLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is LedgerSwitchLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is LedgerSwitchLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is LedgerSwitchLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is LedgerSwitchLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is LedgerSwitchLatency between (200ms, 1000ms]
    | +| pulsar_ml_LedgerSwitchLatencyBuckets_OVERFLOW | Gauge | The number of times the ledger switch latency is longer than 1 second | +| pulsar_ml_LedgerAddEntryLatencyBuckets | Histogram | The latency for bookie client to persist a ledger entry from broker to BookKeeper service with a given quantile (threshold).
    Available quantile:
    • quantile="0.0_0.5" is LedgerAddEntryLatency between (0.0ms, 0.5ms]
    • quantile="0.5_1.0" is LedgerAddEntryLatency between (0.5ms, 1.0ms]
    • quantile="1.0_5.0" is LedgerAddEntryLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is LedgerAddEntryLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is LedgerAddEntryLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is LedgerAddEntryLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is LedgerAddEntryLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is LedgerAddEntryLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is LedgerAddEntryLatency between (200ms, 1s]
    | +| pulsar_ml_LedgerAddEntryLatencyBuckets_OVERFLOW | Gauge | The number of times the LedgerAddEntryLatency is longer than 1 second | +| pulsar_ml_MarkDeleteRate | Gauge | The rate of mark-delete ops/s | +| pulsar_ml_NumberOfMessagesInBacklog | Gauge | The number of backlog messages for all the consumers | +| pulsar_ml_ReadEntriesBytesRate | Gauge | The bytes/s rate of messages read | +| pulsar_ml_ReadEntriesErrors | Gauge | The number of readEntries requests that failed | +| pulsar_ml_ReadEntriesRate | Gauge | The msg/s rate of messages read | +| pulsar_ml_ReadEntriesSucceeded | Gauge | The number of readEntries requests that succeeded | +| pulsar_ml_StoredMessagesSize | Gauge | The total size of the messages in active ledgers (accounting for the multiple copies stored) | + +### Managed cursor acknowledgment state + +The acknowledgment state is persisted to the ledger first. When the acknowledgment state fails to be persisted to the ledger, it is persisted to ZooKeeper. To track the stats of acknowledgment, you can configure the metrics for the managed cursor. + +All the cursor acknowledgment state metrics are labelled with the following labels: + +- namespace: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +- ledger_name: `ledger_name=${pulsar_ledger_name}`. `${pulsar_ledger_name}` is the ledger name. + +- cursor_name: `cursor_name=${pulsar_cursor_name}`. `${pulsar_cursor_name}` is the cursor name.
+ +Name |Type |Description +|---|---|--- +brk_ml_cursor_persistLedgerSucceed(namespace="", ledger_name="", cursor_name:"")|Gauge|The number of acknowledgment states that is persistent to a ledger.| +brk_ml_cursor_persistLedgerErrors(namespace="", ledger_name="", cursor_name:"")|Gauge|The number of ledger errors occurred when acknowledgment states fail to be persistent to the ledger.| +brk_ml_cursor_persistZookeeperSucceed(namespace="", ledger_name="", cursor_name:"")|Gauge|The number of acknowledgment states that is persistent to ZooKeeper. +brk_ml_cursor_persistZookeeperErrors(namespace="", ledger_name="", cursor_name:"")|Gauge|The number of ledger errors occurred when acknowledgment states fail to be persistent to ZooKeeper. +brk_ml_cursor_nonContiguousDeletedMessagesRange(namespace="", ledger_name="", cursor_name:"")|Gauge|The number of non-contiguous deleted messages ranges. +brk_ml_cursor_writeLedgerSize(namespace="", ledger_name="", cursor_name:"")|Gauge|The size of write to ledger. **Note:** This metric is only available in 2.8.1 and later versions. +brk_ml_cursor_writeLedgerLogicalSize(namespace="", ledger_name="", cursor_name:"")|Gauge|The size of write to ledger (accounting for without replicas). **Note:** This metric is only available in 2.8.1 and later versions. +brk_ml_cursor_readLedgerSize(namespace="", ledger_name="", cursor_name:"")|Gauge|The size of read from ledger. **Note:** This metric is only available in 2.8.1 and later versions. + +### LoadBalancing metrics +All the loadbalancing metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- broker: broker=${broker}. ${broker} is the IP address of the broker +- metric: metric="loadBalancing". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_bandwidth_in_usage | Gauge | The broker inbound bandwidth usage (in percent).
| +| pulsar_lb_bandwidth_out_usage | Gauge | The broker outbound bandwidth usage (in percent). | +| pulsar_lb_cpu_usage | Gauge | The broker CPU usage (in percent). | +| pulsar_lb_directMemory_usage | Gauge | The broker process direct memory usage (in percent). | +| pulsar_lb_memory_usage | Gauge | The broker process memory usage (in percent). | + +#### BundleUnloading metrics +All the bundleUnloading metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- metric: metric="bundleUnloading". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_unload_broker_count | Counter | Unload broker count in this bundle unloading | +| pulsar_lb_unload_bundle_count | Counter | Bundle unload count in this bundle unloading | + +#### BundleSplit metrics +All the bundleSplit metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- metric: metric="bundlesSplit". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_bundles_split_count | Counter | bundle split count in this bundle splitting check interval | + +### Subscription metrics + +> Subscription metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `true`. + +All the subscription metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name.
+ +| Name | Type | Description | +|---|---|---| +| pulsar_subscription_back_log | Gauge | The total backlog of a subscription (messages). | +| pulsar_subscription_delayed | Gauge | The total number of messages are delayed to be dispatched for a subscription (messages). | +| pulsar_subscription_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_subscription_unacked_messages | Gauge | The total number of unacknowledged messages of a subscription (messages). | +| pulsar_subscription_blocked_on_unacked_messages | Gauge | Indicate whether a subscription is blocked on unacknowledged messages or not.
    • 1 means the subscription is blocked waiting for unacknowledged messages to be acknowledged.
    • 0 means the subscription is not blocked waiting for unacknowledged messages to be acknowledged.
    | +| pulsar_subscription_msg_rate_out | Gauge | The total message dispatch rate for a subscription (messages/second). | +| pulsar_subscription_msg_throughput_out | Gauge | The total message dispatch throughput for a subscription (bytes/second). | + +### Consumer metrics + +> Consumer metrics are only exposed when both `exposeTopicLevelMetricsInPrometheus` and `exposeConsumerLevelMetricsInPrometheus` are set to `true`. + +All the consumer metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. +- *consumer_name*: `consumer_name=${consumer_name}`. `${consumer_name}` is the topic consumer name. +- *consumer_id*: `consumer_id=${consumer_id}`. `${consumer_id}` is the topic consumer id. + +| Name | Type | Description | +|---|---|---| +| pulsar_consumer_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_consumer_unacked_messages | Gauge | The total number of unacknowledged messages of a consumer (messages). | +| pulsar_consumer_blocked_on_unacked_messages | Gauge | Indicate whether a consumer is blocked on unacknowledged messages or not.
    • 1 means the consumer is blocked waiting for unacknowledged messages to be acknowledged.
    • 0 means the consumer is not blocked waiting for unacknowledged messages to be acknowledged.
    | +| pulsar_consumer_msg_rate_out | Gauge | The total message dispatch rate for a consumer (messages/second). | +| pulsar_consumer_msg_throughput_out | Gauge | The total message dispatch throughput for a consumer (bytes/second). | +| pulsar_consumer_available_permits | Gauge | The available permits for a consumer. | + +### Managed ledger bookie client metrics + +All the managed ledger bookie client metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_completed_tasks_* | Gauge | The number of tasks that the scheduler executor has completed.
    The number of metrics is determined by the number of scheduler executor threads, which is configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_queue_* | Gauge | The number of tasks queued in the scheduler executor's queue.
    The number of metrics is determined by the number of scheduler executor threads, which is configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_total_tasks_* | Gauge | The total number of tasks the scheduler executor received.
    The number of metrics is determined by the number of scheduler executor threads, which is configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_task_execution | Summary | The scheduler task execution latency calculated in milliseconds. | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_task_queued | Summary | The scheduler task queued latency calculated in milliseconds. | + +### Token metrics + +All the token metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +|---|---|---| +| pulsar_expired_token_count | Counter | The number of expired tokens in Pulsar. | +| pulsar_expiring_token_minutes | Histogram | The remaining time of expiring tokens in minutes. | + +### Authentication metrics + +All the authentication metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *provider_name*: `provider_name=${provider_name}`. `${provider_name}` is the class name of the authentication provider. +- *auth_method*: `auth_method=${auth_method}`. `${auth_method}` is the authentication method of the authentication provider. +- *reason*: `reason=${reason}`. `${reason}` is the reason for failing authentication operation. (This label is only for `pulsar_authentication_failures_count`.) + +| Name | Type | Description | +|---|---|---| +| pulsar_authentication_success_count| Counter | The number of successful authentication operations. | +| pulsar_authentication_failures_count | Counter | The number of failing authentication operations. | + +### Connection metrics + +All the connection metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *broker*: `broker=${advertised_address}`. 
`${advertised_address}` is the advertised address of the broker. +- *metric*: `metric=${metric}`. `${metric}` is the connection metric collective name. + +| Name | Type | Description | +|---|---|---| +| pulsar_active_connections| Gauge | The number of active connections. | +| pulsar_connection_created_total_count | Gauge | The total number of connections. | +| pulsar_connection_create_success_count | Gauge | The number of successfully created connections. | +| pulsar_connection_create_fail_count | Gauge | The number of failed connections. | +| pulsar_connection_closed_total_count | Gauge | The total number of closed connections. | +| pulsar_broker_throttled_connections | Gauge | The number of throttled connections. | +| pulsar_broker_throttled_connections_global_limit | Gauge | The number of throttled connections because of per-connection limit. | + +### Jetty metrics + +> For a functions-worker running separately from brokers, its Jetty metrics are only exposed when `includeStandardPrometheusMetrics` is set to `true`. + +All the jetty metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +|---|---|---| +| jetty_requests_total | Counter | Number of requests. | +| jetty_requests_active | Gauge | Number of requests currently active. | +| jetty_requests_active_max | Gauge | Maximum number of requests that have been active at once. | +| jetty_request_time_max_seconds | Gauge | Maximum time spent handling requests. | +| jetty_request_time_seconds_total | Counter | Total time spent in all request handling. | +| jetty_dispatched_total | Counter | Number of dispatches. | +| jetty_dispatched_active | Gauge | Number of dispatches currently active. | +| jetty_dispatched_active_max | Gauge | Maximum number of active dispatches being handled. 
| +| jetty_dispatched_time_max | Gauge | Maximum time spent in dispatch handling. | +| jetty_dispatched_time_seconds_total | Counter | Total time spent in dispatch handling. | +| jetty_async_requests_total | Counter | Total number of async requests. | +| jetty_async_requests_waiting | Gauge | Currently waiting async requests. | +| jetty_async_requests_waiting_max | Gauge | Maximum number of waiting async requests. | +| jetty_async_dispatches_total | Counter | Number of requests that have been asynchronously dispatched. | +| jetty_expires_total | Counter | Number of async requests that have expired. | +| jetty_responses_total | Counter | Number of responses, labeled by status code. The `code` label can be "1xx", "2xx", "3xx", "4xx", or "5xx". | +| jetty_stats_seconds | Gauge | Time in seconds stats have been collected for. | +| jetty_responses_bytes_total | Counter | Total number of bytes across all responses. | + +## Pulsar Functions + +All the Pulsar Functions metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_function_processed_successfully_total | Counter | The total number of messages processed successfully. | +| pulsar_function_processed_successfully_total_1min | Counter | The total number of messages processed successfully in the last 1 minute. | +| pulsar_function_system_exceptions_total | Counter | The total number of system exceptions. | +| pulsar_function_system_exceptions_total_1min | Counter | The total number of system exceptions in the last 1 minute. | +| pulsar_function_user_exceptions_total | Counter | The total number of user exceptions. | +| pulsar_function_user_exceptions_total_1min | Counter | The total number of user exceptions in the last 1 minute. 
| +| pulsar_function_process_latency_ms | Summary | The process latency in milliseconds. | +| pulsar_function_process_latency_ms_1min | Summary | The process latency in milliseconds in the last 1 minute. | +| pulsar_function_last_invocation | Gauge | The timestamp of the last invocation of the function. | +| pulsar_function_received_total | Counter | The total number of messages received from source. | +| pulsar_function_received_total_1min | Counter | The total number of messages received from source in the last 1 minute. | +pulsar_function_user_metric_ | Summary|The user-defined metrics. + +## Connectors + +All the Pulsar connector metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +Connector metrics contain **source** metrics and **sink** metrics. + +- **Source** metrics + + | Name | Type | Description | + |---|---|---| + pulsar_source_written_total|Counter|The total number of records written to a Pulsar topic. + pulsar_source_written_total_1min|Counter|The total number of records written to a Pulsar topic in the last 1 minute. + pulsar_source_received_total|Counter|The total number of records received from source. + pulsar_source_received_total_1min|Counter|The total number of records received from source in the last 1 minute. + pulsar_source_last_invocation|Gauge|The timestamp of the last invocation of the source. + pulsar_source_source_exception|Gauge|The exception from a source. + pulsar_source_source_exceptions_total|Counter|The total number of source exceptions. + pulsar_source_source_exceptions_total_1min |Counter|The total number of source exceptions in the last 1 minute. + pulsar_source_system_exception|Gauge|The exception from system code. + pulsar_source_system_exceptions_total|Counter|The total number of system exceptions. 
+ pulsar_source_system_exceptions_total_1min|Counter|The total number of system exceptions in the last 1 minute. + pulsar_source_user_metric_ | Summary|The user-defined metrics. + +- **Sink** metrics + + | Name | Type | Description | + |---|---|---| + pulsar_sink_written_total|Counter| The total number of records processed by a sink. + pulsar_sink_written_total_1min|Counter| The total number of records processed by a sink in the last 1 minute. + pulsar_sink_received_total_1min|Counter| The total number of messages that a sink has received from Pulsar topics in the last 1 minute. + pulsar_sink_received_total|Counter| The total number of records that a sink has received from Pulsar topics. + pulsar_sink_last_invocation|Gauge|The timestamp of the last invocation of the sink. + pulsar_sink_sink_exception|Gauge|The exception from a sink. + pulsar_sink_sink_exceptions_total|Counter|The total number of sink exceptions. + pulsar_sink_sink_exceptions_total_1min |Counter|The total number of sink exceptions in the last 1 minute. + pulsar_sink_system_exception|Gauge|The exception from system code. + pulsar_sink_system_exceptions_total|Counter|The total number of system exceptions. + pulsar_sink_system_exceptions_total_1min|Counter|The total number of system exceptions in the last 1 minute. + pulsar_sink_user_metric_ | Summary|The user-defined metrics. + +## Proxy + +All the proxy metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *kubernetes_pod_name*: `kubernetes_pod_name=${kubernetes_pod_name}`. `${kubernetes_pod_name}` is the Kubernetes pod name. + +| Name | Type | Description | +|---|---|---| +| pulsar_proxy_active_connections | Gauge | Number of connections currently active in the proxy. | +| pulsar_proxy_new_connections | Counter | Counter of connections being opened in the proxy. 
| +| pulsar_proxy_rejected_connections | Counter | Counter for connections rejected due to throttling. | +| pulsar_proxy_binary_ops | Counter | Counter of proxy operations. | +| pulsar_proxy_binary_bytes | Counter | Counter of proxy bytes. | + +## Pulsar SQL Worker + +| Name | Type | Description | +|---|---|---| +| split_bytes_read | Counter | Number of bytes read from BookKeeper. | +| split_num_messages_deserialized | Counter | Number of messages deserialized. | +| split_num_record_deserialized | Counter | Number of records deserialized. | +| split_bytes_read_per_query | Summary | Total number of bytes read per query. | +| split_entry_deserialize_time | Summary | Time spent on deserializing entries. | +| split_entry_deserialize_time_per_query | Summary | Time spent on deserializing entries per query. | +| split_entry_queue_dequeue_wait_time | Summary | Time spent on waiting to get entry from entry queue because it is empty. | +| split_entry_queue_dequeue_wait_time_per_query | Summary | Total time spent on waiting to get entry from entry queue per query. | +| split_message_queue_dequeue_wait_time_per_query | Summary | Time spent on waiting to dequeue from message queue because it is empty per query. | +| split_message_queue_enqueue_wait_time | Summary | Time spent on waiting for message queue enqueue because the message queue is full. | +| split_message_queue_enqueue_wait_time_per_query | Summary | Time spent on waiting for message queue enqueue because the message queue is full per query. | +| split_num_entries_per_batch | Summary | Number of entries per batch. | +| split_num_entries_per_query | Summary | Number of entries per query. | +| split_num_messages_deserialized_per_entry | Summary | Number of messages deserialized per entry. | +| split_num_messages_deserialized_per_query | Summary | Number of messages deserialized per query. | +| split_read_attempts | Summary | Number of read attempts (fail if queues are full). 
| +| split_read_attempts_per_query | Summary | Number of read attempts per query. | +| split_read_latency_per_batch | Summary | Latency of reads per batch. | +| split_read_latency_per_query | Summary | Total read latency per query. | +| split_record_deserialize_time | Summary | Time spent on deserializing message to record. For example, Avro, JSON, and so on. | +| split_record_deserialize_time_per_query | Summary | Time spent on deserializing message to record per query. | +| split_total_execution_time | Summary | The total execution time. | + +## Pulsar transaction + +All the transaction metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *coordinator_id*: `coordinator_id=${coordinator_id}`. `${coordinator_id}` is the coordinator id. + +| Name | Type | Description | +|---|---|---| +| pulsar_txn_active_count | Gauge | Number of active transactions. | +| pulsar_txn_created_count | Counter | Number of created transactions. | +| pulsar_txn_committed_count | Counter | Number of committed transactions. | +| pulsar_txn_aborted_count | Counter | Number of aborted transactions of this coordinator. | +| pulsar_txn_timeout_count | Counter | Number of timeout transactions. | +| pulsar_txn_append_log_count | Counter | Number of append transaction logs. | +| pulsar_txn_execution_latency_le_* | Histogram | Transaction execution latency.
    Available latencies are as follows:
    • latency="10" is TransactionExecutionLatency between (0ms, 10ms]
    • latency="20" is TransactionExecutionLatency between (10ms, 20ms]
    • latency="50" is TransactionExecutionLatency between (20ms, 50ms]
    • latency="100" is TransactionExecutionLatency between (50ms, 100ms]
    • latency="500" is TransactionExecutionLatency between (100ms, 500ms]
    • latency="1000" is TransactionExecutionLatency between (500ms, 1000ms]
    • latency="5000" is TransactionExecutionLatency between (1s, 5s]
    • latency="15000" is TransactionExecutionLatency between (5s, 15s]
    • latency="30000" is TransactionExecutionLatency between (15s, 30s]
    • latency="60000" is TransactionExecutionLatency between (30s, 60s]
    • latency="300000" is TransactionExecutionLatency between (1m, 5m]
    • latency="1500000" is TransactionExecutionLatency between (5m, 15m]
    • latency="3000000" is TransactionExecutionLatency between (15m, 30m]
    • latency="overflow" is TransactionExecutionLatency between (30m, ∞)
    | diff --git a/site2/website/versioned_docs/version-2.8.x/reference-pulsar-admin.md b/site2/website/versioned_docs/version-2.8.x/reference-pulsar-admin.md new file mode 100644 index 0000000000000..832a03e71afbd --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/reference-pulsar-admin.md @@ -0,0 +1,3337 @@ +--- +id: reference-pulsar-admin +title: Pulsar admin CLI +sidebar_label: "Pulsar Admin CLI" +original_id: reference-pulsar-admin +--- + +:::tip + +This page is deprecated and not updated anymore. For the latest and complete information about `pulsar-admin`, including commands, flags, descriptions, and more, see [pulsar-admin doc](https://pulsar.apache.org/tools/pulsar-admin/). + +::: + +The `pulsar-admin` tool enables you to manage Pulsar installations, including clusters, brokers, namespaces, tenants, and more. + +Usage + +```bash + +$ pulsar-admin command + +``` + +Commands +* `broker-stats` +* `brokers` +* `clusters` +* `functions` +* `functions-worker` +* `namespaces` +* `ns-isolation-policy` +* `sources` + + For more information, see [here](io-cli.md#sources) +* `sinks` + + For more information, see [here](io-cli.md#sinks) +* `topics` +* `tenants` +* `resource-quotas` +* `schemas` + +## `broker-stats` + +Operations to collect broker statistics + +```bash + +$ pulsar-admin broker-stats subcommand + +``` + +Subcommands +* `allocator-stats` +* `topics(destinations)` +* `mbeans` +* `monitoring-metrics` +* `load-report` + + +### `allocator-stats` + +Dump allocator stats + +Usage + +```bash + +$ pulsar-admin broker-stats allocator-stats allocator-name + +``` + +### `topics(destinations)` + +Dump topic stats + +Usage + +```bash + +$ pulsar-admin broker-stats topics options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + +### `mbeans` + +Dump Mbean stats + +Usage + +```bash + +$ pulsar-admin broker-stats mbeans options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, 
`--indent`|Indent JSON output|false| + + +### `monitoring-metrics` + +Dump metrics for monitoring + +Usage + +```bash + +$ pulsar-admin broker-stats monitoring-metrics options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + + +### `load-report` + +Dump broker load-report + +Usage + +```bash + +$ pulsar-admin broker-stats load-report + +``` + +## `brokers` + +Operations about brokers + +```bash + +$ pulsar-admin brokers subcommand + +``` + +Subcommands +* `list` +* `namespaces` +* `update-dynamic-config` +* `list-dynamic-config` +* `get-all-dynamic-config` +* `get-internal-config` +* `get-runtime-config` +* `healthcheck` + +### `list` +List active brokers of the cluster + +Usage + +```bash + +$ pulsar-admin brokers list cluster-name + +``` + +### `leader-broker` +Get the information of the leader broker + +Usage + +```bash + +$ pulsar-admin brokers leader-broker + +``` + +### `namespaces` +List namespaces owned by the broker + +Usage + +```bash + +$ pulsar-admin brokers namespaces cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--url`|The URL for the broker|| + + +### `update-dynamic-config` +Update a broker's dynamic service configuration + +Usage + +```bash + +$ pulsar-admin brokers update-dynamic-config options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| +|`--value`|Value for the configuration parameter value specified using the `--config` flag|| + + +### `list-dynamic-config` +Get list of updatable configuration name + +Usage + +```bash + +$ pulsar-admin brokers list-dynamic-config + +``` + +### `delete-dynamic-config` +Delete dynamic-serviceConfiguration of broker + +Usage + +```bash + +$ pulsar-admin brokers delete-dynamic-config options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| + + +### `get-all-dynamic-config` +Get 
all overridden dynamic-configuration values + +Usage + +```bash + +$ pulsar-admin brokers get-all-dynamic-config + +``` + +### `get-internal-config` +Get internal configuration information + +Usage + +```bash + +$ pulsar-admin brokers get-internal-config + +``` + +### `get-runtime-config` +Get runtime configuration values + +Usage + +```bash + +$ pulsar-admin brokers get-runtime-config + +``` + +### `healthcheck` +Run a health check against the broker + +Usage + +```bash + +$ pulsar-admin brokers healthcheck + +``` + +## `clusters` +Operations about clusters + +Usage + +```bash + +$ pulsar-admin clusters subcommand + +``` + +Subcommands +* `get` +* `create` +* `update` +* `delete` +* `list` +* `update-peer-clusters` +* `get-peer-clusters` +* `get-failure-domain` +* `create-failure-domain` +* `update-failure-domain` +* `delete-failure-domain` +* `list-failure-domains` + + +### `get` +Get the configuration data for the specified cluster + +Usage + +```bash + +$ pulsar-admin clusters get cluster-name + +``` + +### `create` +Provisions a new cluster. This operation requires Pulsar super-user privileges. 
+ +Usage + +```bash + +$ pulsar-admin clusters create cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `update` +Update the configuration for a cluster + +Usage + +```bash + +$ pulsar-admin clusters update cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `delete` +Deletes an existing cluster + +Usage + +```bash + +$ pulsar-admin clusters delete cluster-name + +``` + +### `list` +List the existing clusters + +Usage + +```bash + +$ pulsar-admin clusters list + +``` + +### `update-peer-clusters` +Update peer cluster names + +Usage + +```bash + +$ pulsar-admin clusters update-peer-clusters cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--peer-clusters`|Comma separated peer cluster names (Pass empty string "" to delete list)|| + +### `get-peer-clusters` +Get list of peer clusters + +Usage + +```bash + +$ pulsar-admin clusters get-peer-clusters + +``` + +### `get-failure-domain` +Get the configuration brokers of a failure domain + +Usage + +```bash + +$ pulsar-admin clusters get-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `create-failure-domain` +Create a new failure domain for a cluster (updates it if already created) + +Usage + +```bash + +$ pulsar-admin clusters create-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma 
separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `update-failure-domain` +Update failure domain for a cluster (creates a new one if not exist) + +Usage + +```bash + +$ pulsar-admin clusters update-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `delete-failure-domain` +Delete an existing failure domain + +Usage + +```bash + +$ pulsar-admin clusters delete-failure-domain cluster-name options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `list-failure-domains` +List the existing failure domains for a cluster + +Usage + +```bash + +$ pulsar-admin clusters list-failure-domains cluster-name + +``` + +## `functions` + +A command-line interface for Pulsar Functions + +Usage + +```bash + +$ pulsar-admin functions subcommand + +``` + +Subcommands +* `localrun` +* `create` +* `delete` +* `update` +* `get` +* `restart` +* `stop` +* `start` +* `status` +* `stats` +* `list` +* `querystate` +* `putstate` +* `trigger` + + +### `localrun` +Run the Pulsar Function locally (rather than deploying it to the Pulsar cluster) + + +Usage + +```bash + +$ pulsar-admin functions localrun options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar 
source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--broker-service-url `|The URL of the Pulsar broker|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--client-auth-params`|Client authentication param|| +|`--client-auth-plugin`|Client authentication plugin using which function-process can connect to broker|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--hostname-verification-enabled`|Enable hostname verification|false| +|`--instance-id-offset`|Start the instanceIds from this offset|0| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--state-storage-service-url`|The URL for the state storage service. By default, it is set to the service URL of the Apache BookKeeper. This service URL must be added manually when the Pulsar Function runs locally. || +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--tls-allow-insecure`|Allow insecure tls connection|false| +|`--tls-trust-cert-path`|The tls trust cert file path|| +|`--use-tls`|Use tls connection|false| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `create` +Create a Pulsar Function in cluster mode (i.e. 
deploy it on a Pulsar cluster) + +Usage + +``` + +$ pulsar-admin functions create options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `delete` +Delete a Pulsar Function that's running on a Pulsar cluster + +Usage + +```bash + +$ pulsar-admin functions delete options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `update` +Update a Pulsar Function that's been deployed to a Pulsar cluster + +Usage + 
+```bash + +$ pulsar-admin functions update options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `get` +Fetch information about a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions get options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `restart` +Restart function instance + +Usage + +```bash + +$ pulsar-admin functions restart options + +``` + 
+Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (restart all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stop` +Stops function instance + +Usage + +```bash + +$ pulsar-admin functions stop options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (stop all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `start` +Starts a stopped function instance + +Usage + +```bash + +$ pulsar-admin functions start options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (start all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `status` +Check the current status of a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions status options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (Get-status of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stats` +Get the current stats of a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions stats options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The 
function instanceId (Get-stats of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + +### `list` +List all of the Pulsar Functions running under a specific tenant and namespace + +Usage + +```bash + +$ pulsar-admin functions list options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `querystate` +Fetch the current state associated with a Pulsar Function running in cluster mode + +Usage + +```bash + +$ pulsar-admin functions querystate options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`-k`, `--key`|The key for the state you want to fetch|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`-w`, `--watch`|Watch for changes in the value associated with a key for a Pulsar Function|false| + +### `putstate` +Put a key/value pair to the state associated with a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions putstate options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the Pulsar Function|| +|`--name`|The name of a Pulsar Function|| +|`--namespace`|The namespace of a Pulsar Function|| +|`--tenant`|The tenant of a Pulsar Function|| +|`-s`, `--state`|The FunctionState that needs to be put|| + +### `trigger` +Triggers the specified Pulsar Function with a supplied value + +Usage + +```bash + +$ pulsar-admin functions trigger options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`--topic`|The specific topic name that the 
function consumes from that you want to inject the data to|| +|`--trigger-file`|The path to the file that contains the data with which you'd like to trigger the function|| +|`--trigger-value`|The value with which you want to trigger the function|| + + +## `functions-worker` +Operations to collect function-worker statistics + +```bash + +$ pulsar-admin functions-worker subcommand + +``` + +Subcommands + +* `function-stats` +* `get-cluster` +* `get-cluster-leader` +* `get-function-assignments` +* `monitoring-metrics` + +### `function-stats` + +Dump all functions stats running on this broker + +Usage + +```bash + +$ pulsar-admin functions-worker function-stats + +``` + +### `get-cluster` + +Get all workers belonging to this cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-cluster + +``` + +### `get-cluster-leader` + +Get the leader of the worker cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-cluster-leader + +``` + +### `get-function-assignments` + +Get the assignments of the functions across the worker cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-function-assignments + +``` + +### `monitoring-metrics` + +Dump metrics for Monitoring + +Usage + +```bash + +$ pulsar-admin functions-worker monitoring-metrics + +``` + +## `namespaces` + +Operations for managing namespaces + +```bash + +$ pulsar-admin namespaces subcommand + +``` + +Subcommands +* `list` +* `topics` +* `policies` +* `create` +* `delete` +* `set-deduplication` +* `set-auto-topic-creation` +* `remove-auto-topic-creation` +* `set-auto-subscription-creation` +* `remove-auto-subscription-creation` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `grant-subscription-permission` +* `revoke-subscription-permission` +* `set-clusters` +* `get-clusters` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` +* `set-persistence` +* `get-message-ttl` +* `set-message-ttl` +* `remove-message-ttl` +* 
`get-anti-affinity-group` +* `set-anti-affinity-group` +* `get-anti-affinity-namespaces` +* `delete-anti-affinity-group` +* `get-retention` +* `set-retention` +* `unload` +* `split-bundle` +* `set-dispatch-rate` +* `get-dispatch-rate` +* `set-replicator-dispatch-rate` +* `get-replicator-dispatch-rate` +* `set-subscribe-rate` +* `get-subscribe-rate` +* `set-subscription-dispatch-rate` +* `get-subscription-dispatch-rate` +* `clear-backlog` +* `unsubscribe` +* `set-encryption-required` +* `set-delayed-delivery` +* `get-delayed-delivery` +* `set-subscription-auth-mode` +* `get-max-producers-per-topic` +* `set-max-producers-per-topic` +* `get-max-consumers-per-topic` +* `set-max-consumers-per-topic` +* `get-max-consumers-per-subscription` +* `set-max-consumers-per-subscription` +* `get-max-unacked-messages-per-subscription` +* `set-max-unacked-messages-per-subscription` +* `get-max-unacked-messages-per-consumer` +* `set-max-unacked-messages-per-consumer` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `get-offload-threshold` +* `set-offload-threshold` +* `get-offload-deletion-lag` +* `set-offload-deletion-lag` +* `clear-offload-deletion-lag` +* `get-schema-autoupdate-strategy` +* `set-schema-autoupdate-strategy` +* `set-offload-policies` +* `get-offload-policies` +* `set-max-subscriptions-per-topic` +* `get-max-subscriptions-per-topic` +* `remove-max-subscriptions-per-topic` + + +### `list` +Get the namespaces for a tenant + +Usage + +```bash + +$ pulsar-admin namespaces list tenant-name + +``` + +### `topics` +Get the list of topics for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces topics tenant/namespace + +``` + +### `policies` +Get the configuration policies of a namespace + +Usage + +```bash + +$ pulsar-admin namespaces policies tenant/namespace + +``` + +### `create` +Create a new namespace + +Usage + +```bash + +$ pulsar-admin namespaces create tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| 
+|`-b`, `--bundles`|The number of bundles to activate|0| +|`-c`, `--clusters`|List of clusters this namespace will be assigned|| + + +### `delete` +Deletes a namespace. The namespace needs to be empty + +Usage + +```bash + +$ pulsar-admin namespaces delete tenant/namespace + +``` + +### `set-deduplication` +Enable or disable message deduplication on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-deduplication tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified namespace|false| +|`--disable`, `-d`|Disable message deduplication on the specified namespace|false| + +### `set-auto-topic-creation` +Enable or disable autoTopicCreation for a namespace, overriding broker settings + +Usage + +```bash + +$ pulsar-admin namespaces set-auto-topic-creation tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable allowAutoTopicCreation on namespace|false| +|`--disable`, `-d`|Disable allowAutoTopicCreation on namespace|false| +|`--type`, `-t`|Type of topic to be auto-created. 
Possible values: (partitioned, non-partitioned)|non-partitioned| +|`--num-partitions`, `-n`|Default number of partitions of topic to be auto-created, applicable to partitioned topics only|| + +### `remove-auto-topic-creation` +Remove override of autoTopicCreation for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces remove-auto-topic-creation tenant/namespace + +``` + +### `set-auto-subscription-creation` +Enable autoSubscriptionCreation for a namespace, overriding broker settings + +Usage + +```bash + +$ pulsar-admin namespaces set-auto-subscription-creation tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable allowAutoSubscriptionCreation on namespace|false| + +### `remove-auto-subscription-creation` +Remove override of autoSubscriptionCreation for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces remove-auto-subscription-creation tenant/namespace + +``` + +### `permissions` +Get the permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces permissions tenant/namespace + +``` + +### `grant-permission` +Grant permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces grant-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces revoke-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| + +### `grant-subscription-permission` +Grant permissions to access subscription admin-api + +Usage + +```bash + +$ pulsar-admin namespaces grant-subscription-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--roles`|The client 
roles to which to grant the permissions (comma separated roles)|| +|`--subscription`|The subscription name for which permission will be granted to roles|| + +### `revoke-subscription-permission` +Revoke permissions to access subscription admin-api + +Usage + +```bash + +$ pulsar-admin namespaces revoke-subscription-permission tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role from which to revoke the permissions|| +|`--subscription`|The subscription name for which permission will be revoked from roles|| + +### `set-clusters` +Set replication clusters for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-clusters tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--clusters`|Replication clusters ID list (comma-separated values)|| + + +### `get-clusters` +Get replication clusters for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-clusters tenant/namespace + +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-backlog-quotas tenant/namespace + +``` + +### `set-backlog-quota` +Set a backlog quota policy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-backlog-quota tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)|| +|`-p`, `--policy`|The retention policy to enforce when the limit is reached. 
The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`|| + +Example + +```bash + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ +--limit 2G \ +--policy producer_request_hold + +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a namespace + +Usage + +```bash + +$ pulsar-admin namespaces remove-backlog-quota tenant/namespace + +``` + +### `get-persistence` +Get the persistence policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-persistence tenant/namespace + +``` + +### `set-persistence` +Set the persistence policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-persistence tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-a`, `--bookkeeper-ack-quorum`|The number of acks (guaranteed copies) to wait for each entry|0| +|`-e`, `--bookkeeper-ensemble`|The number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + + +### `get-message-ttl` +Get the message TTL for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-message-ttl tenant/namespace + +``` + +### `set-message-ttl` +Set the message TTL for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-message-ttl tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL in seconds. When the value is set to `0`, TTL is disabled. TTL is disabled by default. |0| + +### `remove-message-ttl` +Remove the message TTL for a namespace. 
+ +Usage + +```bash + +$ pulsar-admin namespaces remove-message-ttl tenant/namespace + +``` + +### `get-anti-affinity-group` +Get Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-anti-affinity-group tenant/namespace + +``` + +### `set-anti-affinity-group` +Set Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-anti-affinity-group tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-g`, `--group`|Anti-affinity group name|| + +### `get-anti-affinity-namespaces` +Get Anti-affinity namespaces grouped with the given anti-affinity group name + +Usage + +```bash + +$ pulsar-admin namespaces get-anti-affinity-namespaces options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--cluster`|Cluster name|| +|`-g`, `--group`|Anti-affinity group name|| +|`-p`, `--tenant`|Tenant is only used for authorization. Client has to be admin of any of the tenant to access this api|| + +### `delete-anti-affinity-group` +Remove Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces delete-anti-affinity-group tenant/namespace + +``` + +### `get-retention` +Get the retention policy that is applied to each topic within the specified namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-retention tenant/namespace + +``` + +### `set-retention` +Set the retention policy for each topic within the specified namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-retention tenant/namespace + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|The retention size limits (for example 10M, 16G or 3T) for each topic in the namespace. 0 means no retention and -1 means infinite size retention|| +|`-t`, `--time`|The retention time in minutes, hours, days, or weeks. Examples: 100m, 13h, 2d, 5w. 
0 means no retention and -1 means infinite time retention|| + + +### `unload` +Unload a namespace or namespace bundle from the current serving broker. + +Usage + +```bash + +$ pulsar-admin namespaces unload tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| + +### `split-bundle` +Split a namespace-bundle from the current serving broker + +Usage + +```bash + +$ pulsar-admin namespaces split-bundle tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-u`, `--unload`|Unload newly split bundles after splitting old bundle|false| + +### `set-dispatch-rate` +Set message-dispatch-rate for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-dispatch-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (if not passed, it is overwritten with the default of -1)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in seconds (if not passed, it is overwritten with the default of 1 second)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (if not passed, it is overwritten with the default of -1)|-1| + +### `get-dispatch-rate` +Get configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-dispatch-rate tenant/namespace + +``` + +### `set-replicator-dispatch-rate` +Set replicator message-dispatch-rate for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-replicator-dispatch-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (if not passed, it is overwritten with the default of -1)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate 
period in seconds (if not passed, it is overwritten with the default of 1 second)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (if not passed, it is overwritten with the default of -1)|-1| + +### `get-replicator-dispatch-rate` +Get replicator configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-replicator-dispatch-rate tenant/namespace + +``` + +### `set-subscribe-rate` +Set subscribe-rate per consumer for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-subscribe-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-sr`, `--subscribe-rate`|The subscribe rate (if not passed, it is overwritten with the default of -1)|-1| +|`-st`, `--subscribe-rate-period`|The subscribe rate period in seconds (if not passed, it is overwritten with the default of 30 seconds)|30| + +### `get-subscribe-rate` +Get configured subscribe-rate per consumer for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-subscribe-rate tenant/namespace + +``` + +### `set-subscription-dispatch-rate` +Set subscription message-dispatch-rate for all subscriptions of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-subscription-dispatch-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (if not passed, it is overwritten with the default of -1)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in seconds (if not passed, it is overwritten with the default of 1 second)|1| +|`-md`, `--sub-msg-dispatch-rate`|The message dispatch rate (if not passed, it is overwritten with the default of -1)|-1| + +### `get-subscription-dispatch-rate` +Get subscription configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-subscription-dispatch-rate tenant/namespace + +``` + +### 
`clear-backlog` +Clear the backlog for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces clear-backlog tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-force`, `--force`|Whether to force a clear backlog without prompt|false| +|`-s`, `--sub`|The subscription name|| + + +### `unsubscribe` +Unsubscribe the given subscription on all destinations on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces unsubscribe tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-s`, `--sub`|The subscription name|| + +### `set-encryption-required` +Enable or disable message encryption required for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-encryption-required tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable message encryption required|false| +|`-e`, `--enable`|Enable message encryption required|false| + +### `set-delayed-delivery` +Set the delayed delivery policy on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-delayed-delivery tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable delayed delivery messages|false| +|`-e`, `--enable`|Enable delayed delivery messages|false| +|`-t`, `--time`|The tick time for when retrying on delayed delivery messages|1s| + + +### `get-delayed-delivery` +Get the delayed delivery policy on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-delayed-delivery tenant/namespace + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-t`, `--time`|The tick time for when retrying on delayed delivery messages|1s| + + +### `set-subscription-auth-mode` +Set subscription auth mode on a namespace + +Usage + 
+```bash + +$ pulsar-admin namespaces set-subscription-auth-mode tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-m`, `--subscription-auth-mode`|Subscription authorization mode for Pulsar policies. Valid options are: [None, Prefix]|| + +### `get-max-producers-per-topic` +Get maxProducersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-producers-per-topic tenant/namespace + +``` + +### `set-max-producers-per-topic` +Set maxProducersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-producers-per-topic tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-p`, `--max-producers-per-topic`|maxProducersPerTopic for a namespace|0| + +### `get-max-consumers-per-topic` +Get maxConsumersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-consumers-per-topic tenant/namespace + +``` + +### `set-max-consumers-per-topic` +Set maxConsumersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-consumers-per-topic tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-topic`|maxConsumersPerTopic for a namespace|0| + +### `get-max-consumers-per-subscription` +Get maxConsumersPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-consumers-per-subscription tenant/namespace + +``` + +### `set-max-consumers-per-subscription` +Set maxConsumersPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-consumers-per-subscription tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-subscription`|maxConsumersPerSubscription for a namespace|0| + +### `get-max-unacked-messages-per-subscription` +Get maxUnackedMessagesPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces 
get-max-unacked-messages-per-subscription tenant/namespace + +``` + +### `set-max-unacked-messages-per-subscription` +Set maxUnackedMessagesPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-unacked-messages-per-subscription tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-unacked-messages-per-subscription`|maxUnackedMessagesPerSubscription for a namespace|-1| + +### `get-max-unacked-messages-per-consumer` +Get maxUnackedMessagesPerConsumer for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-unacked-messages-per-consumer tenant/namespace + +``` + +### `set-max-unacked-messages-per-consumer` +Set maxUnackedMessagesPerConsumer for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-unacked-messages-per-consumer tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-unacked-messages-per-consumer`|maxUnackedMessagesPerConsumer for a namespace|-1| + + +### `get-compaction-threshold` +Get compactionThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-compaction-threshold tenant/namespace + +``` + +### `set-compaction-threshold` +Set compactionThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-compaction-threshold tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-t`, `--threshold`|Maximum number of bytes in a topic backlog before compaction is triggered (eg: 10M, 16G, 3T). 
0 disables automatic compaction|0| + + +### `get-offload-threshold` +Get offloadThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-threshold tenant/namespace + +``` + +### `set-offload-threshold` +Set offloadThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-offload-threshold tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|Maximum number of bytes stored in the pulsar cluster for a topic before data will start being automatically offloaded to longterm storage (eg: 10M, 16G, 3T, 100). Negative values disable automatic offload. 0 triggers offloading as soon as possible.|-1| + +### `get-offload-deletion-lag` +Get offloadDeletionLag, in minutes, for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-deletion-lag tenant/namespace + +``` + +### `set-offload-deletion-lag` +Set offloadDeletionLag for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-offload-deletion-lag tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-l`, `--lag`|Duration to wait after offloading a ledger segment, before deleting the copy of that segment from cluster local storage. 
(eg: 10m, 5h, 3d, 2w).|-1| + +### `clear-offload-deletion-lag` +Clear offloadDeletionLag for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces clear-offload-deletion-lag tenant/namespace + +``` + +### `get-schema-autoupdate-strategy` +Get the schema auto-update strategy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-schema-autoupdate-strategy tenant/namespace + +``` + +### `set-schema-autoupdate-strategy` +Set the schema auto-update strategy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-schema-autoupdate-strategy tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--compatibility`|Compatibility level required for new schemas created via a Producer. Possible values (Full, Backward, Forward, None).|Full| +|`-d`, `--disabled`|Disable automatic schema updates.|false| + +### `get-publish-rate` +Get the message publish rate for each topic in a namespace, in bytes as well as messages per second + +Usage + +```bash + +$ pulsar-admin namespaces get-publish-rate tenant/namespace + +``` + +### `set-publish-rate` +Set the message publish rate for each topic in a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-publish-rate tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-m`, `--msg-publish-rate`|Threshold for number of messages per second per topic in the namespace (-1 implies not set, 0 for no limit).|-1| +|`-b`, `--byte-publish-rate`|Threshold for number of bytes per second per topic in the namespace (-1 implies not set, 0 for no limit).|-1| + +### `set-offload-policies` +Set the offload policy for a namespace. 
+ +Usage + +```bash + +$ pulsar-admin namespaces set-offload-policies tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-d`, `--driver`|Driver to use to offload old data to long term storage,(Possible values: S3, aws-s3, google-cloud-storage)|| +|`-r`, `--region`|The long term storage region|| +|`-b`, `--bucket`|Bucket to place offloaded ledger into|| +|`-e`, `--endpoint`|Alternative endpoint to connect to|| +|`-i`, `--aws-id`|AWS Credential Id to use when using driver S3 or aws-s3|| +|`-s`, `--aws-secret`|AWS Credential Secret to use when using driver S3 or aws-s3|| +|`-ro`, `--s3-role`|S3 Role used for STSAssumeRoleSessionCredentialsProvider using driver S3 or aws-s3|| +|`-rsn`, `--s3-role-session-name`|S3 role session name used for STSAssumeRoleSessionCredentialsProvider using driver S3 or aws-s3|| +|`-mbs`, `--maxBlockSize`|Max block size|64MB| +|`-rbs`, `--readBufferSize`|Read buffer size|1MB| +|`-oat`, `--offloadAfterThreshold`|Offload after threshold size (eg: 1M, 5M)|| +|`-oae`, `--offloadAfterElapsed`|Offload after elapsed in millis (or minutes, hours,days,weeks eg: 100m, 3h, 2d, 5w).|| + +### `get-offload-policies` +Get the offload policy for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-policies tenant/namespace + +``` + +### `set-max-subscriptions-per-topic` +Set the maximum subscription per topic for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces set-max-subscriptions-per-topic tenant/namespace + +``` + +### `get-max-subscriptions-per-topic` +Get the maximum subscription per topic for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces get-max-subscriptions-per-topic tenant/namespace + +``` + +### `remove-max-subscriptions-per-topic` +Remove the maximum subscription per topic for a namespace. 
+ +Usage + +```bash + +$ pulsar-admin namespaces remove-max-subscriptions-per-topic tenant/namespace + +``` + +## `ns-isolation-policy` +Operations for managing namespace isolation policies. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy subcommand + +``` + +Subcommands +* `set` +* `get` +* `list` +* `delete` +* `brokers` +* `broker` + +### `set` +Create/update a namespace isolation policy for a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy set cluster-name policy-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--auto-failover-policy-params`|Comma-separated name=value auto failover policy parameters|[]| +|`--auto-failover-policy-type`|Auto failover policy type name. Currently available options: min_available.|[]| +|`--namespaces`|Comma-separated namespaces regex list|[]| +|`--primary`|Comma-separated primary broker regex list|[]| +|`--secondary`|Comma-separated secondary broker regex list|[]| + + +### `get` +Get the namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy get cluster-name policy-name + +``` + +### `list` +List all namespace isolation policies of a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy list cluster-name + +``` + +### `delete` +Delete a namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy delete cluster-name policy-name + +``` + +### `brokers` +List all brokers with namespace-isolation policies attached to them. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy brokers cluster-name + +``` + +### `broker` +Get broker with namespace-isolation policies attached to it. This operation requires Pulsar superuser privileges. 
+ +Usage + +```bash + +$ pulsar-admin ns-isolation-policy broker cluster-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--broker`|Broker name to get namespace-isolation policies attached to it|| + +## `topics` +Operations for managing Pulsar topics (both persistent and non-persistent). + +Usage + +```bash + +$ pulsar-admin topics subcommand + +``` + +From Pulsar 2.7.0, some namespace-level policies are available on topic level. To enable topic-level policy in Pulsar, you need to configure the following parameters in the `broker.conf` file. + +```shell + +systemTopicEnabled=true +topicLevelPoliciesEnabled=true + +``` + +Subcommands +* `compact` +* `compaction-status` +* `offload` +* `offload-status` +* `create-partitioned-topic` +* `create-missed-partitions` +* `delete-partitioned-topic` +* `create` +* `get-partitioned-topic-metadata` +* `update-partitioned-topic` +* `list-partitioned-topics` +* `list` +* `terminate` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `lookup` +* `bundle-range` +* `delete` +* `unload` +* `create-subscription` +* `subscriptions` +* `unsubscribe` +* `stats` +* `stats-internal` +* `info-internal` +* `partitioned-stats` +* `partitioned-stats-internal` +* `skip` +* `clear-backlog` +* `expire-messages` +* `expire-messages-all-subscriptions` +* `peek-messages` +* `reset-cursor` +* `get-message-by-id` +* `last-message-id` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` +* `set-persistence` +* `remove-persistence` +* `get-message-ttl` +* `set-message-ttl` +* `remove-message-ttl` +* `get-deduplication` +* `set-deduplication` +* `remove-deduplication` +* `get-retention` +* `set-retention` +* `remove-retention` +* `get-dispatch-rate` +* `set-dispatch-rate` +* `remove-dispatch-rate` +* `get-max-unacked-messages-per-subscription` +* `set-max-unacked-messages-per-subscription` +* `remove-max-unacked-messages-per-subscription` +* 
`get-max-unacked-messages-per-consumer` +* `set-max-unacked-messages-per-consumer` +* `remove-max-unacked-messages-per-consumer` +* `get-delayed-delivery` +* `set-delayed-delivery` +* `remove-delayed-delivery` +* `get-max-producers` +* `set-max-producers` +* `remove-max-producers` +* `get-max-consumers` +* `set-max-consumers` +* `remove-max-consumers` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `remove-compaction-threshold` +* `get-offload-policies` +* `set-offload-policies` +* `remove-offload-policies` +* `get-inactive-topic-policies` +* `set-inactive-topic-policies` +* `remove-inactive-topic-policies` +* `set-max-subscriptions` +* `get-max-subscriptions` +* `remove-max-subscriptions` + +### `compact` +Run compaction on the specified topic (persistent topics only) + +Usage + +``` + +$ pulsar-admin topics compact persistent://tenant/namespace/topic + +``` + +### `compaction-status` +Check the status of a topic compaction (persistent topics only) + +Usage + +```bash + +$ pulsar-admin topics compaction-status persistent://tenant/namespace/topic + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-w`, `--wait-complete`|Wait for compaction to complete|false| + + +### `offload` +Trigger offload of data from a topic to long-term storage (e.g. Amazon S3) + +Usage + +```bash + +$ pulsar-admin topics offload persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--size-threshold`|The maximum amount of data to keep in BookKeeper for the specific topic|| + + +### `offload-status` +Check the status of data offloading from a topic to long-term storage + +Usage + +```bash + +$ pulsar-admin topics offload-status persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-w`, `--wait-complete`|Wait for offloading to complete|false| + + +### `create-partitioned-topic` +Create a partitioned topic. 
A partitioned topic must be created before producers can publish to it. + +:::note + +By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to avoid generating trash data. +To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +For more information about these two parameters, see [here](reference-configuration.md#broker). + +::: + +Usage + +```bash + +$ pulsar-admin topics create-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `create-missed-partitions` +Try to create the missing partitions of a partitioned topic. Use this command to repair a partitioned topic whose partitions have not been created, +for example when topic auto creation is disabled. + +Usage + +```bash + +$ pulsar-admin topics create-missed-partitions persistent://tenant/namespace/topic + +``` + +### `delete-partitioned-topic` +Delete a partitioned topic. This will also delete all the partitions of the topic if they exist. + +Usage + +```bash + +$ pulsar-admin topics delete-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `create` +Creates a non-partitioned topic. A non-partitioned topic must explicitly be created by the user if allowAutoTopicCreation or createIfMissing is disabled. + +:::note + +By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to avoid generating trash data. +To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +For more information about these two parameters, see [here](reference-configuration.md#broker). 
+ +::: + +Usage + +```bash + +$ pulsar-admin topics create {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `get-partitioned-topic-metadata` +Get the partitioned topic metadata. If the topic is not created or is a non-partitioned topic, this will return an empty topic with zero partitions. + +Usage + +```bash + +$ pulsar-admin topics get-partitioned-topic-metadata {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `update-partitioned-topic` +Update existing non-global partitioned topic. New updating number of partitions must be greater than existing number of partitions. + +Usage + +```bash + +$ pulsar-admin topics update-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `list-partitioned-topics` +Get the list of partitioned topics under a namespace. + +Usage + +```bash + +$ pulsar-admin topics list-partitioned-topics tenant/namespace + +``` + +### `list` +Get the list of topics under a namespace + +Usage + +``` + +$ pulsar-admin topics list tenant/cluster/namespace + +``` + +### `terminate` +Terminate a persistent topic (disallow further messages from being published on the topic) + +Usage + +```bash + +$ pulsar-admin topics terminate persistent://tenant/namespace/topic + +``` + +### `permissions` +Get the permissions on a topic. Retrieve the effective permissions for a destination. These permissions are defined by the permissions set at the namespace level combined (union) with any eventual specific permissions set on the topic. 
+ +Usage + +```bash + +$ pulsar-admin topics permissions topic + +``` + +### `grant-permission` +Grant a new permission to a client role on a single topic + +Usage + +```bash + +$ pulsar-admin topics grant-permission {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions to a client role on a single topic. If the permission was not set at the topic level, but rather at the namespace level, this operation will return an error (HTTP status code 412). + +Usage + +```bash + +$ pulsar-admin topics revoke-permission topic + +``` + +### `lookup` +Look up a topic from the current serving broker + +Usage + +```bash + +$ pulsar-admin topics lookup topic + +``` + +### `bundle-range` +Get the namespace bundle which contains the given topic + +Usage + +```bash + +$ pulsar-admin topics bundle-range topic + +``` + +### `delete` +Delete a topic. The topic cannot be deleted if there are any active subscriptions or producers connected to the topic. + +Usage + +```bash + +$ pulsar-admin topics delete topic + +``` + +### `unload` +Unload a topic + +Usage + +```bash + +$ pulsar-admin topics unload topic + +``` + +### `create-subscription` +Create a new subscription on a topic. + +Usage + +```bash + +$ pulsar-admin topics create-subscription [options] persistent://tenant/namespace/topic + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-m`, `--messageId`|messageId where to create the subscription. 
It can be either 'latest', 'earliest' or (ledgerId:entryId)|latest| +|`-s`, `--subscription`|Subscription to reset position on|| + +### `subscriptions` +Get the list of subscriptions on the topic + +Usage + +```bash + +$ pulsar-admin topics subscriptions topic + +``` + +### `unsubscribe` +Delete a durable subscriber from a topic + +Usage + +```bash + +$ pulsar-admin topics unsubscribe topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to delete|| +|`-f`, `--force`|Disconnect and close all consumers and delete subscription forcefully|false| + + +### `stats` +Get the stats for the topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage + +```bash + +$ pulsar-admin topics stats topic + +``` + +:::note + +The unit of `storageSize` and `averageMsgSize` is Byte. + +::: + +### `stats-internal` +Get the internal stats for the topic + +Usage + +```bash + +$ pulsar-admin topics stats-internal topic + +``` + +### `info-internal` +Get the internal metadata info for the topic + +Usage + +```bash + +$ pulsar-admin topics info-internal topic + +``` + +### `partitioned-stats` +Get the stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage + +```bash + +$ pulsar-admin topics partitioned-stats topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--per-partition`|Get per-partition stats|false| + +### `partitioned-stats-internal` +Get the internal stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. 
+ +Usage + +```bash + +$ pulsar-admin topics partitioned-stats-internal topic + +``` + +### `skip` +Skip some messages for the subscription + +Usage + +```bash + +$ pulsar-admin topics skip topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages to skip|0| +|`-s`, `--subscription`|The subscription on which to skip messages|| + + +### `clear-backlog` +Clear backlog (skip all the messages) for the subscription + +Usage + +```bash + +$ pulsar-admin topics clear-backlog topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to clear|| + + +### `expire-messages` +Expire messages that are older than the given expiry time (in seconds) for the subscription. + +Usage + +```bash + +$ pulsar-admin topics expire-messages topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| +|`-s`, `--subscription`|The subscription to skip messages on|| + + +### `expire-messages-all-subscriptions` +Expire messages older than the given expiry time (in seconds) for all subscriptions + +Usage + +```bash + +$ pulsar-admin topics expire-messages-all-subscriptions topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| + + +### `peek-messages` +Peek some messages for the subscription. + +Usage + +```bash + +$ pulsar-admin topics peek-messages topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages|0| +|`-s`, `--subscription`|Subscription to get messages from|| + + +### `reset-cursor` +Reset position for subscription to a position that is closest to timestamp or messageId. 
+ +Usage + +```bash + +$ pulsar-admin topics reset-cursor topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|Subscription to reset position on|| +|`-t`, `--time`|The time in minutes to reset back to (or minutes, hours, days, weeks, etc.). Examples: `100m`, `3h`, `2d`, `5w`.|| +|`-m`, `--messageId`| The messageId to reset back to (ledgerId:entryId). || + +### `get-message-by-id` +Get message by ledger id and entry id + +Usage + +```bash + +$ pulsar-admin topics get-message-by-id topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-l`, `--ledgerId`|The ledger id |0| +|`-e`, `--entryId`|The entry id |0| + +### `last-message-id` +Get the last commit message ID of the topic. + +Usage + +```bash + +$ pulsar-admin topics last-message-id persistent://tenant/namespace/topic + +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-backlog-quotas tenant/namespace/topic + +``` + +### `set-backlog-quota` +Set a backlog quota policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics set-backlog-quota tenant/namespace/topic options + +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a topic. + +Usage + +```bash + +$ pulsar-admin topics remove-backlog-quota tenant/namespace/topic + +``` + +### `get-persistence` +Get the persistence policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-persistence tenant/namespace/topic + +``` + +### `set-persistence` +Set the persistence policies for a topic. 
+ +Usage + +```bash + +$ pulsar-admin topics set-persistence tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-e`, `--bookkeeper-ensemble`|Number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-a`, `--bookkeeper-ack-quorum`|Number of acks (guaranteed copies) to wait for each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + +### `remove-persistence` +Remove the persistence policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics remove-persistence tenant/namespace/topic + +``` + +### `get-message-ttl` +Get the message TTL for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-message-ttl tenant/namespace/topic + +``` + +### `set-message-ttl` +Set the message TTL for a topic. + +Usage + +```bash + +$ pulsar-admin topics set-message-ttl tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL for a topic in second, allowed range from 1 to `Integer.MAX_VALUE` |0| + +### `remove-message-ttl` +Remove the message TTL for a topic. + +Usage + +```bash + +$ pulsar-admin topics remove-message-ttl tenant/namespace/topic + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified topic.|false| +|`--disable`, `-d`|Disable message deduplication on the specified topic.|false| + +### `get-deduplication` +Get a deduplication policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-deduplication tenant/namespace/topic + +``` + +### `set-deduplication` +Set a deduplication policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics set-deduplication tenant/namespace/topic options + +``` + +### `remove-deduplication` +Remove a deduplication policy for a topic. 
+ +Usage + +```bash + +$ pulsar-admin topics remove-deduplication tenant/namespace/topic + +``` + +## `tenants` +Operations for managing tenants + +Usage + +```bash + +$ pulsar-admin tenants subcommand + +``` + +Subcommands +* `list` +* `get` +* `create` +* `update` +* `delete` + +### `list` +List the existing tenants + +Usage + +```bash + +$ pulsar-admin tenants list + +``` + +### `get` +Gets the configuration of a tenant + +Usage + +```bash + +$ pulsar-admin tenants get tenant-name + +``` + +### `create` +Creates a new tenant + +Usage + +```bash + +$ pulsar-admin tenants create tenant-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + +### `update` +Updates a tenant + +Usage + +```bash + +$ pulsar-admin tenants update tenant-name options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + + +### `delete` +Deletes an existing tenant + +Usage + +```bash + +$ pulsar-admin tenants delete tenant-name + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-f`, `--force`|Delete a tenant forcefully by deleting all namespaces under it.|false| + + +## `resource-quotas` +Operations for managing resource quotas + +Usage + +```bash + +$ pulsar-admin resource-quotas subcommand + +``` + +Subcommands +* `get` +* `set` +* `reset-namespace-bundle-quota` + + +### `get` +Get the resource quota for a specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage + +```bash + +$ pulsar-admin resource-quotas get options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. 
This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + +### `set` +Set the resource quota for the specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage + +```bash + +$ pulsar-admin resource-quotas set options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-bi`, `--bandwidthIn`|The expected inbound bandwidth (in bytes/second)|0| +|`-bo`, `--bandwidthOut`|Expected outbound bandwidth (in bytes/second)|0| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-d`, `--dynamic`|Allow to be dynamically re-calculated (or not)|false| +|`-mem`, `--memory`|Expected memory usage (in megabytes)|0| +|`-mi`, `--msgRateIn`|Expected incoming messages per second|0| +|`-mo`, `--msgRateOut`|Expected outgoing messages per second|0| +|`-n`, `--namespace`|The namespace as tenant/namespace, for example my-tenant/my-ns. Must be specified together with -b/--bundle.|| + + +### `reset-namespace-bundle-quota` +Reset the specified namespace bundle's resource quota to a default value. + +Usage + +```bash + +$ pulsar-admin resource-quotas reset-namespace-bundle-quota options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + + +## `schemas` +Operations related to Schemas associated with Pulsar topics. + +Usage + +``` + +$ pulsar-admin schemas subcommand + +``` + +Subcommands +* `upload` +* `delete` +* `get` +* `extract` + + +### `upload` +Upload the schema definition for a topic + +Usage + +```bash + +$ pulsar-admin schemas upload persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--filename`|The path to the schema definition file. 
An example schema file is available under conf directory.|| + + +### `delete` +Delete the schema definition associated with a topic + +Usage + +```bash + +$ pulsar-admin schemas delete persistent://tenant/namespace/topic + +``` + +### `get` +Retrieve the schema definition associated with a topic (at a given version if version is supplied). + +Usage + +```bash + +$ pulsar-admin schemas get persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`--version`|The version of the schema definition to retrieve for a topic.|| + +### `extract` +Provide the schema definition for a topic via Java class name contained in a JAR file + +Usage + +```bash + +$ pulsar-admin schemas extract persistent://tenant/namespace/topic options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--classname`|The Java class name|| +|`-j`, `--jar`|A path to the JAR file which contains the above Java class|| +|`-t`, `--type`|The type of the schema (avro or json)|| diff --git a/site2/website/versioned_docs/version-2.8.x/reference-rest-api-overview.md b/site2/website/versioned_docs/version-2.8.x/reference-rest-api-overview.md new file mode 100644 index 0000000000000..4bdcf23483a2b --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/reference-rest-api-overview.md @@ -0,0 +1,18 @@ +--- +id: reference-rest-api-overview +title: Pulsar REST APIs +sidebar_label: "Pulsar REST APIs" +--- + +A REST API (also known as RESTful API, REpresentational State Transfer Application Programming Interface) is a set of definitions and protocols for building and integrating application software, using HTTP requests to GET, PUT, POST, and DELETE data following the REST standards. In essence, REST API is a set of remote calls using standard methods to request and return data in a specific format between two systems. 
+ +Pulsar provides a variety of REST APIs that enable you to interact with Pulsar to retrieve information or perform an action. + +| REST API category | Description | +| --- | --- | +| [Admin](https://pulsar.apache.org/admin-rest-api/?version=master) | REST APIs for administrative operations.| +| [Functions](https://pulsar.apache.org/functions-rest-api/?version=master) | REST APIs for function-specific operations.| +| [Sources](https://pulsar.apache.org/source-rest-api/?version=master) | REST APIs for source-specific operations.| +| [Sinks](https://pulsar.apache.org/sink-rest-api/?version=master) | REST APIs for sink-specific operations.| +| [Packages](https://pulsar.apache.org/packages-rest-api/?version=master) | REST APIs for package-specific operations. A package can be a group of functions, sources, and sinks.| + diff --git a/site2/website/versioned_docs/version-2.8.x/reference-terminology.md b/site2/website/versioned_docs/version-2.8.x/reference-terminology.md new file mode 100644 index 0000000000000..e5099141c3231 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/reference-terminology.md @@ -0,0 +1,176 @@ +--- +id: reference-terminology +title: Pulsar Terminology +sidebar_label: "Terminology" +original_id: reference-terminology +--- + +Here is a glossary of terms related to Apache Pulsar: + +### Concepts + +#### Pulsar + +Pulsar is a distributed messaging system originally created by Yahoo but now under the stewardship of the Apache Software Foundation. + +#### Message + +Messages are the basic unit of Pulsar. They're what [producers](#producer) publish to [topics](#topic) +and what [consumers](#consumer) then consume from topics. + +#### Topic + +A named channel used to pass messages published by [producers](#producer) to [consumers](#consumer) who +process those [messages](#message). + +#### Partitioned Topic + +A topic that is served by multiple Pulsar [brokers](#broker), which enables higher throughput. 
+ +#### Namespace + +A grouping mechanism for related [topics](#topic). + +#### Namespace Bundle + +A virtual group of [topics](#topic) that belong to the same [namespace](#namespace). A namespace bundle +is defined as a range between two 32-bit hashes, such as 0x00000000 and 0xffffffff. + +#### Tenant + +An administrative unit for allocating capacity and enforcing an authentication/authorization scheme. + +#### Subscription + +A lease on a [topic](#topic) established by a group of [consumers](#consumer). Pulsar has four subscription +modes (exclusive, shared, failover and key_shared). + +#### Pub-Sub + +A messaging pattern in which [producer](#producer) processes publish messages on [topics](#topic) that +are then consumed (processed) by [consumer](#consumer) processes. + +#### Producer + +A process that publishes [messages](#message) to a Pulsar [topic](#topic). + +#### Consumer + +A process that establishes a subscription to a Pulsar [topic](#topic) and processes messages published +to that topic by [producers](#producer). + +#### Reader + +Pulsar readers are message processors much like Pulsar [consumers](#consumer) but with two crucial differences: + +- you can specify *where* on a topic readers begin processing messages (consumers always begin with the latest + available unacked message); +- readers don't retain data or acknowledge messages. + +#### Cursor + +The subscription position for a [consumer](#consumer). + +#### Acknowledgment (ack) + +A message sent to a Pulsar broker by a [consumer](#consumer) that a message has been successfully processed. +An acknowledgement (ack) is Pulsar's way of knowing that the message can be deleted from the system; +if no acknowledgement, then the message will be retained until it's processed. + +#### Negative Acknowledgment (nack) + +When an application fails to process a particular message, it can send a "negative ack" to Pulsar +to signal that the message should be replayed at a later time. 
(By default, failed messages are +replayed after a 1 minute delay). Be aware that negative acknowledgment on ordered subscription types, +such as Exclusive, Failover and Key_Shared, can cause failed messages to arrive at consumers out of the original order. + +#### Unacknowledged + +A message that has been delivered to a consumer for processing but not yet confirmed as processed by the consumer. + +#### Retention Policy + +Size and time limits that you can set on a [namespace](#namespace) to configure retention of [messages](#message) +that have already been [acknowledged](#acknowledgment-ack). + +#### Multi-Tenancy + +The ability to isolate [namespaces](#namespace), specify quotas, and configure authentication and authorization +on a per-[tenant](#tenant) basis. + +#### Failure Domain + +A logical domain under a Pulsar cluster. Each logical domain contains a pre-configured list of brokers. + +#### Anti-affinity Namespaces + +A group of namespaces that have anti-affinity to each other. + +### Architecture + +#### Standalone + +A lightweight Pulsar broker in which all components run in a single Java Virtual Machine (JVM) process. Standalone +clusters can be run on a single machine and are useful for development purposes. + +#### Cluster + +A set of Pulsar [brokers](#broker) and [BookKeeper](#bookkeeper) servers (aka [bookies](#bookie)). +Clusters can reside in different geographical regions and replicate messages to one another +in a process called [geo-replication](#geo-replication). + +#### Instance + +A group of Pulsar [clusters](#cluster) that act together as a single unit. + +#### Geo-Replication + +Replication of messages across Pulsar [clusters](#cluster), potentially in different datacenters +or geographical regions. + +#### Configuration Store + +Pulsar's configuration store (previously known as global ZooKeeper) is a ZooKeeper quorum that +is used for configuration-specific tasks. 
A multi-cluster Pulsar installation requires just one +configuration store across all [clusters](#cluster). + +#### Topic Lookup + +A service provided by Pulsar [brokers](#broker) that enables connecting clients to automatically determine +which Pulsar [cluster](#cluster) is responsible for a [topic](#topic) (and thus where message traffic for +the topic needs to be routed). + +#### Service Discovery + +A mechanism provided by Pulsar that enables connecting clients to use just a single URL to interact +with all the [brokers](#broker) in a [cluster](#cluster). + +#### Broker + +A stateless component of Pulsar [clusters](#cluster) that runs two other components: an HTTP server +exposing a REST interface for administration and topic lookup and a [dispatcher](#dispatcher) that +handles all message transfers. Pulsar clusters typically consist of multiple brokers. + +#### Dispatcher + +An asynchronous TCP server used for all data transfers in-and-out a Pulsar [broker](#broker). The Pulsar +dispatcher uses a custom binary protocol for all communications. + +### Storage + +#### BookKeeper + +[Apache BookKeeper](http://bookkeeper.apache.org/) is a scalable, low-latency persistent log storage +service that Pulsar uses to store data. + +#### Bookie + +Bookie is the name of an individual BookKeeper server. It is effectively the storage server of Pulsar. + +#### Ledger + +An append-only data structure in [BookKeeper](#bookkeeper) that is used to persistently store messages in Pulsar [topics](#topic). + +### Functions + +Pulsar Functions are lightweight functions that can consume messages from Pulsar topics, apply custom processing logic, and, if desired, publish results to topics. 
diff --git a/site2/website/versioned_docs/version-2.8.x/schema-evolution-compatibility.md b/site2/website/versioned_docs/version-2.8.x/schema-evolution-compatibility.md new file mode 100644 index 0000000000000..3e78429df69da --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/schema-evolution-compatibility.md @@ -0,0 +1,201 @@ +--- +id: schema-evolution-compatibility +title: Schema evolution and compatibility +sidebar_label: "Schema evolution and compatibility" +original_id: schema-evolution-compatibility +--- + +Normally, schemas do not stay the same over a long period of time. Instead, they undergo evolutions to satisfy new needs. + +This chapter examines how Pulsar schema evolves and what Pulsar schema compatibility check strategies are. + +## Schema evolution + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +Each `SchemaInfo` stored with a topic has a version. The version is used to manage the schema changes happening within a topic. + +The message produced with `SchemaInfo` is tagged with a schema version. When a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and use the correct schema information to deserialize data. + +### What is schema evolution? + +Schemas store the details of attributes and types. To satisfy new business requirements, you need to update schemas inevitably over time, which is called **schema evolution**. + +Any schema changes affect downstream consumers. Schema evolution ensures that the downstream consumers can seamlessly handle data encoded with both old schemas and new schemas. + +### How Pulsar schema should evolve? + +The answer is Pulsar schema compatibility check strategy. It determines how schema compares old schemas with new schemas in topics. + +For more information, see [Schema compatibility check strategy](#schema-compatibility-check-strategy). + +### How does Pulsar support schema evolution? + +1. 
When a producer/consumer/reader connects to a broker, the broker deploys the schema compatibility checker configured by `schemaRegistryCompatibilityCheckers` to enforce schema compatibility check. + + The schema compatibility checker is one instance per schema type. + + Currently, Avro and JSON have their own compatibility checkers, while all the other schema types share the default compatibility checker which disables schema evolution. + +2. The producer/consumer/reader sends its client `SchemaInfo` to the broker. + +3. The broker knows the schema type and locates the schema compatibility checker for that type. + +4. The broker uses the checker to check if the `SchemaInfo` is compatible with the latest schema of the topic by applying its compatibility check strategy. + + Currently, the compatibility check strategy is configured at the namespace level and applied to all the topics within that namespace. + +## Schema compatibility check strategy + +Pulsar has 8 schema compatibility check strategies, which are summarized in the following table. + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Changes allowed | Check against which schema | Upgrade first | +| --- | --- | --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Disable schema compatibility check. | All changes are allowed | All previous versions | Any order | +| `ALWAYS_INCOMPATIBLE` | Disable schema evolution. | All changes are disabled | None | None | +| `BACKWARD` | Consumers using the schema V3 can process data written by producers using the schema V3 or V2. |
  • Add optional fields
  • Delete fields
  • | Latest version | Consumers | +| `BACKWARD_TRANSITIVE` | Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. |
  • Add optional fields
  • Delete fields
  • | All previous versions | Consumers | +| `FORWARD` | Consumers using the schema V3 or V2 can process data written by producers using the schema V3. |
  • Add fields
  • Delete optional fields
  • | Latest version | Producers | +| `FORWARD_TRANSITIVE` | Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. |
  • Add fields
  • Delete optional fields
  • | All previous versions | Producers | +| `FULL` | Backward and forward compatible between the schema V3 and V2. |
  • Modify optional fields
  • | Latest version | Any order | +| `FULL_TRANSITIVE` | Backward and forward compatible among the schema V3, V2, and V1. |
  • Modify optional fields
  • | All previous versions | Any order | + +### ALWAYS_COMPATIBLE and ALWAYS_INCOMPATIBLE + +| Compatibility check strategy | Definition | Note | +| --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Disable schema compatibility check. | None | +| `ALWAYS_INCOMPATIBLE` | Disable schema evolution, that is, any schema change is rejected. |
  • For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`.
  • For Avro and JSON, the default schema compatibility check strategy is `FULL`.
  • | + +#### Example + +* Example 1 + + In some situations, an application needs to store events of several different types in the same Pulsar topic. + + In particular, when developing a data model in an `Event Sourcing` style, you might have several kinds of events that affect the state of an entity. + + For example, for a user entity, there are `userCreated`, `userAddressChanged` and `userEnquiryReceived` events. The application requires that those events are always read in the same order. + + Consequently, those events need to go in the same Pulsar partition to maintain order. This application can use `ALWAYS_COMPATIBLE` to allow different kinds of events co-exist in the same topic. + +* Example 2 + + Sometimes we also make incompatible changes. + + For example, you are modifying a field type from `string` to `int`. + + In this case, you need to: + + * Upgrade all producers and consumers to the new schema versions at the same time. + + * Optionally, create a new topic and start migrating applications to use the new topic and the new schema, avoiding the need to handle two incompatible versions in the same topic. + +### BACKWARD and BACKWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`BACKWARD` | Consumers using the new schema can process data written by producers using the **last schema**. | The consumers using the schema V3 can process data written by producers using the schema V3 or V2. | +`BACKWARD_TRANSITIVE` | Consumers using the new schema can process data written by producers using **all previous schemas**. | The consumers using the schema V3 can process data written by producers using the schema V3, V2, or V1. | + +#### Example + +* Example 1 + + Remove a field. 
+ + A consumer constructed to process events without one field can process events written with the old schema containing the field, and the consumer will ignore that field. + +* Example 2 + + You want to load all Pulsar data into a Hive data warehouse and run SQL queries against the data. + + The same SQL queries must continue to work even if the data is changed. To support it, you can evolve the schemas using the `BACKWARD` strategy. + +### FORWARD and FORWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`FORWARD` | Consumers using the **last schema** can process data written by producers using a new schema, even though they may not be able to use the full capabilities of the new schema. | The consumers using the schema V3 or V2 can process data written by producers using the schema V3. | +`FORWARD_TRANSITIVE` | Consumers using **all previous schemas** can process data written by producers using a new schema. | The consumers using the schema V3, V2, or V1 can process data written by producers using the schema V3. | + +#### Example + +* Example 1 + + Add a field. + + In most data formats, consumers written to process events without new fields can continue doing so even when they receive new events containing new fields. + +* Example 2 + + If a consumer has an application logic tied to a full version of a schema, the application logic may not be updated instantly when the schema evolves. + + In this case, you need to project data with a new schema onto an old schema that the application understands. + + Consequently, you can evolve the schemas using the `FORWARD` strategy to ensure that the old schema can process data encoded with the new schema. 
+ +### FULL and FULL_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | Note | +| --- | --- | --- | --- | +| `FULL` | Schemas are both backward and forward compatible, which means: Consumers using the last schema can process data written by producers using the new schema. AND Consumers using the new schema can process data written by producers using the last schema. | Consumers using the schema V3 can process data written by producers using the schema V3 or V2. AND Consumers using the schema V3 or V2 can process data written by producers using the schema V3. |
  • For Avro and JSON, the default schema compatibility check strategy is `FULL`.
  • For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`.
  • | +| `FULL_TRANSITIVE` | The new schema is backward and forward compatible with all previously registered schemas. | Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. AND Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. | None | + +#### Example + +In some data formats, for example, Avro, you can define fields with default values. Consequently, adding or removing a field with a default value is a fully compatible change. + +## Schema verification + +When a producer or a consumer tries to connect to a topic, a broker performs some checks to verify a schema. + +### Producer + +When a producer tries to connect to a topic (suppose ignore the schema auto creation), a broker does the following checks: + +* Check if the schema carried by the producer exists in the schema registry or not. + + * If the schema is already registered, then the producer is connected to a broker and produce messages with that schema. + + * If the schema is not registered, then Pulsar verifies if the schema is allowed to be registered based on the configured compatibility check strategy. + +### Consumer +When a consumer tries to connect to a topic, a broker checks if a carried schema is compatible with a registered schema based on the configured schema compatibility check strategy. + +| Compatibility check strategy | Check logic | +| --- | --- | +| `ALWAYS_COMPATIBLE` | All pass | +| `ALWAYS_INCOMPATIBLE` | No pass | +| `BACKWARD` | Can read the last schema | +| `BACKWARD_TRANSITIVE` | Can read all schemas | +| `FORWARD` | Can read the last schema | +| `FORWARD_TRANSITIVE` | Can read the last schema | +| `FULL` | Can read the last schema | +| `FULL_TRANSITIVE` | Can read all schemas | + +## Order of upgrading clients + +The order of upgrading client applications is determined by the compatibility check strategy. 
+ +For example, consider producers that use schemas to write data to Pulsar and consumers that use schemas to read data from Pulsar. + +| Compatibility check strategy | Upgrade first | Description | +| --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Any order | The compatibility check is disabled. Consequently, you can upgrade the producers and consumers in **any order**. | +| `ALWAYS_INCOMPATIBLE` | None | The schema evolution is disabled. | +| 
  • `BACKWARD`
  • `BACKWARD_TRANSITIVE`
  • | Consumers | There is no guarantee that consumers using the old schema can read data produced using the new schema. Consequently, **upgrade all consumers first**, and then start producing new data. | +|
  • `FORWARD`
  • `FORWARD_TRANSITIVE`
  • | Producers | There is no guarantee that consumers using the new schema can read data produced using the old schema. Consequently, **upgrade all producers first**
  • to use the new schema and ensure that the data already produced using the old schemas are not available to consumers, and then upgrade the consumers.
  • | +|
  • `FULL`
  • `FULL_TRANSITIVE`
  • | Any order | It is guaranteed that consumers using the old schema can read data produced using the new schema and that consumers using the new schema can read data produced using the old schema. Consequently, you can upgrade the producers and consumers in **any order**. | + + + + diff --git a/site2/website/versioned_docs/version-2.8.x/schema-get-started.md b/site2/website/versioned_docs/version-2.8.x/schema-get-started.md new file mode 100644 index 0000000000000..afacb0fa51f2e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/schema-get-started.md @@ -0,0 +1,102 @@ +--- +id: schema-get-started +title: Get started +sidebar_label: "Get started" +original_id: schema-get-started +--- + +This chapter introduces Pulsar schemas and explains why they are important. + +## Schema Registry + +Type safety is extremely important in any application built around a message bus like Pulsar. + +Producers and consumers need some kind of mechanism for coordinating types at the topic level to avoid various potential problems, for example, serialization and deserialization issues. + +Applications typically adopt one of the following approaches to guarantee type safety in messaging. Both approaches are available in Pulsar, and you're free to adopt one or the other or to mix and match on a per-topic basis. + +#### Note +> +> Currently, the Pulsar schema registry is only available for the [Java client](client-libraries-java.md), [CGo client](client-libraries-cgo.md), [Python client](client-libraries-python.md), and [C++ client](client-libraries-cpp.md). + +### Client-side approach + +Producers and consumers are responsible for not only serializing and deserializing messages (which consist of raw bytes) but also "knowing" which types are being transmitted via which topics. + +If a producer is sending temperature sensor data on the topic `topic-1`, consumers of that topic will run into trouble if they attempt to parse that data as moisture sensor readings. 
+ +Producers and consumers can send and receive messages consisting of raw byte arrays and leave all type safety enforcement to the application on an "out-of-band" basis. + +### Server-side approach + +Producers and consumers inform the system which data types can be transmitted via the topic. + +With this approach, the messaging system enforces type safety and ensures that producers and consumers remain synced. + +Pulsar has a built-in **schema registry** that enables clients to upload data schemas on a per-topic basis. Those schemas dictate which data types are recognized as valid for that topic. + +## Why use schema + +When a schema is enabled, Pulsar does parse data, it takes bytes as inputs and sends bytes as outputs. While data has meaning beyond bytes, you need to parse data and might encounter parse exceptions which mainly occur in the following situations: + +* The field does not exist + +* The field type has changed (for example, `string` is changed to `int`) + +There are a few methods to prevent and overcome these exceptions, for example, you can catch exceptions when parsing errors, which makes code hard to maintain; or you can adopt a schema management system to perform schema evolution, not to break downstream applications, and enforces type safety to max extend in the language you are using, the solution is Pulsar Schema. + +Pulsar schema enables you to use language-specific types of data when constructing and handling messages from simple types like `string` to more complex application-specific types. + +**Example** + +You can use the _User_ class to define the messages sent to Pulsar topics. + +``` + +public class User { + String name; + int age; +} + +``` + +When constructing a producer with the _User_ class, you can specify a schema or not as below. + +### Without schema + +If you construct a producer without specifying a schema, then the producer can only produce messages of type `byte[]`. 
If you have a POJO class, you need to serialize the POJO into bytes before sending messages. + +**Example** + +``` + +Producer producer = client.newProducer() + .topic(topic) + .create(); +User user = new User("Tom", 28); +byte[] message = … // serialize the `user` by yourself; +producer.send(message); + +``` + +### With schema + +If you construct a producer with specifying a schema, then you can send a class to a topic directly without worrying about how to serialize POJOs into bytes. + +**Example** + +This example constructs a producer with the _JSONSchema_, and you can send the _User_ class to topics directly without worrying about how to serialize it into bytes. + +``` + +Producer producer = client.newProducer(JSONSchema.of(User.class)) + .topic(topic) + .create(); +User user = new User("Tom", 28); +producer.send(user); + +``` + +### Summary + +When constructing a producer with a schema, you do not need to serialize messages into bytes, instead Pulsar schema does this job in the background. diff --git a/site2/website/versioned_docs/version-2.8.x/schema-manage.md b/site2/website/versioned_docs/version-2.8.x/schema-manage.md new file mode 100644 index 0000000000000..c588aae619eee --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/schema-manage.md @@ -0,0 +1,639 @@ +--- +id: schema-manage +title: Manage schema +sidebar_label: "Manage schema" +original_id: schema-manage +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide demonstrates the ways to manage schemas: + +* Automatically + + * [Schema AutoUpdate](#schema-autoupdate) + +* Manually + + * [Schema manual management](#schema-manual-management) + + * [Custom schema storage](#custom-schema-storage) + +## Schema AutoUpdate + +If a schema passes the schema compatibility check, Pulsar producer automatically updates this schema to the topic it produces by default. 
+ +### AutoUpdate for producer + +For a producer, the `AutoUpdate` happens in the following cases: + +* If a **topic doesn’t have a schema**, Pulsar registers a schema automatically. + +* If a **topic has a schema**: + + * If a **producer doesn’t carry a schema**: + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **disabled** in the namespace to which the topic belongs, the producer is allowed to connect to the topic and produce data. + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **enabled** in the namespace to which the topic belongs, the producer is rejected and disconnected. + + * If a **producer carries a schema**: + + A broker performs the compatibility check based on the configured compatibility check strategy of the namespace to which the topic belongs. + + * If the schema is registered, a producer is connected to a broker. + + * If the schema is not registered: + + * If `isAllowAutoUpdateSchema` sets to **false**, the producer is rejected to connect to a broker. + + * If `isAllowAutoUpdateSchema` sets to **true**: + + * If the schema passes the compatibility check, then the broker registers a new schema automatically for the topic and the producer is connected. + + * If the schema does not pass the compatibility check, then the broker does not register a schema and the producer is rejected to connect to a broker. + +![AutoUpdate Producer](/assets/schema-producer.png) + +### AutoUpdate for consumer + +For a consumer, the `AutoUpdate` happens in the following cases: + +* If a **consumer connects to a topic without a schema** (which means the consumer receiving raw bytes), the consumer can connect to the topic successfully without doing any compatibility check. + +* If a **consumer connects to a topic with a schema**. 
+ + * If a topic does not have all of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` sets to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` sets to **false**, then the consumer is rejected to connect to a broker. + + * If a topic has one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is rejected to connect to the broker. + +![AutoUpdate Consumer](/assets/schema-consumer.png) + + +### Manage AutoUpdate strategy + +You can use the `pulsar-admin` command to manage the `AutoUpdate` strategy as below: + +* [Enable AutoUpdate](#enable-autoupdate) + +* [Disable AutoUpdate](#disable-autoupdate) + +* [Adjust compatibility](#adjust-compatibility) + +#### Enable AutoUpdate + +To enable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --enable tenant/namespace + +``` + +#### Disable AutoUpdate + +To disable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --disable tenant/namespace + +``` + +Once the `AutoUpdate` is disabled, you can only register a new schema using the `pulsar-admin` command. + +#### Adjust compatibility + +To adjust the schema compatibility level on a namespace, you can use the `pulsar-admin` command. 
+ +```bash + +bin/pulsar-admin namespaces set-schema-compatibility-strategy --compatibility tenant/namespace + +``` + +### Schema validation + +By default, `schemaValidationEnforced` is **disabled** for producers: + +* This means a producer without a schema can produce any kind of messages to a topic with schemas, which may result in producing trash data to the topic. + +* This allows non-java language clients that don’t support schema can produce messages to a topic with schemas. + +However, if you want a stronger guarantee on the topics with schemas, you can enable `schemaValidationEnforced` across the whole cluster or on a per-namespace basis. + +#### Enable schema validation + +To enable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-schema-validation-enforce --enable tenant/namespace + +``` + +#### Disable schema validation + +To disable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-schema-validation-enforce --disable tenant/namespace + +``` + +## Schema manual management + +To manage schemas, you can use one of the following methods. + +| Method | Description | +| --- | --- | +| **Admin CLI**
  • | You can use the `pulsar-admin` tool to manage Pulsar schemas, brokers, clusters, sources, sinks, topics, tenants and so on. For more information about how to use the `pulsar-admin` tool, see [here](reference-pulsar-admin.md). | +| **REST API**
  • | Pulsar exposes schema related management API in Pulsar’s admin RESTful API. You can access the admin RESTful endpoint directly to manage schemas. For more information about how to use the Pulsar REST API, see [here](http://pulsar.apache.org/admin-rest-api/). | +| **Java Admin API**
  • | Pulsar provides Java admin library. | + +### Upload a schema + +To upload (register) a new schema for a topic, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `upload` subcommand. + +```bash + +$ pulsar-admin schemas upload --filename + +``` + +The `schema-definition-file` is in JSON format. + +```json + +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} + +``` + +The `schema-definition-file` includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +Here are examples of the `schema-definition-file` for a JSON schema. + +**Example 1** + +```json + +{ + "type": "JSON", + "schema": "{\"type\":\"record\",\"name\":\"User\",\"namespace\":\"com.foo\",\"fields\":[{\"name\":\"file1\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"file2\",\"type\":\"string\",\"default\":null},{\"name\":\"file3\",\"type\":[\"null\",\"string\"],\"default\":\"dfdf\"}]}", + "properties": {} +} + +``` + +**Example 2** + +```json + +{ + "type": "STRING", + "schema": "", + "properties": { + "key1": "value1" + } +} + +``` + +
    + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/uploadSchema?version=@pulsar:version_number@} + +The post payload is in JSON format. + +```json + +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} + +``` + +The post payload includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
    + + +```java + +void createSchema(String topic, PostSchemaPayload schemaPayload) + +``` + +The `PostSchemaPayload` includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `PostSchemaPayload`: + +```java + +PulsarAdmin admin = …; + +PostSchemaPayload payload = new PostSchemaPayload(); +payload.setType("INT8"); +payload.setSchema(""); + +admin.createSchema("my-tenant/my-ns/my-topic", payload); + +``` + +
    + +
    +```` + +### Get a schema (latest) + +To get the latest schema for a topic, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `get` subcommand. + +```bash + +$ pulsar-admin schemas get + +{ + "version": 0, + "type": "String", + "timestamp": 0, + "data": "string", + "properties": { + "property1": "string", + "property2": "string" + } +} + +``` + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/getSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} + +``` + +The response includes the following fields: + +| Field | Description | +| --- | --- | +| `version` | The schema version, which is a long number. | +| `type` | The schema type. | +| `timestamp` | The timestamp of creating this version of schema. | +| `data` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
  •   + + +```java + +SchemaInfo getSchema(String topic) + +``` + +The `SchemaInfo` includes the following fields: + +| Field | Description | +| --- | --- | +| `name` | The schema name. | +| `type` | The schema type. | +| `schema` | A byte array of the schema definition data, which is encoded in UTF 8 charset. 
  • If the schema is a
  • **primitive**
  • schema, this byte array should be empty.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition converted to a byte array.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `SchemaInfo`: + +```java + +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic"); + +``` + +
    + +
    +```` + +### Get a schema (specific) + +To get a specific version of a schema, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `get` subcommand. + +```bash + +$ pulsar-admin schemas get --version= + +``` + + + + +Send a `GET` request to a schema endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema/:version|operation/getSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} + +``` + +The response includes the following fields: + +| Field | Description | +| --- | --- | +| `version` | The schema version, which is a long number. | +| `type` | The schema type. | +| `timestamp` | The timestamp of creating this version of schema. | +| `data` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
  •   + + +```java + +SchemaInfo getSchema(String topic, long version) + +``` + +The `SchemaInfo` includes the following fields: + +| Field | Description | +| --- | --- | +| `name` | The schema name. | +| `type` | The schema type. | +| `schema` | A byte array of the schema definition data, which is encoded in UTF 8. 
  • If the schema is a
  • **primitive**
  • schema, this byte array should be empty.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition converted to a byte array.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `SchemaInfo`: + +```java + +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic", 1L); + +``` + +
    + +
    +```` + +### Extract a schema + +To provide a schema via a topic, you can use the following method. + +````mdx-code-block + + + + +Use the `extract` subcommand. + +```bash + +$ pulsar-admin schemas extract --classname --jar --type + +``` + + + + +```` + +### Delete a schema + +To delete a schema for a topic, you can use one of the following methods. + +:::note + +In any case, the **delete** action deletes **all versions** of a schema registered for a topic. + +::: + +````mdx-code-block + + + + +Use the `delete` subcommand. + +```bash + +$ pulsar-admin schemas delete + +``` + + + + +Send a `DELETE` request to a schema endpoint: {@inject: endpoint|DELETE|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/deleteSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", +} + +``` + +The response includes the following field: + +Field | Description | +---|---| +`version` | The schema version, which is a long number. | + + + + +```java + +void deleteSchema(String topic) + +``` + +Here is an example of deleting a schema. + +```java + +PulsarAdmin admin = …; + +admin.deleteSchema("my-tenant/my-ns/my-topic"); + +``` + + + + +```` + +## Custom schema storage + +By default, Pulsar stores various data types of schemas in [Apache BookKeeper](https://bookkeeper.apache.org) deployed alongside Pulsar. + +However, you can use another storage system if needed. 
+ +### Implement + +To use a non-default (non-BookKeeper) storage system for Pulsar schemas, you need to implement the following Java interfaces: + +* [SchemaStorage interface](#schemastorage-interface) + +* [SchemaStorageFactory interface](#schemastoragefactory-interface) + +#### SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java + +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} + +``` + +:::tip + +For a complete example of **schema storage** implementation, see [BookKeeperSchemaStorage](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. + +::: + +#### SchemaStorageFactory interface + +The `SchemaStorageFactory` interface has the following method: + +```java + +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} + +``` + +:::tip + +For a complete example of **schema storage factory** implementation, see [BookKeeperSchemaStorageFactory](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +::: + +### Deploy + +To use your custom schema storage implementation, perform the following steps. + +1. 
Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. + +2. Add the JAR file to the `lib` folder in your Pulsar binary or source distribution. + +3. Change the `schemaRegistryStorageClassName` configuration in `broker.conf` to your custom factory class. + +4. Start Pulsar. diff --git a/site2/website/versioned_docs/version-2.8.x/schema-understand.md b/site2/website/versioned_docs/version-2.8.x/schema-understand.md new file mode 100644 index 0000000000000..a86b02add435e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/schema-understand.md @@ -0,0 +1,556 @@ +--- +id: schema-understand +title: Understand schema +sidebar_label: "Understand schema" +original_id: schema-understand +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This chapter explains the basic concepts of Pulsar schema, focuses on the topics of particular importance, and provides additional background. + +## SchemaInfo + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +The `SchemaInfo` is stored and enforced on a per-topic basis and cannot be stored at the namespace or tenant level. + +A `SchemaInfo` consists of the following fields: + +| Field | Description | +| --- | --- | +| `name` | Schema name (a string). | +| `type` | Schema type, which determines how to interpret the schema data.
  • Predefined schema: see [here](schema-understand.md#schema-type).
  • Customized schema: it is left as an empty string.
  • | +| `schema`(`payload`) | Schema data, which is a sequence of 8-bit unsigned bytes and schema-type specific. | +| `properties` | It is a user defined properties as a string/string map. Applications can use this bag for carrying any application specific logics. Possible properties might be the Git hash associated with the schema, an environment string like `dev` or `prod`. | + +**Example** + +This is the `SchemaInfo` of a string. + +```json + +{ + "name": "test-string-schema", + "type": "STRING", + "schema": "", + "properties": {} +} + +``` + +## Schema type + +Pulsar supports various schema types, which are mainly divided into two categories: + +* Primitive type + +* Complex type + +### Primitive type + +Currently, Pulsar supports the following primitive types: + +| Primitive Type | Description | +|---|---| +| `BOOLEAN` | A binary value | +| `INT8` | A 8-bit signed integer | +| `INT16` | A 16-bit signed integer | +| `INT32` | A 32-bit signed integer | +| `INT64` | A 64-bit signed integer | +| `FLOAT` | A single precision (32-bit) IEEE 754 floating-point number | +| `DOUBLE` | A double-precision (64-bit) IEEE 754 floating-point number | +| `BYTES` | A sequence of 8-bit unsigned bytes | +| `STRING` | A Unicode character sequence | +| `TIMESTAMP` (`DATE`, `TIME`) | A logic type represents a specific instant in time with millisecond precision.
    It stores the number of milliseconds since `January 1, 1970, 00:00:00 GMT` as an `INT64` value | +| INSTANT | A single instantaneous point on the time-line with nanoseconds precision| +| LOCAL_DATE | An immutable date-time object that represents a date, often viewed as year-month-day| +| LOCAL_TIME | An immutable date-time object that represents a time, often viewed as hour-minute-second. Time is represented to nanosecond precision.| +| LOCAL_DATE_TIME | An immutable date-time object that represents a date-time, often viewed as year-month-day-hour-minute-second | + +For primitive types, Pulsar does not store any schema data in `SchemaInfo`. The `type` in `SchemaInfo` is used to determine how to serialize and deserialize the data. + +Some of the primitive schema implementations can use `properties` to store implementation-specific tunable settings. For example, a `string` schema can use `properties` to store the encoding charset to serialize and deserialize strings. + +The conversions between **Pulsar schema types** and **language-specific primitive types** are as below. + +| Schema Type | Java Type| Python Type | Go Type | +|---|---|---|---| +| BOOLEAN | boolean | bool | bool | +| INT8 | byte | | int8 | +| INT16 | short | | int16 | +| INT32 | int | | int32 | +| INT64 | long | | int64 | +| FLOAT | float | float | float32 | +| DOUBLE | double | float | float64| +| BYTES | byte[], ByteBuffer, ByteBuf | bytes | []byte | +| STRING | String | str | string| +| TIMESTAMP | java.sql.Timestamp | | | +| TIME | java.sql.Time | | | +| DATE | java.util.Date | | | +| INSTANT | java.time.Instant | | | +| LOCAL_DATE | java.time.LocalDate | | | +| LOCAL_TIME | java.time.LocalTime | | | +| LOCAL_DATE_TIME | java.time.LocalDateTime | | | + +**Example** + +This example demonstrates how to use a string schema. + +1. Create a producer with a string schema and send messages.
+ + ```java + + Producer producer = client.newProducer(Schema.STRING).create(); + producer.newMessage().value("Hello Pulsar!").send(); + + ``` + +2. Create a consumer with a string schema and receive messages. + + ```java + + Consumer consumer = client.newConsumer(Schema.STRING).subscribe(); + consumer.receive(); + + ``` + +### Complex type + +Currently, Pulsar supports the following complex types: + +| Complex Type | Description | +|---|---| +| `keyvalue` | Represents a complex type of a key/value pair. | +| `struct` | Handles structured data. It supports `AvroBaseStructSchema` and `ProtobufNativeSchema`. | + +#### keyvalue + +`Keyvalue` schema helps applications define schemas for both key and value. + +For `SchemaInfo` of `keyvalue` schema, Pulsar stores the `SchemaInfo` of key schema and the `SchemaInfo` of value schema together. + +Pulsar provides the following methods to encode a key/value pair in messages: + +* `INLINE` + +* `SEPARATED` + +You can choose the encoding type when constructing the key/value schema. + +````mdx-code-block + + + + +Key/value pairs are encoded together in the message payload. + + + + +Key is encoded in the message key and the value is encoded in the message payload. + +**Example** + +This example shows how to construct a key/value schema and then use it to produce and consume messages. + +1. Construct a key/value schema with `INLINE` encoding type. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.INLINE + ); + + ``` + +2. Optionally, construct a key/value schema with `SEPARATED` encoding type. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + ``` + +3. Produce messages using a key/value schema. 
+ + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Producer> producer = client.newProducer(kvSchema) + .topic(TOPIC) + .create(); + + final int key = 100; + final String value = "value-100"; + + // send the key/value message + producer.newMessage() + .value(new KeyValue(key, value)) + .send(); + + ``` + +4. Consume messages using a key/value schema. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Consumer> consumer = client.newConsumer(kvSchema) + ... + .topic(TOPIC) + .subscriptionName(SubscriptionName).subscribe(); + + // receive key/value pair + Message> msg = consumer.receive(); + KeyValue kv = msg.getValue(); + + ``` + + + + +```` + +#### struct + +This section describes the details of type and usage of the `struct` schema. + +##### Type + +`struct` schema supports `AvroBaseStructSchema` and `ProtobufNativeSchema`. + +|Type|Description| +---|---| +`AvroBaseStructSchema`|Pulsar uses [Avro Specification](http://avro.apache.org/docs/current/spec.html) to declare the schema definition for `AvroBaseStructSchema`, which supports `AvroSchema`, `JsonSchema`, and `ProtobufSchema`.

    This allows Pulsar:
    - to use the same tools to manage schema definitions
    - to use different serialization or deserialization methods to handle data| +`ProtobufNativeSchema`|`ProtobufNativeSchema` is based on protobuf native Descriptor.

    This allows Pulsar:
    - to use native protobuf-v3 to serialize or deserialize data
    - to use `AutoConsume` to deserialize data. + +##### Usage + +Pulsar provides the following methods to use the `struct` schema: + +* `static` + +* `generic` + +* `SchemaDefinition` + +````mdx-code-block + + + + +You can predefine the `struct` schema, which can be a POJO in Java, a `struct` in Go, or classes generated by Avro or Protobuf tools. + +**Example** + +Pulsar gets the schema definition from the predefined `struct` using an Avro library. The schema definition is the schema data stored as a part of the `SchemaInfo`. + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```java + + @Builder + @AllArgsConstructor + @NoArgsConstructor + public static class User { + String name; + int age; + } + + ``` + +2. Create a producer with a `struct` schema and send messages. + + ```java + + Producer producer = client.newProducer(Schema.AVRO(User.class)).create(); + producer.newMessage().value(User.builder().name("pulsar-user").age(1).build()).send(); + + ``` + +3. Create a consumer with a `struct` schema and receive messages + + ```java + + Consumer consumer = client.newConsumer(Schema.AVRO(User.class)).subscribe(); + User user = consumer.receive(); + + ``` + + + + +Sometimes applications do not have pre-defined structs, and you can use this method to define schema and access data. + +You can define the `struct` schema using the `GenericSchemaBuilder`, generate a generic struct using `GenericRecordBuilder` and consume messages into `GenericRecord`. + +**Example** + +1. Use `RecordSchemaBuilder` to build a schema. + + ```java + + RecordSchemaBuilder recordSchemaBuilder = SchemaBuilder.record("schemaName"); + recordSchemaBuilder.field("intField").type(SchemaType.INT32); + SchemaInfo schemaInfo = recordSchemaBuilder.build(SchemaType.AVRO); + + Producer producer = client.newProducer(Schema.generic(schemaInfo)).create(); + + ``` + +2. Use `RecordBuilder` to build the struct records. 
+ + ```java + + producer.newMessage().value(schema.newRecordBuilder() + .set("intField", 32) + .build()).send(); + + ``` + + + + +You can define the `schemaDefinition` to generate a `struct` schema. + +**Example** + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```java + + @Builder + @AllArgsConstructor + @NoArgsConstructor + public static class User { + String name; + int age; + } + + ``` + +2. Create a producer with a `SchemaDefinition` and send messages. + + ```java + + SchemaDefinition schemaDefinition = SchemaDefinition.builder().withPojo(User.class).build(); + Producer producer = client.newProducer(Schema.AVRO(schemaDefinition)).create(); + producer.newMessage().value(User.builder().name("pulsar-user").age(1).build()).send(); + + ``` + +3. Create a consumer with a `SchemaDefinition` schema and receive messages + + ```java + + SchemaDefinition schemaDefinition = SchemaDefinition.builder().withPojo(User.class).build(); + Consumer consumer = client.newConsumer(Schema.AVRO(schemaDefinition)).subscribe(); + User user = consumer.receive().getValue(); + + ``` + + + + +```` + +### Auto Schema + +If you don't know the schema type of a Pulsar topic in advance, you can use AUTO schema to produce or consume generic records to or from brokers. + +| Auto Schema Type | Description | +|---|---| +| `AUTO_PRODUCE` | This is useful for transferring data **from a producer to a Pulsar topic that has a schema**. | +| `AUTO_CONSUME` | This is useful for transferring data **from a Pulsar topic that has a schema to a consumer**. | + +#### AUTO_PRODUCE + +`AUTO_PRODUCE` schema helps a producer validate whether the bytes sent by the producer is compatible with the schema of a topic. + +**Example** + +Suppose that: + +* You have a producer processing messages from a Kafka topic _K_. + +* You have a Pulsar topic _P_, and you do not know its schema type. + +* Your application reads the messages from _K_ and writes the messages to _P_. 
+ +In this case, you can use `AUTO_PRODUCE` to verify whether the bytes produced by _K_ can be sent to _P_ or not. + +```java + +Produce pulsarProducer = client.newProducer(Schema.AUTO_PRODUCE()) + … + .create(); + +byte[] kafkaMessageBytes = … ; + +pulsarProducer.produce(kafkaMessageBytes); + +``` + +#### AUTO_CONSUME + +`AUTO_CONSUME` schema helps a Pulsar topic validate whether the bytes sent by a Pulsar topic are compatible with a consumer, that is, the Pulsar topic deserializes messages into language-specific objects using the `SchemaInfo` retrieved from the broker side. + +Currently, `AUTO_CONSUME` supports AVRO, JSON and ProtobufNativeSchema schemas. It deserializes messages into `GenericRecord`. + +**Example** + +Suppose that: + +* You have a Pulsar topic _P_. + +* You have a consumer (for example, MySQL) receiving messages from the topic _P_. + +* Your application reads the messages from _P_ and writes the messages to MySQL. + +In this case, you can use `AUTO_CONSUME` to verify whether the bytes produced by _P_ can be sent to MySQL or not. + +```java + +Consumer pulsarConsumer = client.newConsumer(Schema.AUTO_CONSUME()) + … + .subscribe(); + +Message msg = consumer.receive() ; +GenericRecord record = msg.getValue(); + +``` + +## Schema version + +Each `SchemaInfo` stored with a topic has a version. Schema version manages schema changes happening within a topic. + +Messages produced with a given `SchemaInfo` are tagged with a schema version, so when a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and then use the `SchemaInfo` to deserialize data. + +Schemas are versioned in succession. Schema storage happens in a broker that handles the associated topics so that version assignments can be made. + +Once a version is assigned/fetched to/for a schema, all subsequent messages produced by that producer are tagged with the appropriate version.
+ +**Example** + +The following example illustrates how the schema version works. + +Suppose that a Pulsar [Java client](client-libraries-java.md) created using the code below attempts to connect to Pulsar and begins to send messages: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-data") + .sendTimeout(3, TimeUnit.SECONDS) + .create(); + +``` + +The table below lists the possible scenarios when this connection attempt occurs and what happens in each scenario: + +| Scenario | What happens | +| --- | --- | +|
  • No schema exists for the topic.
  • | (1) The producer is created using the given schema. (2) Since no existing schema is compatible with the `SensorReading` schema, the schema is transmitted to the broker and stored. (3) Any consumer created using the same schema or topic can consume messages from the `sensor-data` topic. | +|
  • A schema already exists.
  • The producer connects using the same schema that is already stored.
  • | (1) The schema is transmitted to the broker. (2) The broker determines that the schema is compatible. (3) The broker attempts to store the schema in [BookKeeper](concepts-architecture-overview.md#persistent-storage) but then determines that it's already stored, so it is used to tag produced messages. |
  • A schema already exists.
  • The producer connects using a new schema that is compatible.
  • | (1) The schema is transmitted to the broker. (2) The broker determines that the schema is compatible and stores the new schema as the current version (with a new version number). | + +## How does schema work + +Pulsar schemas are applied and enforced at the **topic** level (schemas cannot be applied at the namespace or tenant level). + +Producers and consumers upload schemas to brokers, so Pulsar schemas work on the producer side and the consumer side. + +### Producer side + +This diagram illustrates how does schema work on the Producer side. + +![Schema works at the producer side](/assets/schema-producer.png) + +1. The application uses a schema instance to construct a producer instance. + + The schema instance defines the schema for the data being produced using the producer instance. + + Take AVRO as an example, Pulsar extracts schema definition from the POJO class and constructs the `SchemaInfo` that the producer needs to pass to a broker when it connects. + +2. The producer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker looks up the schema in the schema storage to check if it is already a registered schema. + +4. If yes, the broker skips the schema validation since it is a known schema, and returns the schema version to the producer. + +5. If no, the broker verifies whether a schema can be automatically created in this namespace: + + * If `isAllowAutoUpdateSchema` sets to **true**, then a schema can be created, and the broker validates the schema based on the schema compatibility check strategy defined for the topic. + + * If `isAllowAutoUpdateSchema` sets to **false**, then a schema can not be created, and the producer is rejected to connect to the broker. + +**Tip**: + +`isAllowAutoUpdateSchema` can be set via **Pulsar admin API** or **REST API.** + +For how to set `isAllowAutoUpdateSchema` via Pulsar admin API, see [Manage AutoUpdate Strategy](schema-manage.md/#manage-autoupdate-strategy). 
+ +6. If the schema is allowed to be updated, then the compatible strategy check is performed. + + * If the schema is compatible, the broker stores it and returns the schema version to the producer. + + All the messages produced by this producer are tagged with the schema version. + + * If the schema is incompatible, the broker rejects it. + +### Consumer side + +This diagram illustrates how does Schema work on the consumer side. + +![Schema works at the consumer side](/assets/schema-consumer.png) + +1. The application uses a schema instance to construct a consumer instance. + + The schema instance defines the schema that the consumer uses for decoding messages received from a broker. + +2. The consumer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker determines whether the topic has one of them (a schema/data/a local consumer and a local producer). + +4. If a topic does not have all of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` sets to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` sets to **false**, then the consumer is rejected to connect to a broker. + +5. If a topic has one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is rejected to connect to the broker. + +6. The consumer receives messages from the broker. + + If the schema used by the consumer supports schema versioning (for example, AVRO schema), the consumer fetches the `SchemaInfo` of the version tagged in messages and uses the passed-in schema and the schema tagged in messages to decode the messages. 
diff --git a/site2/website/versioned_docs/version-2.8.x/security-athenz.md b/site2/website/versioned_docs/version-2.8.x/security-athenz.md new file mode 100644 index 0000000000000..8a39fe25316d0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-athenz.md @@ -0,0 +1,98 @@ +--- +id: security-athenz +title: Authentication using Athenz +sidebar_label: "Authentication using Athenz" +original_id: security-athenz +--- + +[Athenz](https://github.com/AthenZ/athenz) is a role-based authentication/authorization system. In Pulsar, you can use Athenz role tokens (also known as *z-tokens*) to establish the identity of the client. + +## Athenz authentication settings + +A [decentralized Athenz system](https://github.com/AthenZ/athenz/blob/master/docs/decent_authz_flow.md) contains an [authori**Z**ation **M**anagement **S**ystem](https://github.com/AthenZ/athenz/blob/master/docs/setup_zms.md) (ZMS) server and an [authori**Z**ation **T**oken **S**ystem](https://github.com/AthenZ/athenz/blob/master/docs/setup_zts) (ZTS) server. + +To begin, you need to set up Athenz service access control. You need to create domains for the *provider* (which provides some resources to other services with some authentication/authorization policies) and the *tenant* (which is provisioned to access some resources in a provider). In this case, the provider corresponds to the Pulsar service itself and the tenant corresponds to each application using Pulsar (typically, a [tenant](reference-terminology.md#tenant) in Pulsar). + +### Create the tenant domain and service + +On the [tenant](reference-terminology.md#tenant) side, you need to do the following things: + +1. Create a domain, such as `shopping` +2. Generate a private/public key pair +3.
Create a service, such as `some_app`, on the domain with the public key + +Note that you need to specify the private key generated in step 2 when the Pulsar client connects to the [broker](reference-terminology.md#broker) (see client configuration examples for [Java](client-libraries-java.md#tls-authentication) and [C++](client-libraries-cpp.md#tls-authentication)). + +For more specific steps involving the Athenz UI, refer to [Example Service Access Control Setup](https://github.com/AthenZ/athenz/blob/master/docs/example_service_athenz_setup.md#client-tenant-domain). + +### Create the provider domain and add the tenant service to some role members + +On the provider side, you need to do the following things: + +1. Create a domain, such as `pulsar` +2. Create a role +3. Add the tenant service to members of the role + +Note that you can specify any action and resource in step 2 since they are not used on Pulsar. In other words, Pulsar uses the Athenz role token only for authentication, *not* for authorization. + +For more specific steps involving UI, refer to [Example Service Access Control Setup](https://github.com/AthenZ/athenz/blob/master/docs/example_service_athenz_setup.md#server-provider-domain). + +## Configure the broker for Athenz + +> ### TLS encryption +> +> Note that when you are using Athenz as an authentication provider, you had better use TLS encryption +> as it can protect role tokens from being intercepted and reused. (for more details involving TLS encryption see [Architecture - Data Model](https://github.com/AthenZ/athenz/blob/master/docs/data_model)). + +In the `conf/broker.conf` configuration file in your Pulsar installation, you need to provide the class name of the Athenz authentication provider as well as a comma-separated list of provider domain names. 
+ +```properties + +# Add the Athenz auth provider +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderAthenz +athenzDomainNames=pulsar + +# Enable TLS +tlsEnabled=true +tlsCertificateFilePath=/path/to/broker-cert.pem +tlsKeyFilePath=/path/to/broker-key.pem + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +brokerClientAuthenticationParameters={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} + +``` + +> A full listing of parameters is available in the `conf/broker.conf` file, you can also find the default +> values for those parameters in [Broker Configuration](reference-configuration.md#broker). + +## Configure clients for Athenz + +For more information on Pulsar client authentication using Athenz, see the following language-specific docs: + +* [Java client](client-libraries-java.md#athenz) + +## Configure CLI tools for Athenz + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ +You need to add the following authentication parameters to the `conf/client.conf` config file to use Athenz with CLI tools of Pulsar: + +```properties + +# URL for the broker +serviceUrl=https://broker.example.com:8443/ + +# Set Athenz auth plugin and its parameters +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +authParams={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} + +# Enable TLS +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/cacert.pem + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/security-authorization.md b/site2/website/versioned_docs/version-2.8.x/security-authorization.md new file mode 100644 index 0000000000000..7ac09b6f439ea --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-authorization.md @@ -0,0 +1,114 @@ +--- +id: security-authorization +title: Authentication and authorization in Pulsar +sidebar_label: "Authorization and ACLs" +original_id: security-authorization +--- + + +In Pulsar, the [authentication provider](security-overview.md#authentication-providers) is responsible for properly identifying clients and associating the clients with [role tokens](security-overview.md#role-tokens). If you only enable authentication, an authenticated role token has the ability to access all resources in the cluster. *Authorization* is the process that determines *what* clients are able to do. + +The role tokens with the most privileges are the *superusers*. The *superusers* can create and destroy tenants, along with having full access to all tenant resources. + +When a superuser creates a [tenant](reference-terminology.md#tenant), that tenant is assigned an admin role. A client with the admin role token can then create, modify and destroy namespaces, and grant and revoke permissions to *other role tokens* on those namespaces. 
+ +## Broker and Proxy Setup + +### Enable authorization and assign superusers +You can enable the authorization and assign the superusers in the broker ([`conf/broker.conf`](reference-configuration.md#broker)) configuration files. + +```properties + +authorizationEnabled=true +superUserRoles=my-super-user-1,my-super-user-2 + +``` + +> A full list of parameters is available in the `conf/broker.conf` file. +> You can also find the default values for those parameters in [Broker Configuration](reference-configuration.md#broker). + +Typically, you use superuser roles for administrators, clients as well as broker-to-broker authorization. When you use [geo-replication](concepts-replication.md), every broker needs to be able to publish to all the other topics of clusters. + +You can also enable the authorization for the proxy in the proxy configuration file (`conf/proxy.conf`). Once you enable the authorization on the proxy, the proxy does an additional authorization check before forwarding the request to a broker. +If you enable authorization on the broker, the broker checks the authorization of the request when the broker receives the forwarded request. + +### Proxy Roles + +By default, the broker treats the connection between a proxy and the broker as a normal user connection. The broker authenticates the user as the role configured in `proxy.conf`(see ["Enable TLS Authentication on Proxies"](security-tls-authentication.md#enable-tls-authentication-on-proxies)). However, when the user connects to the cluster through a proxy, the user rarely requires the authentication. The user expects to be able to interact with the cluster as the role for which they have authenticated with the proxy. + +Pulsar uses *Proxy roles* to enable the authentication. Proxy roles are specified in the broker configuration file, [`conf/broker.conf`](reference-configuration.md#broker). 
If a client that is authenticated with a broker is one of its ```proxyRoles```, all requests from that client must also carry information about the role of the client that is authenticated with the proxy. This information is called the *original principal*. If the *original principal* is absent, the client is not able to access anything. + +You must authorize both the *proxy role* and the *original principal* to access a resource to ensure that the resource is accessible via the proxy. Administrators can take two approaches to authorize the *proxy role* and the *original principal*. + +The more secure approach is to grant access to the proxy roles each time you grant access to a resource. For example, if you have a proxy role named `proxy1`, when the superuser creates a tenant, you should specify `proxy1` as one of the admin roles. When a role is granted permissions to produce or consume from a namespace, if that client wants to produce or consume through a proxy, you should also grant `proxy1` the same permissions. + +Another approach is to make the proxy role a superuser. This allows the proxy to access all resources. The client still needs to authenticate with the proxy, and all requests made through the proxy have their role downgraded to the *original principal* of the authenticated client. However, if the proxy is compromised, a bad actor could get full access to your cluster. + +You can specify the roles as proxy roles in [`conf/broker.conf`](reference-configuration.md#broker). + +```properties + +proxyRoles=my-proxy-role + +# if you want to allow superusers to use the proxy (see above) +superUserRoles=my-super-user-1,my-super-user-2,my-proxy-role + +``` + +## Administer tenants + +Pulsar [instance](reference-terminology.md#instance) administrators or some kind of self-service portal typically provisions a Pulsar [tenant](reference-terminology.md#tenant). + +You can manage tenants using the [`pulsar-admin`](reference-pulsar-admin.md) tool. 
+ +### Create a new tenant + +The following is an example tenant creation command: + +```shell + +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east + +``` + +This command creates a new tenant `my-tenant` that is allowed to use the clusters `us-west` and `us-east`. + +A client that successfully identifies itself as having the role `my-admin-role` is allowed to perform all administrative tasks on this tenant. + +The structure of topic names in Pulsar reflects the hierarchy between tenants, clusters, and namespaces: + +```shell + +persistent://tenant/namespace/topic + +``` + +### Manage permissions + +You can use [Pulsar Admin Tools](admin-api-permissions.md) for managing permission in Pulsar. + +### Pulsar admin authentication + +```java + +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("http://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .build(); + +``` + +To use TLS: + +```java + +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("https://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .tlsTrustCertsFilePath("/path/to/trust/cert") + .build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/security-basic-auth.md b/site2/website/versioned_docs/version-2.8.x/security-basic-auth.md new file mode 100644 index 0000000000000..5cce10fdc3fb0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-basic-auth.md @@ -0,0 +1,155 @@ +--- +id: security-basic-auth +title: Authentication using HTTP basic +sidebar_label: "Authentication using HTTP basic" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + +[Basic authentication](https://en.wikipedia.org/wiki/Basic_access_authentication) is a simple authentication scheme built into the HTTP protocol, which uses base64-encoded username and password pairs as credentials. 
+ +## Prerequisites + +Install [`htpasswd`](https://httpd.apache.org/docs/2.4/programs/htpasswd.html) in your environment to create a password file for storing username-password pairs. + +* For Ubuntu/Debian, run the following command to install `htpasswd`. + + ``` + apt install apache2-utils + ``` + +* For CentOS/RHEL, run the following command to install `htpasswd`. + + ``` + yum install httpd-tools + ``` + +## Create your authentication file + +:::note + +Currently, you can use MD5 (recommended) and CRYPT encryption to authenticate your password. + +::: + +Create a password file named `.htpasswd` with a user account `superuser/admin`: +* Use MD5 encryption (recommended): + + ``` + htpasswd -cmb /path/to/.htpasswd superuser admin + ``` + +* Use CRYPT encryption: + + ``` + htpasswd -cdb /path/to/.htpasswd superuser admin + ``` + +You can preview the content of your password file by running the following command: + +``` +cat path/to/.htpasswd +superuser:$apr1$GBIYZYFZ$MzLcPrvoUky16mLcK6UtX/ +``` + +## Enable basic authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to the `conf/broker.conf` file. If you use a standalone Pulsar, you need to add these parameters to the `conf/standalone.conf` file. + +```conf +# Configuration to enable Basic authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderBasic +basicAuthConf=file:///path/to/.htpasswd +# basicAuthConf=/path/to/.htpasswd +# When use the base64 format, you need to encode the .htpaswd content to bas64 +# basicAuthConf=data:;base64,YOUR-BASE64 +# basicAuthConf=YOUR-BASE64 +# Authentication settings of the broker itself. 
Used when the broker connects to other brokers, either in same or other clusters +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +brokerClientAuthenticationParameters={"userId":"superuser","password":"admin"} +# If this flag is set then the broker authenticates the original Auth data +# else it just accepts the originalPrincipal and authorizes it (if required). +authenticateOriginalAuthData=true +``` + +:::note + +You can also set an environment variable named `PULSAR_EXTRA_OPTS` and the value is `-Dpulsar.auth.basic.conf=/path/to/.htpasswd`. Pulsar reads this environment variable to implement HTTP basic authentication. + +::: + +## Enable basic authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to the `conf/proxy.conf` file. + +```conf +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderBasic +basicAuthConf=file:///path/to/.htpasswd +# basicAuthConf=/path/to/.htpasswd +# When use the base64 format, you need to encode the .htpaswd content to bas64 +# basicAuthConf=data:;base64,YOUR-BASE64 +# basicAuthConf=YOUR-BASE64 +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +brokerClientAuthenticationParameters={"userId":"superuser","password":"admin"} +# Whether client authorization credentials are forwarded to the broker for re-authorization. +# Authentication must be enabled via authenticationEnabled=true for this to take effect. +forwardAuthorizationCredentials=true +``` + +:::note + +You can also set an environment variable named `PULSAR_EXTRA_OPTS` and the value is `-Dpulsar.auth.basic.conf=/path/to/.htpasswd`. Pulsar reads this environment variable to implement HTTP basic authentication. 
+ +::: + +## Configure basic authentication in CLI tools + +[Command-line tools](/docs/next/reference-cli-tools), such as [Pulsar-admin](/tools/pulsar-admin/), [Pulsar-perf](/tools/pulsar-perf/) and [Pulsar-client](/tools/pulsar-client/), use the `conf/client.conf` file in your Pulsar installation. To configure basic authentication in Pulsar CLI tools, you need to add the following parameters to the `conf/client.conf` file. + +```conf +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +authParams={"userId":"superuser","password":"admin"} +``` + + +## Configure basic authentication in Pulsar clients + +The following example shows how to configure basic authentication when using Pulsar clients. + + + + + ```java + AuthenticationBasic auth = new AuthenticationBasic(); + auth.configure("{\"userId\":\"superuser\",\"password\":\"admin\"}"); + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650") + .authentication(auth) + .build(); + ``` + + + + + ```c++ + #include <pulsar/Client.h> + + int main() { + pulsar::ClientConfiguration config; + pulsar::AuthenticationPtr auth = pulsar::AuthBasic::create("admin", "123456"); + config.setAuth(auth); + pulsar::Client client("pulsar://broker.example.com:6650/", config); + + return 0; + } + ``` + + + diff --git a/site2/website/versioned_docs/version-2.8.x/security-bouncy-castle.md b/site2/website/versioned_docs/version-2.8.x/security-bouncy-castle.md new file mode 100644 index 0000000000000..be937055d8e31 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-bouncy-castle.md @@ -0,0 +1,157 @@ +--- +id: security-bouncy-castle +title: Bouncy Castle Providers +sidebar_label: "Bouncy Castle Providers" +original_id: security-bouncy-castle +--- + +## BouncyCastle Introduce + +`Bouncy Castle` is a Java library that complements the default Java Cryptographic Extension (JCE), +and it provides more cipher suites and algorithms than the default JCE provided by Sun. 
+ +In addition to that, `Bouncy Castle` has lots of utilities for reading arcane formats like PEM and ASN.1 that no sane person would want to rewrite themselves. + +In Pulsar, security and crypto have dependencies on BouncyCastle Jars. For details on installing and configuring Bouncy Castle FIPS, see [BC FIPS Documentation](https://www.bouncycastle.org/documentation.html), especially the **User Guides** and **Security Policy** PDFs. + +`Bouncy Castle` provides both [FIPS](https://www.bouncycastle.org/fips_faq.html) and non-FIPS versions. But in a JVM, you cannot include both versions at the same time; you need to exclude the current version before including the other. + +In Pulsar, the security and crypto methods also depend on `Bouncy Castle`, especially in [TLS Authentication](security-tls-authentication.md) and [Transport Encryption](security-encryption.md). This document describes the configuration for the BouncyCastle FIPS (BC-FIPS) and non-FIPS (BC-non-FIPS) versions when using Pulsar. + +## How BouncyCastle modules packaged in Pulsar + +In Pulsar's `bouncy-castle` module, we provide 2 sub-modules: `bouncy-castle-bc` (for the non-FIPS version) and `bouncy-castle-bcfips` (for the FIPS version), to package BC jars together to make the include and exclude of `Bouncy Castle` easier. + +To achieve this goal, we need to package several `bouncy-castle` jars together into the `bouncy-castle-bc` or `bouncy-castle-bcfips` jar. +Each of the original bouncy-castle jars is security-related, so BouncyCastle dutifully supplies a signed version of each JAR. +But when we do the re-package, Maven shade explodes the BouncyCastle jar file which puts the signatures into META-INF, +these signatures aren't valid for this new, uber-jar (signatures are only for the original BC jar). +Usually, you will see an error like `java.lang.SecurityException: Invalid signature file digest for Manifest main attributes`. 
+ +You could exclude these signatures in mvn pom file to avoid above error, by + +```access transformers + +META-INF/*.SF +META-INF/*.DSA +META-INF/*.RSA + +``` + +But it can also lead to new, cryptic errors, e.g. `java.security.NoSuchAlgorithmException: PBEWithSHA256And256BitAES-CBC-BC SecretKeyFactory not available` +By explicitly specifying where to find the algorithm like this: `SecretKeyFactory.getInstance("PBEWithSHA256And256BitAES-CBC-BC","BC")` +It will get the real error: `java.security.NoSuchProviderException: JCE cannot authenticate the provider BC` + +So, we used a [executable packer plugin](https://github.com/nthuemmel/executable-packer-maven-plugin) that uses a jar-in-jar approach to preserve the BouncyCastle signature in a single, executable jar. + +### Include dependencies of BC-non-FIPS + +Pulsar module `bouncy-castle-bc`, which defined by `bouncy-castle/bc/pom.xml` contains the needed non-FIPS jars for Pulsar, and packaged as a jar-in-jar(need to provide `pkg`). + +```xml + + + org.bouncycastle + bcpkix-jdk15on + ${bouncycastle.version} + + + + org.bouncycastle + bcprov-ext-jdk15on + ${bouncycastle.version} + + +``` + +By using this `bouncy-castle-bc` module, you can easily include and exclude BouncyCastle non-FIPS jars. + +### Modules that include BC-non-FIPS module (`bouncy-castle-bc`) + +For Pulsar client, user need the bouncy-castle module, so `pulsar-client-original` will include the `bouncy-castle-bc` module, and have `pkg` set to reference the `jar-in-jar` package. +It is included as following example: + +```xml + + + org.apache.pulsar + bouncy-castle-bc + ${pulsar.version} + pkg + + +``` + +By default `bouncy-castle-bc` already included in `pulsar-client-original`, And `pulsar-client-original` has been included in a lot of other modules like `pulsar-client-admin`, `pulsar-broker`. 
+But for the above shaded jar and signatures reason, we should not package Pulsar's `bouncy-castle` module into `pulsar-client-all` other shaded modules directly, such as `pulsar-client-shaded`, `pulsar-client-admin-shaded` and `pulsar-broker-shaded`. +So in the shaded modules, we will exclude the `bouncy-castle` modules. + +```xml + + + + org.apache.pulsar:pulsar-client-original + + ** + + + org/bouncycastle/** + + + + +``` + +That means, `bouncy-castle` related jars are not shaded in these fat jars. + +### Module BC-FIPS (`bouncy-castle-bcfips`) + +Pulsar module `bouncy-castle-bcfips`, which defined by `bouncy-castle/bcfips/pom.xml` contains the needed FIPS jars for Pulsar. +Similar to `bouncy-castle-bc`, `bouncy-castle-bcfips` also packaged as a `jar-in-jar` package for easy include/exclude. + +```xml + + + org.bouncycastle + bc-fips + ${bouncycastlefips.version} + + + + org.bouncycastle + bcpkix-fips + ${bouncycastlefips.version} + + +``` + +### Exclude BC-non-FIPS and include BC-FIPS + +If you want to switch from BC-non-FIPS to BC-FIPS version, Here is an example for `pulsar-broker` module: + +```xml + + + org.apache.pulsar + pulsar-broker + ${pulsar.version} + + + org.apache.pulsar + bouncy-castle-bc + + + + + + org.apache.pulsar + bouncy-castle-bcfips + ${pulsar.version} + pkg + + +``` + + +For more example, you can reference module `bcfips-include-test`. + diff --git a/site2/website/versioned_docs/version-2.8.x/security-encryption.md b/site2/website/versioned_docs/version-2.8.x/security-encryption.md new file mode 100644 index 0000000000000..c2f3530d94d9e --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-encryption.md @@ -0,0 +1,200 @@ +--- +id: security-encryption +title: Pulsar Encryption +sidebar_label: "End-to-End Encryption" +original_id: security-encryption +--- + +Applications can use Pulsar encryption to encrypt messages on the producer side and decrypt messages on the consumer side. 
You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. + +## Asymmetric and symmetric encryption + +Pulsar uses a dynamically generated symmetric AES key to encrypt messages(data). You can use the application-provided ECDSA (Elliptic Curve Digital Signature Algorithm) or RSA (Rivest–Shamir–Adleman) key pair to encrypt the AES key(data key), so you do not have to share the secret with everyone. + +Key is a public and private key pair used for encryption or decryption. The producer key is the public key of the key pair, and the consumer key is the private key of the key pair. + +The application configures the producer with the public key. You can use this key to encrypt the AES data key. The encrypted data key is sent as part of message header. Only entities with the private key (in this case the consumer) are able to decrypt the data key which is used to decrypt the message. + +You can encrypt a message with more than one key. Any one of the keys used for encrypting the message is sufficient to decrypt the message. + +Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose or delete the private key, your message is irretrievably lost, and is unrecoverable. + +## Producer +![alt text](/assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer") + +## Consumer +![alt text](/assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer") + +## Get started + +1. Create your ECDSA or RSA public and private key pair by using the following commands. 
+ * ECDSA(for Java clients only) + + ```shell + + openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem + openssl ec -in test_ecdsa_privkey.pem -pubout -outform pem -out test_ecdsa_pubkey.pem + + ``` + + * RSA (for C++, Python and Node.js clients) + + ```shell + + openssl genrsa -out test_rsa_privkey.pem 2048 + openssl rsa -in test_rsa_privkey.pem -pubout -outform pkcs8 -out test_rsa_pubkey.pem + + ``` + +2. Add the public and private key to the key management and configure your producers to retrieve public keys and consumers clients to retrieve private keys. + +3. Implement the `CryptoKeyReader` interface, specifically `CryptoKeyReader.getPublicKey()` for producer and `CryptoKeyReader.getPrivateKey()` for consumer, which Pulsar client invokes to load the key. + +4. Add the encryption key name to the producer builder: PulsarClient.newProducer().addEncryptionKey("myapp.key"). + +5. Add CryptoKeyReader implementation to producer or consumer builder: PulsarClient.newProducer().cryptoKeyReader(keyReader) / PulsarClient.newConsumer().cryptoKeyReader(keyReader). + +6. 
Sample producer application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); + +Producer producer = pulsarClient.newProducer() + .topic("persistent://my-tenant/my-ns/my-topic") + .addEncryptionKey("myappkey") + .cryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")) + .create(); + +for (int i = 0; i < 10; i++) { + producer.send("my-message".getBytes()); +} + +producer.close(); +pulsarClient.close(); + +``` + +7. 
Sample Consumer Application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); +Consumer consumer = pulsarClient.newConsumer() + .topic("persistent://my-tenant/my-ns/my-topic") + .subscriptionName("my-subscriber-name") + .cryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")) + .subscribe(); +Message msg = null; + +for (int i = 0; i < 10; i++) { + msg = consumer.receive(); + // do something + System.out.println("Received: " + new String(msg.getData())); +} + +// Acknowledge the consumption of all messages at once +consumer.acknowledgeCumulative(msg); +consumer.close(); +pulsarClient.close(); + +``` + +## Key rotation +Pulsar generates a new AES data key every 4 hours or after publishing a certain number of messages. A producer fetches the asymmetric public key every 4 hours by calling CryptoKeyReader.getPublicKey() to retrieve the latest version. 
+ +## Enable encryption at the producer application +If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. You can do this in two ways: +1. The consumer application provides you access to their public key, which you add to your producer keys. +2. You grant access to one of the private keys from the pairs that the producer uses. + +When producers want to encrypt the messages with multiple keys, producers add all such keys to the config. A consumer can decrypt the message as long as the consumer has access to at least one of the keys. + +If you need to encrypt the messages using 2 keys (`myapp.messagekey1` and `myapp.messagekey2`), refer to the following example. + +```java + +PulsarClient.newProducer().addEncryptionKey("myapp.messagekey1").addEncryptionKey("myapp.messagekey2"); + +``` + +## Decrypt encrypted messages at the consumer application +Consumers require access to one of the private keys to decrypt messages that the producer produces. If you want to receive encrypted messages, create a public and private key pair and give your public key to the producer application to encrypt messages using your public key. + +## Handle failures +* Producer/Consumer loses access to the key + * Producer action fails, indicating the cause of the failure. Application has the option to proceed with sending unencrypted messages in such cases. Call `PulsarClient.newProducer().cryptoFailureAction(ProducerCryptoFailureAction)` to control the producer behavior. The default behavior is to fail the request. + * If consumption fails due to decryption failure or missing keys in consumer, the application has the option to consume the encrypted message or discard it. Call `PulsarClient.newConsumer().cryptoFailureAction(ConsumerCryptoFailureAction)` to control the consumer behavior. The default behavior is to fail the request. 
Application is never able to decrypt the messages if the private key is permanently lost. +* Batch messaging + * If decryption fails and the message contains batch messages, client is not able to retrieve individual messages in the batch, hence message consumption fails even if cryptoFailureAction() is set to `ConsumerCryptoFailureAction.CONSUME`. +* If decryption fails, the message consumption stops and the application notices backlog growth in addition to decryption failure messages in the client log. If the application does not have access to the private key to decrypt the message, the only option is to skip or discard backlogged messages. diff --git a/site2/website/versioned_docs/version-2.8.x/security-extending.md b/site2/website/versioned_docs/version-2.8.x/security-extending.md new file mode 100644 index 0000000000000..e7484453b8beb --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-extending.md @@ -0,0 +1,207 @@ +--- +id: security-extending +title: Extending Authentication and Authorization in Pulsar +sidebar_label: "Extending" +original_id: security-extending +--- + +Pulsar provides a way to use custom authentication and authorization mechanisms. + +## Authentication + +Pulsar supports mutual TLS and Athenz authentication plugins. For how to use these authentication plugins, you can refer to the description in [Security](security-overview.md). + +You can use a custom authentication mechanism by providing the implementation in the form of two plugins. One plugin is for the Client library and the other plugin is for the Pulsar Proxy and/or Pulsar Broker to validate the credentials. + +### Client authentication plugin + +For the client library, you need to implement `org.apache.pulsar.client.api.Authentication`. 
You can pass this class when you create a Pulsar client, as shown in the following code: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .authentication(new MyAuthentication()) + .build(); + +``` + +You can implement 2 interfaces on the client side: + * `Authentication` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/Authentication.html + * `AuthenticationDataProvider` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/AuthenticationDataProvider.html + + +This in turn needs to provide the client credentials in the form of `org.apache.pulsar.client.api.AuthenticationDataProvider`. This leaves the chance to return different kinds of authentication tokens for different types of connections, or to pass a certificate chain to use for TLS. + + +You can find examples for client authentication providers at: + + * Mutual TLS Auth -- https://github.com/apache/pulsar/tree/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth + * Athenz -- https://github.com/apache/pulsar/tree/master/pulsar-client-auth-athenz/src/main/java/org/apache/pulsar/client/impl/auth + +### Proxy/Broker authentication plugin + +On the proxy/broker side, you need to configure the corresponding plugin to validate the credentials that the client sends. The Proxy and Broker can support multiple authentication providers at the same time. 
+ +In `conf/broker.conf` you can choose to specify a list of valid providers: + +```properties + +# Authentication provider name list, which is comma separated list of class names +authenticationProviders= + +``` + +To implement `org.apache.pulsar.broker.authentication.AuthenticationProvider` on one single interface: + +```java + +/** + * Provider of authentication mechanism + */ +public interface AuthenticationProvider extends Closeable { + + /** + * Perform initialization for the authentication provider + * + * @param config + * broker config object + * @throws IOException + * if the initialization fails + */ + void initialize(ServiceConfiguration config) throws IOException; + + /** + * @return the authentication method name supported by this provider + */ + String getAuthMethodName(); + + /** + * Validate the authentication for the given credentials with the specified authentication data + * + * @param authData + * provider specific authentication data + * @return the "role" string for the authenticated connection, if the authentication was successful + * @throws AuthenticationException + * if the credentials are not valid + */ + String authenticate(AuthenticationDataSource authData) throws AuthenticationException; + +} + +``` + +The following is the example for Broker authentication plugins: + + * Mutual TLS -- https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderTls.java + * Athenz -- https://github.com/apache/pulsar/blob/master/pulsar-broker-auth-athenz/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderAthenz.java + +## Authorization + +Authorization is the operation that checks whether a particular "role" or "principal" has permission to perform a certain operation. + +By default, you can use the embedded authorization provider provided by Pulsar. You can also configure a different authorization provider through a plugin. 
+Note that although the Authentication plugin is designed for use in both the Proxy and Broker, +the Authorization plugin is designed only for use on the Broker however the Proxy does perform some simple Authorization checks of Roles if authorization is enabled. + +To provide a custom provider, you need to implement the `org.apache.pulsar.broker.authorization.AuthorizationProvider` interface, put this class in the Pulsar broker classpath and configure the class in `conf/broker.conf`: + + ```properties + + # Authorization provider fully qualified class-name + authorizationProvider=org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider + + ``` + +```java + +/** + * Provider of authorization mechanism + */ +public interface AuthorizationProvider extends Closeable { + + /** + * Perform initialization for the authorization provider + * + * @param conf + * broker config object + * @param configCache + * pulsar zk configuration cache service + * @throws IOException + * if the initialization fails + */ + void initialize(ServiceConfiguration conf, ConfigurationCacheService configCache) throws IOException; + + /** + * Check if the specified role has permission to send messages to the specified fully qualified topic name. + * + * @param topicName + * the fully qualified topic name associated with the topic. + * @param role + * the app id used to send messages to the topic. + */ + CompletableFuture canProduceAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData); + + /** + * Check if the specified role has permission to receive messages from the specified fully qualified topic name. + * + * @param topicName + * the fully qualified topic name associated with the topic. + * @param role + * the app id used to receive messages from the topic. 
+ * @param subscription + * the subscription name defined by the client + */ + CompletableFuture canConsumeAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData, String subscription); + + /** + * Check whether the specified role can perform a lookup for the specified topic. + * + * For that the caller needs to have producer or consumer permission. + * + * @param topicName + * @param role + * @return + * @throws Exception + */ + CompletableFuture canLookupAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData); + + /** + * + * Grant authorization-action permission on a namespace to the given client + * + * @param namespace + * @param actions + * @param role + * @param authDataJson + * additional authdata in json format + * @return CompletableFuture + * @completesWith
    + * IllegalArgumentException when namespace not found
    + * IllegalStateException when failed to grant permission + */ + CompletableFuture grantPermissionAsync(NamespaceName namespace, Set actions, String role, + String authDataJson); + + /** + * Grant authorization-action permission on a topic to the given client + * + * @param topicName + * @param role + * @param authDataJson + * additional authdata in json format + * @return CompletableFuture + * @completesWith
    + * IllegalArgumentException when namespace not found
    + * IllegalStateException when failed to grant permission + */ + CompletableFuture grantPermissionAsync(TopicName topicName, Set actions, String role, + String authDataJson); + +} + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/security-jwt.md b/site2/website/versioned_docs/version-2.8.x/security-jwt.md new file mode 100644 index 0000000000000..1fa65b7c27f60 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-jwt.md @@ -0,0 +1,331 @@ +--- +id: security-jwt +title: Client authentication using tokens based on JSON Web Tokens +sidebar_label: "Authentication using JWT" +original_id: security-jwt +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +## Token authentication overview + +Pulsar supports authenticating clients using security tokens that are based on [JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). + +You can use tokens to identify a Pulsar client and associate with some "principal" (or "role") that +is permitted to do some actions (eg: publish to a topic or consume from a topic). + +A user typically gets a token string from the administrator (or some automated service). + +The compact representation of a signed JWT is a string that looks like as the following: + +``` + +eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +Application specifies the token when you create the client instance. An alternative is to pass a "token supplier" (a function that returns the token when the client library needs one). + +> #### Always use TLS transport encryption +> Sending a token is equivalent to sending a password over the wire. You had better use TLS encryption all the time when you connect to the Pulsar service. See +> [Transport Encryption using TLS](security-tls-transport.md) for more details. 
+ +### CLI Tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use token authentication with the Pulsar CLI tools: + +```properties + +webServiceUrl=http://broker.example.com:8080/ +brokerServiceUrl=pulsar://broker.example.com:6650/ +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +authParams=token:eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +The token string can also be read from a file, for example: + +``` + +authParams=file:///path/to/token/file + +``` + +### Pulsar client + +You can use tokens to authenticate the following Pulsar clients. + +````mdx-code-block + + + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")) + .build(); + +``` + +Similarly, you can also pass a `Supplier`: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token(() -> { + // Read token from custom source + return readToken(); + })) + .build(); + +``` + + + + +```python + +from pulsar import Client, AuthenticationToken + +client = Client('pulsar://broker.example.com:6650/', + authentication=AuthenticationToken('eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY')) + +``` + +Alternatively, you can also pass a `Supplier`: + +```python + +def read_token(): + with open('/path/to/token.txt') as tf: + return tf.read().strip() + +client = Client('pulsar://broker.example.com:6650/', + 
authentication=AuthenticationToken(read_token)) + +``` + + + + +```go + +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication: NewAuthenticationToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY"), +}) + +``` + +Similarly, you can also pass a `Supplier`: + +```go + +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication: NewAuthenticationTokenSupplier(func () string { + // Read token from custom source + return readToken() + }), +}) + +``` + + + + +```c++ + +#include + +pulsar::ClientConfiguration config; +config.setAuth(pulsar::AuthToken::createWithToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")); + +pulsar::Client client("pulsar://broker.example.com:6650/", config); + +``` + + + + +```c# + +var client = PulsarClient.Builder() + .AuthenticateUsingToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY") + .Build(); + +``` + + + + +```` + +## Enable token authentication + +On how to enable token authentication on a Pulsar cluster, you can refer to the guide below. + +JWT supports two different kinds of keys in order to generate and validate the tokens: + + * Symmetric : + - You can use a single ***Secret*** key to generate and validate tokens. + * Asymmetric: A pair of keys consists of the Private key and the Public key. + - You can use ***Private*** key to generate tokens. + - You can use ***Public*** key to validate tokens. + +### Create a secret key + +When you use a secret key, the administrator creates the key and uses the key to generate the client tokens. You can also configure this key to brokers in order to validate the clients. + +Output file is generated in the root of your Pulsar installation directory. You can also provide absolute path for the output file using the command below. 
+ +```shell + +$ bin/pulsar tokens create-secret-key --output my-secret.key + +``` + +Enter this command to generate base64 encoded private key. + +```shell + +$ bin/pulsar tokens create-secret-key --output /opt/my-secret.key --base64 + +``` + +### Create a key pair + +With Public and Private keys, you need to create a pair of keys. Pulsar supports all algorithms that the Java JWT library (shown [here](https://github.com/jwtk/jjwt#signature-algorithms-keys)) supports. + +Output file is generated in the root of your Pulsar installation directory. You can also provide absolute path for the output file using the command below. + +```shell + +$ bin/pulsar tokens create-key-pair --output-private-key my-private.key --output-public-key my-public.key + +``` + + * Store `my-private.key` in a safe location and only administrator can use `my-private.key` to generate new tokens. + * `my-public.key` is distributed to all Pulsar brokers. You can publicly share this file without any security concern. + +### Generate tokens + +A token is the credential associated with a user. The association is done through the "principal" or "role". In the case of JWT tokens, this field is typically referred as **subject**, though they are exactly the same concept. + +Then, you need to use this command to require the generated token to have a **subject** field set. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user + +``` + +This command prints the token string on stdout. + +Similarly, you can create a token by passing the "private" key using the command below: + +```shell + +$ bin/pulsar tokens create --private-key file:///path/to/my-private.key \ + --subject test-user + +``` + +Finally, you can enter the following command to create a token with a pre-defined TTL. And then the token is automatically invalidated. 
+ +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user \ + --expiry-time 1y + +``` + +### Authorization + +The token itself does not have any permission associated. The authorization engine determines whether the token should have permissions or not. Once you have created the token, you can grant permission for this token to do certain actions. The following is an example. + +```shell + +$ bin/pulsar-admin namespaces grant-permission my-tenant/my-namespace \ + --role test-user \ + --actions produce,consume + +``` + +### Enable token authentication on Brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`: + +```properties + +# Configuration to enable authentication and authorization +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Either configure the token string or specify to read it from a file. The following three available formats are all valid: +# brokerClientAuthenticationParameters={"token":"your-token-string"} +# brokerClientAuthenticationParameters=token:your-token-string +# brokerClientAuthenticationParameters=file:///path/to/token +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem + +# If this flag is set then the broker authenticates the original Auth data +# else it just accepts the originalPrincipal and authorizes it (if required). 
+authenticateOriginalAuthData=true + +# If using secret key (Note: key files must be DER-encoded) +tokenSecretKey=file:///path/to/secret.key +# The key can also be passed inline: +# tokenSecretKey=data:;base64,FLFyW0oLJ2Fi22KKCm21J18mbAdztfSHN/lAT5ucEKU= + +# If using public/private (Note: key files must be DER-encoded) +# tokenPublicKey=file:///path/to/public.key + +``` + +### Enable token authentication on Proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`: + +The proxy uses its own token when connecting to brokers. You need to configure the role token for this key pair in the `proxyRoles` of the brokers. For more details, see the [authorization guide](security-authorization.md). + +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken +tokenSecretKey=file:///path/to/secret.key + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Either configure the token string or specify to read it from a file. The following three available formats are all valid: +# brokerClientAuthenticationParameters={"token":"your-token-string"} +# brokerClientAuthenticationParameters=token:your-token-string +# brokerClientAuthenticationParameters=file:///path/to/token + +# Whether client authorization credentials are forwarded to the broker for re-authorization. +# Authentication must be enabled via authenticationEnabled=true for this to take effect. 
+forwardAuthorizationCredentials=true + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/security-kerberos.md b/site2/website/versioned_docs/version-2.8.x/security-kerberos.md new file mode 100644 index 0000000000000..c49fa3bea1fce --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-kerberos.md @@ -0,0 +1,443 @@ +--- +id: security-kerberos +title: Authentication using Kerberos +sidebar_label: "Authentication using Kerberos" +original_id: security-kerberos +--- + +[Kerberos](https://web.mit.edu/kerberos/) is a network authentication protocol. By using secret-key cryptography, [Kerberos](https://web.mit.edu/kerberos/) is designed to provide strong authentication for client applications and server applications. + +In Pulsar, you can use Kerberos with [SASL](https://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer) as a choice for authentication. And Pulsar uses the [Java Authentication and Authorization Service (JAAS)](https://en.wikipedia.org/wiki/Java_Authentication_and_Authorization_Service) for SASL configuration. You need to provide JAAS configurations for Kerberos authentication. + +This document introduces how to configure `Kerberos` with `SASL` between Pulsar clients and brokers and how to configure Kerberos for Pulsar proxy in detail. + +## Configuration for Kerberos between Client and Broker + +### Prerequisites + +To begin, you need to set up (or already have) a [Key Distribution Center(KDC)](https://en.wikipedia.org/wiki/Key_distribution_center). Also you need to configure and run the [Key Distribution Center(KDC)](https://en.wikipedia.org/wiki/Key_distribution_center)in advance. + +If your organization already uses a Kerberos server (for example, by using `Active Directory`), you do not have to install a new server for Pulsar. If your organization does not use a Kerberos server, you need to install one. Your Linux vendor might have packages for `Kerberos`. 
On how to install and configure Kerberos, refer to [Ubuntu](https://help.ubuntu.com/community/Kerberos), +[Redhat](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Managing_Smart_Cards/installing-kerberos.html). + +Note that if you use Oracle Java, you need to download JCE policy files for your Java version and copy them to the `$JAVA_HOME/jre/lib/security` directory. + +#### Kerberos principals + +If you use an existing Kerberos system, ask your Kerberos administrator for a principal for each broker in your cluster and for every operating system user that accesses Pulsar with Kerberos authentication (via clients and tools). + +If you have installed your own Kerberos system, you can create these principals with the following commands: + +```shell + +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" + +``` + +Note that *Kerberos* requires that all your hosts can be resolved with their FQDNs. + +The first part of Broker principal (for example, `broker` in `broker/{hostname}@{REALM}`) is the `serverType` of each host. The suggested values of `serverType` are `broker` (host machine runs service Pulsar Broker) and `proxy` (host machine runs service Pulsar Proxy). + +#### Configure how to connect to KDC + +You need to enter the command below to specify the path to the `krb5.conf` file for the client side and the broker side. The content of `krb5.conf` file indicates the default Realm and KDC information.
See [JDK’s Kerberos Requirements](https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/KerberosReq.html) for more details. + +```shell + +-Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +Here is an example of the krb5.conf file: + +In the configuration file, `EXAMPLE.COM` is the default realm; `kdc = localhost:62037` is the KDC server URL for realm `EXAMPLE.COM`: + +``` + +[libdefaults] + default_realm = EXAMPLE.COM + +[realms] + EXAMPLE.COM = { + kdc = localhost:62037 + } + +``` + +Usually, machines configured with Kerberos already have a system-wide configuration, so this configuration is optional. + +#### JAAS configuration file + +You need a JAAS configuration file for the client side and the broker side. The JAAS configuration file provides the section of information that is used to connect to the KDC. Here is an example named `pulsar_jaas.conf`: + +``` + + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; + +``` + +You need to set the `JAAS` configuration file path as a JVM parameter for client and broker. For example: + +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf + +``` + +In the `pulsar_jaas.conf` file above + +1. `PulsarBroker` is a section name in the JAAS file that each broker uses. This section tells the broker which principal to use inside Kerberos and the location of the keytab where the principal is stored. `PulsarBroker` allows the broker to use the keytab specified in this section. +2. `PulsarClient` is a section name in the JAAS file that each client uses.
This section tells the client to use which principal inside Kerberos and the location of the keytab where the principal is stored. `PulsarClient` allows the client to use the keytab specified in this section. + The following example also reuses this `PulsarClient` section in both the Pulsar internal admin configuration and in CLI command of `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`. You can also add different sections for different use cases. + +You can have 2 separate JAAS configuration files: +* the file for a broker that has sections of both `PulsarBroker` and `PulsarClient`; +* the file for a client that only has a `PulsarClient` section. + + +### Kerberos configuration for Brokers + +#### Configure the `broker.conf` file + + In the `broker.conf` file, set Kerberos related configurations. + + - Set `authenticationEnabled` to `true`; + - Set `authenticationProviders` to choose `AuthenticationProviderSasl`; + - Set `saslJaasClientAllowedIds` regex for principal that is allowed to connect to broker; + - Set `saslJaasBrokerSectionName` that corresponds to the section in JAAS configuration file for broker; + + To make Pulsar internal admin client work properly, you need to set the configuration in the `broker.conf` file as below: + - Set `brokerClientAuthenticationPlugin` to client plugin `AuthenticationSasl`; + - Set `brokerClientAuthenticationParameters` to value in JSON string `{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}`, in which `PulsarClient` is the section name in the `pulsar_jaas.conf` file, and `"serverType":"broker"` indicates that the internal admin client connects to a Pulsar Broker; + + Here is an example: + +``` + +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker + +## Authentication settings of the broker itself. 
Used when the broker connects to other brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} + +``` + +#### Set Broker JVM parameter + + Set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. + +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_env.sh) + +You must ensure that the operating system user who starts broker can reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. + +### Kerberos configuration for clients + +#### Java Client and Java Admin Client + +In client application, include `pulsar-client-auth-sasl` in your project dependency. + +``` + + + org.apache.pulsar + pulsar-client-auth-sasl + ${pulsar.version} + + +``` + +Configure the authentication type to use `AuthenticationSasl`, and also provide the authentication parameters to it. + +You need 2 parameters: +- `saslJaasClientSectionName`. This parameter corresponds to the section in JAAS configuration file for client; +- `serverType`. This parameter stands for whether this client connects to broker or proxy. And client uses this parameter to know which server side principal should be used. + +When you authenticate between client and broker with the setting in above JAAS configuration file, we need to set `saslJaasClientSectionName` to `PulsarClient` and set `serverType` to `broker`. 
+ +The following is an example of creating a Java client: + + ```java + + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "broker"); + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + + ``` + +> The first two lines in the example above are hard coded, alternatively, you can set additional JVM parameters for JAAS and krb5 configuration file when you run the application like below: + +``` + +java -cp -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $APP-jar-with-dependencies.jar $CLASSNAME + +``` + +You must ensure that the operating system user who starts pulsar client can reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. + +#### Configure CLI tools + +If you use a command-line tool (such as `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`), you need to perform the following steps: + +Step 1. Enter the command below to configure your `client.conf`. + +```shell + +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +authParams={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} + +``` + +Step 2. Enter the command below to set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. 
+ +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_tools_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_tools_env.sh), +or add this line `OPTS="$OPTS -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf "` directly to the CLI tool script. + +The meaning of configurations is the same as the meaning of configurations in Java client section. + +## Kerberos configuration for working with Pulsar Proxy + +With the above configuration, client and broker can do authentication using Kerberos. + +A client that connects to Pulsar Proxy is a little different. Pulsar Proxy (as a SASL Server in Kerberos) authenticates Client (as a SASL client in Kerberos) first; and then Pulsar broker authenticates Pulsar Proxy. + +Now in comparison with the above configuration between client and broker, we show you how to configure Pulsar Proxy as follows. + +### Create principal for Pulsar Proxy in Kerberos + +You need to add new principals for Pulsar Proxy comparing with the above configuration. If you already have principals for client and broker, you only need to add the proxy principal here. 
+ +```shell + +### add Principals for Pulsar Proxy +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey proxy/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{proxy-keytabname}.keytab proxy/{hostname}@{REALM}" +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" + +``` + +### Add a section in JAAS configuration file for Pulsar Proxy + +In comparison with the above configuration, add a new section for Pulsar Proxy in JAAS configuration file. + +Here is an example named `pulsar_jaas.conf`: + +``` + + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarProxy { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarproxy.keytab" + principal="proxy/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; + +``` + +### Proxy client configuration + +Pulsar client configuration is similar with client and broker configuration, except that you need to set `serverType` to `proxy` instead of `broker`, for the reason that you need to do the Kerberos authentication between client and proxy. 
+ + ```java + + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "proxy"); // ** here is the different ** + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + + ``` + +> The first two lines in the example above are hard coded, alternatively, you can set additional JVM parameters for JAAS and krb5 configuration file when you run the application like below: + +``` + +java -cp -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $APP-jar-with-dependencies.jar $CLASSNAME + +``` + +### Kerberos configuration for Pulsar proxy service + +In the `proxy.conf` file, set Kerberos related configuration. Here is an example: + +```shell + +## related to authenticate client. +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarProxy + +## related to be authenticated by broker +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarProxy", "serverType":"broker"} +forwardAuthorizationCredentials=true + +``` + +The first part relates to authenticating between client and Pulsar Proxy. In this phase, client works as SASL client, while Pulsar Proxy works as SASL server. + +The second part relates to authenticating between Pulsar Proxy and Pulsar Broker. 
In this phase, Pulsar Proxy works as SASL client, while Pulsar Broker works as SASL server. + +### Broker side configuration. + +The broker side configuration file is the same with the above `broker.conf`, you do not need special configuration for Pulsar Proxy. + +``` + +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker + +``` + +## Regarding authorization and role token + +For Kerberos authentication, we usually use the authenticated principal as the role token for Pulsar authorization. For more information of authorization in Pulsar, see [security authorization](security-authorization.md). + +If you enable 'authorizationEnabled', you need to set `superUserRoles` in `broker.conf` that corresponds to the name registered in kdc. + +For example: + +```bash + +superUserRoles=client/{clientIp}@EXAMPLE.COM + +``` + +## Regarding authentication between ZooKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Zookeeper. According to [ZooKeeper document](https://cwiki.apache.org/confluence/display/ZOOKEEPER/Client-Server+mutual+authentication), you need these settings in `conf/zookeeper.conf`: + +``` + +authProvider.1=org.apache.zookeeper.server.auth.SASLAuthenticationProvider +requireClientAuthScheme=sasl + +``` + +Enter the following commands to add a section of `Client` configurations in the file `pulsar_jaas.conf`, which Pulsar Broker uses: + +``` + + Client { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with ZooKeeper. 
+ +## Regarding authentication between BookKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Bookie. According to [BookKeeper document](http://bookkeeper.apache.org/docs/latest/security/sasl/), you need to add `bookkeeperClientAuthenticationPlugin` parameter in `broker.conf`: + +``` + +bookkeeperClientAuthenticationPlugin=org.apache.bookkeeper.sasl.SASLClientProviderFactory + +``` + +In this setting, `SASLClientProviderFactory` creates a BookKeeper SASL client in a Broker, and the Broker uses the created SASL client to authenticate with a Bookie node. + +Enter the following commands to add a section of `BookKeeper` configurations in the `pulsar_jaas.conf` that Pulsar Broker uses: + +``` + + BookKeeper { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with Bookie. diff --git a/site2/website/versioned_docs/version-2.8.x/security-oauth2.md b/site2/website/versioned_docs/version-2.8.x/security-oauth2.md new file mode 100644 index 0000000000000..46c87672dbbd0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-oauth2.md @@ -0,0 +1,232 @@ +--- +id: security-oauth2 +title: Client authentication using OAuth 2.0 access tokens +sidebar_label: "Authentication using OAuth 2.0 access tokens" +original_id: security-oauth2 +--- + +Pulsar supports authenticating clients using OAuth 2.0 access tokens. You can use OAuth 2.0 access tokens to identify a Pulsar client and associate the Pulsar client with some "principal" (or "role"), which is permitted to do some actions, such as publishing messages to a topic or consume messages from a topic. + +This module is used to support the Pulsar client authentication plugin for OAuth 2.0. 
After communicating with the Oauth 2.0 server, the Pulsar client gets an `access token` from the Oauth 2.0 server, and passes this `access token` to the Pulsar broker to do the authentication. The broker can use the `org.apache.pulsar.broker.authentication.AuthenticationProviderToken`. Or, you can add your own `AuthenticationProvider` to make it with this module. + +## Authentication provider configuration + +This library allows you to authenticate the Pulsar client by using an access token that is obtained from an OAuth 2.0 authorization service, which acts as a _token issuer_. + +### Authentication types + +The authentication type determines how to obtain an access token through an OAuth 2.0 authorization flow. + +:::note + +Currently, the Pulsar Java client only supports the `client_credentials` authentication type. + +::: + +#### Client credentials + +The following table lists parameters supported for the `client credentials` authentication type. + +| Parameter | Description | Example | Required or not | +| --- | --- | --- | --- | +| `type` | Oauth 2.0 authentication type. | `client_credentials` (default) | Optional | +| `issuerUrl` | URL of the authentication provider which allows the Pulsar client to obtain an access token | `https://accounts.google.com` | Required | +| `privateKey` | URL to a JSON credentials file | Support the following pattern formats:
  • `file:///path/to/file`
  • `file:/path/to/file`
  • `data:application/json;base64,`
  • | Required | +| `audience` | An OAuth 2.0 "resource server" identifier for the Pulsar cluster | `https://broker.example.com` | Optional | + +The credentials file contains service account credentials used with the client authentication type. The following shows an example of a credentials file `credentials_file.json`. + +```json + +{ + "type": "client_credentials", + "client_id": "d9ZyX97q1ef8Cr81WHVC4hFQ64vSlDK3", + "client_secret": "on1uJ...k6F6R", + "client_email": "1234567890-abcdefghijklmnopqrstuvwxyz@developer.gserviceaccount.com", + "issuer_url": "https://accounts.google.com" +} + +``` + +In the above example, the authentication type is set to `client_credentials` by default. And the fields "client_id" and "client_secret" are required. + +### Typical original OAuth2 request mapping + +The following shows a typical original OAuth2 request, which is used to obtain the access token from the OAuth2 server. + +```bash + +curl --request POST \ + --url https://dev-kt-aa9ne.us.auth0.com/oauth/token \ + --header 'content-type: application/json' \ + --data '{ + "client_id":"Xd23RHsUnvUlP7wchjNYOaIfazgeHd9x", + "client_secret":"rT7ps7WY8uhdVuBTKWZkttwLdQotmdEliaM5rLfmgNibvqziZ-g07ZH52N_poGAb", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "grant_type":"client_credentials"}' + +``` + +In the above example, the mapping relationship is shown as below. + +- The `issuerUrl` parameter in this plugin is mapped to `--url https://dev-kt-aa9ne.us.auth0.com`. +- The `privateKey` file parameter in this plugin should at least contains the `client_id` and `client_secret` fields. +- The `audience` parameter in this plugin is mapped to `"audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"`. This field is optional and only used by some identity providers in 2.8.2 and later versions. + +## Client Configuration + +You can use the OAuth2 authentication provider with the following Pulsar clients. 
+ +### Java + +You can use the factory method to configure authentication for Pulsar Java client. + +```java + +import org.apache.pulsar.client.impl.auth.oauth2.AuthenticationFactoryOAuth2; + +String issuerUrl = "https://dev-kt-aa9ne.us.auth0.com"; +String credentialsUrl = "file:///path/to/KeyFile.json"; +String audience = "https://dev-kt-aa9ne.us.auth0.com/api/v2/"; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactoryOAuth2.clientCredentials(issuerUrl, credentialsUrl, audience)) + .build(); + +``` + +In addition, you can also use the encoded parameters to configure authentication for Pulsar Java client. + +```java + +Authentication auth = AuthenticationFactory + .create(AuthenticationOAuth2.class.getName(), "{"type":"client_credentials","privateKey":"./key/path/..","issuerUrl":"...","audience":"..."}"); +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication(auth) + .build(); + +``` + +### C++ client + +The C++ client is similar to the Java client. You need to provide parameters of `issuerUrl`, `private_key` (the credentials file path), and the audience. + +```c++ + +#include + +pulsar::ClientConfiguration config; +std::string params = R"({ + "issuer_url": "https://dev-kt-aa9ne.us.auth0.com", + "private_key": "../../pulsar-broker/src/test/resources/authentication/token/cpp_credentials_file.json", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/"})"; + +config.setAuth(pulsar::AuthOauth2::create(params)); + +pulsar::Client client("pulsar://broker.example.com:6650/", config); + +``` + +### Go client + +To enable OAuth2 authentication in Go client, you need to configure OAuth2 authentication. +This example shows how to configure OAuth2 authentication in Go client. 
+ +```go + +oauth := pulsar.NewAuthenticationOAuth2(map[string]string{ + "type": "client_credentials", + "issuerUrl": "https://dev-kt-aa9ne.us.auth0.com", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "privateKey": "/path/to/privateKey", + "clientId": "0Xx...Yyxeny", + }) +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://my-cluster:6650", + Authentication: oauth, +}) + +``` + +### Python client + +To enable OAuth2 authentication in Python client, you need to configure OAuth2 authentication. +This example shows how to configure OAuth2 authentication in Python client. + +```python + +from pulsar import Client, AuthenticationOauth2 + +params = ''' +{ + "issuer_url": "https://dev-kt-aa9ne.us.auth0.com", + "private_key": "/path/to/privateKey", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/" +} +''' + +client = Client("pulsar://my-cluster:6650", authentication=AuthenticationOauth2(params)) + +``` + +## CLI configuration + +This section describes how to use Pulsar CLI tools to connect a cluster through OAuth2 authentication plugin. + +### pulsar-admin + +This example shows how to use pulsar-admin to connect to a cluster through OAuth2 authentication plugin. + +```shell script + +bin/pulsar-admin --admin-url https://streamnative.cloud:443 \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +tenants list + +``` + +Set the `admin-url` parameter to the Web service URL. A Web service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types). 
+ +### pulsar-client + +This example shows how to use pulsar-client to connect to a cluster through the OAuth2 authentication plugin. + +```shell script + +bin/pulsar-client \ +--url SERVICE_URL \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +produce test-topic -m "test-message" -n 10 + +``` + +Set the `url` parameter to the service URL of the cluster. A service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types). + +### pulsar-perf + +This example shows how to use pulsar-perf to connect to a cluster through the OAuth2 authentication plugin. + +```shell script + +bin/pulsar-perf produce --service-url pulsar+ssl://streamnative.cloud:6651 \ +--auth_plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +-r 1000 -s 1024 test-topic + +``` + +Set the `service-url` parameter to the service URL of the cluster. A service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types).
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/security-overview.md b/site2/website/versioned_docs/version-2.8.x/security-overview.md new file mode 100644 index 0000000000000..227dd5b5d4fc3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-overview.md @@ -0,0 +1,36 @@ +--- +id: security-overview +title: Pulsar security overview +sidebar_label: "Overview" +original_id: security-overview +--- + +As the central message bus for a business, Apache Pulsar is frequently used for storing mission-critical data. Therefore, enabling security features in Pulsar is crucial. + +By default, Pulsar configures no encryption, authentication, or authorization. Any client can communicate to Apache Pulsar via plain text service URLs. So we must ensure that Pulsar accessing via these plain text service URLs is restricted to trusted clients only. In such cases, you can use Network segmentation and/or authorization ACLs to restrict access to trusted IPs. If you use neither, the state of cluster is wide open and anyone can access the cluster. + +Pulsar supports a pluggable authentication mechanism. And Pulsar clients use this mechanism to authenticate with brokers and proxies. You can also configure Pulsar to support multiple authentication sources. + +The Pulsar broker validates the authentication credentials when a connection is established. After the initial connection is authenticated, the "principal" token is stored for authorization though the connection is not re-authenticated. The broker periodically checks the expiration status of every `ServerCnx` object. You can set the `authenticationRefreshCheckSeconds` on the broker to control the frequency to check the expiration status. By default, the `authenticationRefreshCheckSeconds` is set to 60s. When the authentication is expired, the broker forces to re-authenticate the connection. If the re-authentication fails, the broker disconnects the client. 
+ +The broker supports learning whether a particular client supports authentication refreshing. If a client supports authentication refreshing and the credential is expired, the authentication provider calls the `refreshAuthentication` method to initiate the refreshing process. If a client does not support authentication refreshing and the credential is expired, the broker disconnects the client. + +You should secure the service components in your Apache Pulsar deployment. + +## Role tokens + +In Pulsar, a *role* is a string, like `admin` or `app1`, which can represent a single client or multiple clients. You can use roles to control permission for clients to produce or consume from certain topics, administer the configuration for tenants, and so on. + +Apache Pulsar uses an [Authentication Provider](#authentication-providers) to establish the identity of a client and then assign a *role token* to that client. This role token is then used for [Authorization and ACLs](security-authorization.md) to determine what the client is authorized to do. 
+ +## Authentication providers + +Currently Pulsar supports the following authentication providers: + +- [TLS Authentication](security-tls-authentication.md) +- [Athenz](security-athenz.md) +- [Kerberos](security-kerberos.md) +- [JSON Web Token Authentication](security-jwt.md) +- [OAuth 2.0 authentication](security-oauth2.md) +- [HTTP basic authentication](security-basic-auth.md) + diff --git a/site2/website/versioned_docs/version-2.8.x/security-tls-authentication.md b/site2/website/versioned_docs/version-2.8.x/security-tls-authentication.md new file mode 100644 index 0000000000000..85d2240f41306 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-tls-authentication.md @@ -0,0 +1,222 @@ +--- +id: security-tls-authentication +title: Authentication using TLS +sidebar_label: "Authentication using TLS" +original_id: security-tls-authentication +--- + +## TLS authentication overview + +TLS authentication is an extension of [TLS transport encryption](security-tls-transport.md). Not only servers have keys and certs that the client uses to verify the identity of servers, clients also have keys and certs that the server uses to verify the identity of clients. You must have TLS transport encryption configured on your cluster before you can use TLS authentication. This guide assumes you already have TLS transport encryption configured. + +`Bouncy Castle Provider` provides TLS related cipher suites and algorithms in Pulsar. If you need [FIPS](https://www.bouncycastle.org/fips_faq.html) version of `Bouncy Castle Provider`, please reference [Bouncy Castle page](security-bouncy-castle.md). + +### Create client certificates + +Client certificates are generated using the certificate authority. Server certificates are also generated with the same certificate authority. + +The biggest difference between client certs and server certs is that the **common name** for the client certificate is the **role token** which that client is authenticated as. 
+ +To use client certificates, you need to set `tlsRequireTrustedClientCertOnConnect=true` at the broker side. For details, refer to [TLS broker configuration](security-tls-transport.md#configure-broker). + +First, you need to enter the following command to generate the key: + +```bash + +$ openssl genrsa -out admin.key.pem 2048 + +``` + +Similar to the broker, the client expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so you need to convert it by entering the following command: + +```bash + +$ openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in admin.key.pem -out admin.key-pk8.pem -nocrypt + +``` + +Next, enter the command below to generate the certificate request. When you are asked for a **common name**, enter the **role token** that you want this key pair to authenticate a client as. + +```bash + +$ openssl req -config openssl.cnf \ + -key admin.key.pem -new -sha256 -out admin.csr.pem + +``` + +:::note + +If openssl.cnf is not specified, read [Certificate authority](http://pulsar.apache.org/docs/en/security-tls-transport/#certificate-authority) to get the openssl.cnf. + +::: + +Then, enter the command below to sign the request with the certificate authority. Note that the client certs use the **usr_cert** extension, which allows the cert to be used for client authentication. + +```bash + +$ openssl ca -config openssl.cnf -extensions usr_cert \ + -days 1000 -notext -md sha256 \ + -in admin.csr.pem -out admin.cert.pem + +``` + +You can get a cert, `admin.cert.pem`, and a key, `admin.key-pk8.pem` from this command. With `ca.cert.pem`, clients can use this cert and this key to authenticate themselves to brokers and proxies as the role token ``admin``. + +:::note + +If the "unable to load CA private key" error occurs and the reason of this error is "No such file or directory: /etc/pki/CA/private/cakey.pem" in this step. 
Try the command below: + +```bash + +$ cd /etc/pki/tls/misc/CA +$ ./CA -newca + +``` + +to generate `cakey.pem` . + +::: + +## Enable TLS authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#broker-configuration): + +```properties + +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# operations and publish/consume from all topics +superUserRoles=admin + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters={"tlsCertFile":"/path/my-ca/admin.cert.pem","tlsKeyFile":"/path/my-ca/admin.key-pk8.pem"} +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem + +``` + +## Enable TLS authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#proxy-configuration): + +The proxy should have its own client key pair for connecting to brokers. You need to configure the role token for this key pair in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. 
+ +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters=tlsCertFile:/path/to/proxy.cert.pem,tlsKeyFile:/path/to/proxy.key-pk8.pem + +``` + +## Client configuration + +When you use TLS authentication, client connects via TLS transport. You need to configure the client to use ```https://``` and 8443 port for the web service URL, ```pulsar+ssl://``` and 6651 port for the broker service URL. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use TLS authentication with the CLI tools of Pulsar: + +```properties + +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +authParams=tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem + +``` + +### Java client + +```java + +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .authentication("org.apache.pulsar.client.impl.auth.AuthenticationTls", + "tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem") + .build(); + +``` + +### Python client + +```python + +from pulsar import Client, AuthenticationTLS + +auth = 
AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) + +``` + +### C++ client + +```c++ + +#include + +pulsar::ClientConfiguration config; +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/ca.cert.pem"); +config.setTlsAllowInsecureConnection(false); + +pulsar::AuthenticationPtr auth = pulsar::AuthTls::create("/path/to/my-role.cert.pem", + "/path/to/my-role.key-pk8.pem") +config.setAuth(auth); + +pulsar::Client client("pulsar+ssl://broker.example.com:6651/", config); + +``` + +### Node.js client + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const auth = new Pulsar.AuthenticationTls({ + certificatePath: '/path/to/my-role.cert.pem', + privateKeyPath: '/path/to/my-role.key-pk8.pem', + }); + + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + authentication: auth, + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + }); +})(); + +``` + +### C# client + +```c# + +var clientCertificate = new X509Certificate2("admin.pfx"); +var client = PulsarClient.Builder() + .AuthenticateUsingClientCertificate(clientCertificate) + .Build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/security-tls-keystore.md b/site2/website/versioned_docs/version-2.8.x/security-tls-keystore.md new file mode 100644 index 0000000000000..170bb6697bc23 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-tls-keystore.md @@ -0,0 +1,342 @@ +--- +id: security-tls-keystore +title: Using TLS with KeyStore configure +sidebar_label: "Using TLS with KeyStore configure" +original_id: security-tls-keystore +--- + +## Overview + +Apache Pulsar supports [TLS encryption](security-tls-transport.md) and [TLS authentication](security-tls-authentication.md) between clients and Apache Pulsar service. 
+By default it uses PEM format file configuration. This page describes how to use [KeyStore](https://en.wikipedia.org/wiki/Java_KeyStore) type configuration for TLS. + + +## TLS encryption with KeyStore configure + +### Generate TLS key and certificate + +The first step of deploying TLS is to generate the key and the certificate for each machine in the cluster. +You can use Java’s `keytool` utility to accomplish this task. We will generate the key into a temporary keystore +initially for the broker, so that we can export and sign it later with the CA. + +```shell + +keytool -keystore broker.keystore.jks -alias localhost -validity {validity} -genkeypair -keyalg RSA + +``` + +You need to specify two parameters in the above command: + +1. `keystore`: the keystore file that stores the certificate. The *keystore* file contains the private key of + the certificate; hence, it needs to be kept safely. +2. `validity`: the valid time of the certificate in days. + +> Ensure that common name (CN) matches exactly with the fully qualified domain name (FQDN) of the server. +The client compares the CN with the DNS domain name to ensure that it is indeed connecting to the desired server, not a malicious one. + +### Creating your own CA + +After the first step, each broker in the cluster has a public-private key pair, and a certificate to identify the machine. +The certificate, however, is unsigned, which means that an attacker can create such a certificate to pretend to be any machine. + +Therefore, it is important to prevent forged certificates by signing them for each machine in the cluster. +A `certificate authority (CA)` is responsible for signing certificates. A CA works like a government that issues passports — +the government stamps (signs) each passport so that the passport becomes difficult to forge. Other governments verify the stamps +to ensure the passport is authentic. 
Similarly, the CA signs the certificates, and the cryptography guarantees that a signed +certificate is computationally difficult to forge. Thus, as long as the CA is a genuine and trusted authority, the clients have +high assurance that they are connecting to the authentic machines. + +```shell + +openssl req -new -x509 -keyout ca-key -out ca-cert -days 365 + +``` + +The generated CA is simply a *public-private* key pair and certificate, and it is intended to sign other certificates. + +The next step is to add the generated CA to the clients' truststore so that the clients can trust this CA: + +```shell + +keytool -keystore client.truststore.jks -alias CARoot -import -file ca-cert + +``` + +NOTE: If you configure the brokers to require client authentication by setting `tlsRequireTrustedClientCertOnConnect` to `true` on the +broker configuration, then you must also provide a truststore for the brokers and it should have all the CA certificates that clients keys were signed by. + +```shell + +keytool -keystore broker.truststore.jks -alias CARoot -import -file ca-cert + +``` + +In contrast to the keystore, which stores each machine’s own identity, the truststore of a client stores all the certificates +that the client should trust. Importing a certificate into one’s truststore also means trusting all certificates that are signed +by that certificate. As the analogy above, trusting the government (CA) also means trusting all passports (certificates) that +it has issued. This attribute is called the chain of trust, and it is particularly useful when deploying TLS on a large BookKeeper cluster. +You can sign all certificates in the cluster with a single CA, and have all machines share the same truststore that trusts the CA. +That way all machines can authenticate all other machines. + + +### Signing the certificate + +The next step is to sign all certificates in the keystore with the CA we generated. 
First, you need to export the certificate from the keystore: + +```shell + +keytool -keystore broker.keystore.jks -alias localhost -certreq -file cert-file + +``` + +Then sign it with the CA: + +```shell + +openssl x509 -req -CA ca-cert -CAkey ca-key -in cert-file -out cert-signed -days {validity} -CAcreateserial -passin pass:{ca-password} + +``` + +Finally, you need to import both the certificate of the CA and the signed certificate into the keystore: + +```shell + +keytool -keystore broker.keystore.jks -alias CARoot -import -file ca-cert +keytool -keystore broker.keystore.jks -alias localhost -import -file cert-signed + +``` + +The definitions of the parameters are the following: + +1. `keystore`: the location of the keystore +2. `ca-cert`: the certificate of the CA +3. `ca-key`: the private key of the CA +4. `ca-password`: the passphrase of the CA +5. `cert-file`: the exported, unsigned certificate of the broker +6. `cert-signed`: the signed certificate of the broker + +### Configuring brokers + +Brokers enable TLS by providing valid `brokerServicePortTls` and `webServicePortTls` values, and also need to set `tlsEnabledWithKeyStore` to `true` to use the KeyStore type configuration. +Besides this, the KeyStore path, KeyStore password, TrustStore path, and TrustStore password need to be provided. +Since the broker creates an internal client/admin client to communicate with other brokers, you also need to provide the configuration for them, similar to how you configure the external client/admin client. +If `tlsRequireTrustedClientCertOnConnect` is `true`, the broker rejects the connection if the client certificate is not trusted. 
+ +The following TLS configs are needed on the broker side: + +```properties + +tlsEnabledWithKeyStore=true +# key store +tlsKeyStoreType=JKS +tlsKeyStore=/var/private/tls/broker.keystore.jks +tlsKeyStorePassword=brokerpw + +# trust store +tlsTrustStoreType=JKS +tlsTrustStore=/var/private/tls/broker.truststore.jks +tlsTrustStorePassword=brokerpw + +# internal client/admin-client config +brokerClientTlsEnabled=true +brokerClientTlsEnabledWithKeyStore=true +brokerClientTlsTrustStoreType=JKS +brokerClientTlsTrustStore=/var/private/tls/client.truststore.jks +brokerClientTlsTrustStorePassword=clientpw + +``` + +:::note + +It is important to restrict access to the store files via filesystem permissions. + +::: + +In 2.8.2 and later versions, if you have configured TLS on the broker, to disable non-TLS ports, you can set the values of the following configurations to empty. + +```conf +brokerServicePort= +webServicePort= +``` + +In this case, you need to set the following configurations. + +```conf +brokerClientTlsEnabled=true // Set this to true +brokerClientTlsEnabledWithKeyStore=true // Set this to true +brokerClientTlsTrustStore= // Set this to your desired value +brokerClientTlsTrustStorePassword= // Set this to your desired value +``` + +Optional settings that may worth consider: + +1. tlsClientAuthentication=false: Enable/Disable using TLS for authentication. This config when enabled will authenticate the other end + of the communication channel. It should be enabled on both brokers and clients for mutual TLS. +2. tlsCiphers=[TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256], A cipher suite is a named combination of authentication, encryption, MAC and key exchange + algorithm used to negotiate the security settings for a network connection using TLS network protocol. By default, + it is null. 
[OpenSSL Ciphers](https://www.openssl.org/docs/man1.0.2/apps/ciphers.html) + [JDK Ciphers](http://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#ciphersuites) +3. tlsProtocols=[TLSv1.3,TLSv1.2] (list out the TLS protocols that you are going to accept from clients). + By default, it is not set. + +### Configuring Clients + +This is similar to [TLS encryption configuration for clients with PEM type](security-tls-transport.md#client-configuration). +For a minimal configuration, you need to provide the TrustStore information. + +e.g. +1. for [Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools#pulsar-admin), [`pulsar-perf`](reference-cli-tools#pulsar-perf), and [`pulsar-client`](reference-cli-tools#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + + ```properties + + webServiceUrl=https://broker.example.com:8443/ + brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ + useKeyStoreTls=true + tlsTrustStoreType=JKS + tlsTrustStorePath=/var/private/tls/client.truststore.jks + tlsTrustStorePassword=clientpw + + ``` + +1. for java client + + ```java + + import org.apache.pulsar.client.api.PulsarClient; + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .build(); + + ``` + +1. 
for java admin client + +```java + + PulsarAdmin amdin = PulsarAdmin.builder().serviceHttpUrl("https://broker.example.com:8443") + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .build(); + +``` + +## TLS authentication with KeyStore configure + +This similar to [TLS authentication with PEM type](security-tls-authentication.md) + +### broker authentication config + +`broker.conf` + +```properties + +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# this should be the CN for one of client keystore. +superUserRoles=admin + +# Enable KeyStore type +tlsEnabledWithKeyStore=true +requireTrustedClientCertOnConnect=true + +# key store +tlsKeyStoreType=JKS +tlsKeyStore=/var/private/tls/broker.keystore.jks +tlsKeyStorePassword=brokerpw + +# trust store +tlsTrustStoreType=JKS +tlsTrustStore=/var/private/tls/broker.truststore.jks +tlsTrustStorePassword=brokerpw + +# internal client/admin-client config +brokerClientTlsEnabled=true +brokerClientTlsEnabledWithKeyStore=true +brokerClientTlsTrustStoreType=JKS +brokerClientTlsTrustStore=/var/private/tls/client.truststore.jks +brokerClientTlsTrustStorePassword=clientpw +# internal auth config +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls +brokerClientAuthenticationParameters={"keyStoreType":"JKS","keyStorePath":"/var/private/tls/client.keystore.jks","keyStorePassword":"clientpw"} +# currently websocket not support keystore type +webSocketServiceEnabled=false + +``` + +### client authentication configuring + +Besides the TLS encryption configuring. The main work is configuring the KeyStore, which contains a valid CN as client role, for client. + +e.g. +1. 
for [Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools#pulsar-admin), [`pulsar-perf`](reference-cli-tools#pulsar-perf), and [`pulsar-client`](reference-cli-tools#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + + ```properties + + webServiceUrl=https://broker.example.com:8443/ + brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ + useKeyStoreTls=true + tlsTrustStoreType=JKS + tlsTrustStorePath=/var/private/tls/client.truststore.jks + tlsTrustStorePassword=clientpw + authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls + authParams={"keyStoreType":"JKS","keyStorePath":"/path/to/keystorefile","keyStorePassword":"keystorepw"} + + ``` + +1. for java client + + ```java + + import org.apache.pulsar.client.api.PulsarClient; + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .authentication( + "org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls", + "keyStoreType:JKS,keyStorePath:/var/private/tls/client.keystore.jks,keyStorePassword:clientpw") + .build(); + + ``` + +1. for java admin client + + ```java + + PulsarAdmin amdin = PulsarAdmin.builder().serviceHttpUrl("https://broker.example.com:8443") + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .authentication( + "org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls", + "keyStoreType:JKS,keyStorePath:/var/private/tls/client.keystore.jks,keyStorePassword:clientpw") + .build(); + + ``` + +## Enabling TLS Logging + +You can enable TLS debug logging at the JVM level by starting the brokers and/or clients with `javax.net.debug` system property. 
For example: + +```shell + +-Djavax.net.debug=all + +``` + +You can find more details on this in [Oracle documentation](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/ReadDebug.html) on [debugging SSL/TLS connections](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/ReadDebug.html). diff --git a/site2/website/versioned_docs/version-2.8.x/security-tls-transport.md b/site2/website/versioned_docs/version-2.8.x/security-tls-transport.md new file mode 100644 index 0000000000000..2cad17a78c350 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-tls-transport.md @@ -0,0 +1,295 @@ +--- +id: security-tls-transport +title: Transport Encryption using TLS +sidebar_label: "Transport Encryption using TLS" +original_id: security-tls-transport +--- + +## TLS overview + +By default, Apache Pulsar clients communicate with the Apache Pulsar service in plain text. This means that all data is sent in the clear. You can use TLS to encrypt this traffic to protect the traffic from the snooping of a man-in-the-middle attacker. + +You can also configure TLS for both encryption and authentication. Use this guide to configure just TLS transport encryption and refer to [here](security-tls-authentication.md) for TLS authentication configuration. Alternatively, you can use [another authentication mechanism](security-athenz.md) on top of TLS transport encryption. + +> Note that enabling TLS may impact the performance due to encryption overhead. + +## TLS concepts + +TLS is a form of [public key cryptography](https://en.wikipedia.org/wiki/Public-key_cryptography). Encryption is performed using key pairs, each consisting of a public key and a private key. The public key encrypts the messages and the private key decrypts the messages. + +To use TLS transport encryption, you need two kinds of key pairs, **server key pairs** and a **certificate authority**. 
+ +You can use a third kind of key pair, **client key pairs**, for [client authentication](security-tls-authentication.md). + +You should store the **certificate authority** private key in a very secure location (a fully encrypted, disconnected, air-gapped computer). As for the certificate authority public key, the **trust cert**, you can freely share it. + +For both client and server key pairs, the administrator first generates a private key and a certificate request, then uses the certificate authority private key to sign the certificate request, and finally generates a certificate. This certificate is the public key for the server/client key pair. + +For TLS transport encryption, the clients can use the **trust cert** to verify that the server has a key pair that the certificate authority signed when the clients are talking to the server. A man-in-the-middle attacker does not have access to the certificate authority, so they couldn't create a server with such a key pair. + +For TLS authentication, the server uses the **trust cert** to verify that the client has a key pair that the certificate authority signed. The common name of the **client cert** is then used as the client's role token (see [Overview](security-overview.md)). + +`Bouncy Castle Provider` provides cipher suites and algorithms in Pulsar. If you need the [FIPS](https://www.bouncycastle.org/fips_faq.html) version of `Bouncy Castle Provider`, please refer to the [Bouncy Castle page](security-bouncy-castle.md). + +## Create TLS certificates + +Creating TLS certificates for Pulsar involves creating a [certificate authority](#certificate-authority) (CA), [server certificate](#server-certificate), and [client certificate](#client-certificate). + +Follow the guide below to set up a certificate authority. You can also refer to plenty of resources on the internet for more details. We recommend [this guide](https://jamielinux.com/docs/openssl-certificate-authority/index.html) for your detailed reference. 
+ +### Certificate authority + +1. Create the certificate for the CA. You can use the CA to sign both the broker and client certificates. This ensures that each party will trust the others. You should store the CA in a very secure location (ideally completely disconnected from networks, air-gapped, and fully encrypted). + +2. Enter the following command to create a directory for your CA, and place [this openssl configuration file](https://github.com/apache/pulsar/tree/master/site2/website/static/examples/openssl.cnf) in the directory. You may want to modify the default answers for company name and department in the configuration file. Export the location of the CA directory to the environment variable, CA_HOME. The configuration file uses this environment variable to find the rest of the files and directories that the CA needs. + +```bash + +mkdir my-ca +cd my-ca +wget https://raw.githubusercontent.com/apache/pulsar-site/main/site2/website/static/examples/openssl.cnf +export CA_HOME=$(pwd) + +``` + +3. Enter the commands below to create the necessary directories, keys and certs. + +```bash + +mkdir certs crl newcerts private +chmod 700 private/ +touch index.txt +echo 1000 > serial +openssl genrsa -aes256 -out private/ca.key.pem 4096 +chmod 400 private/ca.key.pem +openssl req -config openssl.cnf -key private/ca.key.pem \ + -new -x509 -days 7300 -sha256 -extensions v3_ca \ + -out certs/ca.cert.pem +chmod 444 certs/ca.cert.pem + +``` + +4. After you answer the question prompts, CA-related files are stored in the `./my-ca` directory. Within that directory: + +* `certs/ca.cert.pem` is the public certificate. This public certificate is meant to be distributed to all parties involved. +* `private/ca.key.pem` is the private key. You only need it when you are signing a new certificate for either broker or clients and you must safely guard this private key. 
+ +### Server certificate + +Once you have created a CA certificate, you can create certificate requests and sign them with the CA. + +The following commands ask you a few questions and then create the certificates. When you are asked for the common name, you should match the hostname of the broker. You can also use a wildcard to match a group of broker hostnames, for example, `*.broker.usw.example.com`. This ensures that multiple machines can reuse the same certificate. + +:::tip + +Sometimes matching the hostname is not possible or makes no sense, +such as when you create the brokers with random hostnames, or you +plan to connect to the hosts via their IP. In these cases, you +should configure the client to disable TLS hostname verification. For more +details, you can see [the host verification section in client configuration](#hostname-verification). + +::: + +1. Enter the command below to generate the key. + +```bash + +openssl genrsa -out broker.key.pem 2048 + +``` + +The broker expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so enter the following command to convert it. + +```bash + +openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in broker.key.pem -out broker.key-pk8.pem -nocrypt + +``` + +2. Enter the following command to generate the certificate request. + +```bash + +openssl req -config openssl.cnf \ + -key broker.key.pem -new -sha256 -out broker.csr.pem + +``` + +3. Sign it with the certificate authority by entering the command below. + +```bash + +openssl ca -config openssl.cnf -extensions server_cert \ + -days 1000 -notext -md sha256 \ + -in broker.csr.pem -out broker.cert.pem + +``` + +At this point, you have a cert, `broker.cert.pem`, and a key, `broker.key-pk8.pem`, which you can use along with `ca.cert.pem` to configure TLS transport encryption for your broker and proxy nodes. 
+ +## Configure broker + +To configure a Pulsar [broker](reference-terminology.md#broker) to use TLS transport encryption, you need to make some changes to `broker.conf`, which is located in the `conf` directory of your [Pulsar installation](getting-started-standalone.md). + +Add these values to the configuration file (substituting the appropriate certificate paths where necessary): + +```properties + +tlsEnabled=true +tlsRequireTrustedClientCertOnConnect=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +> You can find a full list of parameters available in the `conf/broker.conf` file, +> as well as the default values for those parameters, in [Broker Configuration](reference-configuration.md#broker) +> +### TLS Protocol Version and Cipher + +You can configure the broker (and proxy) to require specific TLS protocol versions and ciphers for TLS negotiation. You can use the TLS protocol versions and ciphers to stop clients from requesting downgraded TLS protocol versions or ciphers that may have weaknesses. + +Both the TLS protocol versions and cipher properties can take multiple values, separated by commas. The possible values for protocol version and ciphers depend on the TLS provider that you are using. Pulsar uses OpenSSL if it is available; otherwise, Pulsar falls back to the JDK implementation. + +```properties + +tlsProtocols=TLSv1.3,TLSv1.2 +tlsCiphers=TLS_DH_RSA_WITH_AES_256_GCM_SHA384,TLS_DH_RSA_WITH_AES_256_CBC_SHA + +``` + +OpenSSL currently supports ```TLSv1.1```, ```TLSv1.2``` and ```TLSv1.3``` for the protocol version. You can acquire a list of supported ciphers from the openssl ciphers command, for example, ```openssl ciphers -tls1_3```. 
+ +For JDK 11, you can obtain a list of supported values from the documentation: +- [TLS protocol](https://docs.oracle.com/en/java/javase/11/security/oracle-providers.html#GUID-7093246A-31A3-4304-AC5F-5FB6400405E2__SUNJSSEPROVIDERPROTOCOLPARAMETERS-BBF75009) +- [Ciphers](https://docs.oracle.com/en/java/javase/11/security/oracle-providers.html#GUID-7093246A-31A3-4304-AC5F-5FB6400405E2__SUNJSSE_CIPHER_SUITES) + +## Proxy Configuration + +Proxies need to configure TLS in two directions, for clients connecting to the proxy, and for the proxy connecting to brokers. + +```properties + +# For clients connecting to the proxy +tlsEnabledInProxy=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +# For the proxy to connect to brokers +tlsEnabledWithBroker=true +brokerClientTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +## Client configuration + +When you enable the TLS transport encryption, you need to configure the client to use ```https://``` and port 8443 for the web service URL, and ```pulsar+ssl://``` and port 6651 for the broker service URL. + +As the server certificate that you generated above does not belong to any of the default trust chains, you also need to either specify the path to the **trust cert** (recommended), or tell the client to allow untrusted server certs. + +### Hostname verification + +Hostname verification is a TLS security feature whereby a client can refuse to connect to a server if the "CommonName" does not match the hostname to which the client is connecting. By default, Pulsar clients disable hostname verification, as it requires that each broker has a DNS record and a unique cert. + +Moreover, as the administrator has full control of the certificate authority, a bad actor is unlikely to be able to pull off a man-in-the-middle attack. "allowInsecureConnection" allows the client to connect to servers whose cert has not been signed by an approved CA. 
The client disables "allowInsecureConnection" by default, and you should always disable "allowInsecureConnection" in production environments. As long as you disable "allowInsecureConnection", a man-in-the-middle attack requires that the attacker has access to the CA. + +One scenario where you may want to enable hostname verification is where you have multiple proxy nodes behind a VIP, and the VIP has a DNS record, for example, pulsar.mycompany.com. In this case, you can generate a TLS cert with pulsar.mycompany.com as the "CommonName," and then enable hostname verification on the client. + +The examples below show that hostname verification is disabled for the CLI tools/Java/Python/C++/Node.js/C# clients by default. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools.md#pulsar-admin), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ +You need to add the following parameters to that file to use TLS transport with the CLI tools of Pulsar: + +```properties + +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +tlsEnableHostnameVerification=false + +``` + +#### Java client + +```java + +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .enableTlsHostnameVerification(false) // false by default, in any case + .allowTlsInsecureConnection(false) // false by default, in any case + .build(); + +``` + +#### Python client + +```python + +from pulsar import Client + +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_hostname_verification=False, + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False) # defaults to False from v2.2.0 onwards + +``` + +#### C++ client + +```c++ + +#include + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); // shouldn't be needed soon +config.setTlsTrustCertsFilePath(caPath); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create(clientPublicKeyPath, clientPrivateKeyPath)); +config.setValidateHostName(false); + +``` + +#### Node.js client + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + useTls: true, + tlsValidateHostname: false, + tlsAllowInsecureConnection: false, + }); +})(); + +``` + +#### C# client + +```c# + +var certificate = new X509Certificate2("ca.cert.pem"); +var client = PulsarClient.Builder() + .TrustedCertificateAuthority(certificate) //If the CA is not trusted on the host, 
you can add it explicitly. + .VerifyCertificateAuthority(true) //Default is 'true' + .VerifyCertificateName(false) //Default is 'false' + .Build(); + +``` + +> Note that `VerifyCertificateName` refers to the configuration of hostname verification in the C# client. diff --git a/site2/website/versioned_docs/version-2.8.x/security-token-admin.md b/site2/website/versioned_docs/version-2.8.x/security-token-admin.md new file mode 100644 index 0000000000000..a265f6320d28f --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/security-token-admin.md @@ -0,0 +1,183 @@ +--- +id: security-token-admin +title: Token authentication admin +sidebar_label: "Token authentication admin" +original_id: security-token-admin +--- + +## Token Authentication Overview + +Pulsar supports authenticating clients using security tokens that are based on [JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). + +Tokens are used to identify a Pulsar client and associate with some "principal" (or "role") which +will be then granted permissions to do some actions (eg: publish or consume from a topic). + +A user will typically be given a token string by an administrator (or some automated service). + +The compact representation of a signed JWT is a string that looks like: + +``` + + eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +Application will specify the token when creating the client instance. An alternative is to pass +a "token supplier", that is to say a function that returns the token when the client library +will need one. + +> #### Always use TLS transport encryption +> Sending a token is equivalent to sending a password over the wire. It is strongly recommended to +> always use TLS encryption when talking to the Pulsar service. 
See +> [Transport Encryption using TLS](security-tls-transport.md) + +## Secret vs Public/Private keys + +JWT supports two different kinds of keys in order to generate and validate the tokens: + + * Symmetric : + - there is a single ***Secret*** key that is used both to generate and validate + * Asymmetric: there is a pair of keys. + - ***Private*** key is used to generate tokens + - ***Public*** key is used to validate tokens + +### Secret key + +When using a secret key, the administrator will create the key and +use it to generate the client tokens. This key will also be configured on +the brokers to allow them to validate the clients. + +#### Creating a secret key + +> Output file will be generated in the root of your pulsar installation directory. You can also provide an absolute path for the output file. + +```shell + +$ bin/pulsar tokens create-secret-key --output my-secret.key + +``` + +To generate a base64-encoded secret key: + +```shell + +$ bin/pulsar tokens create-secret-key --output /opt/my-secret.key --base64 + +``` + +### Public/Private keys + +With public/private, we need to create a pair of keys. Pulsar supports all algorithms supported by the Java JWT library shown [here](https://github.com/jwtk/jjwt#signature-algorithms-keys) + +#### Creating a key pair + +> Output file will be generated in the root of your pulsar installation directory. You can also provide an absolute path for the output file. + +```shell + +$ bin/pulsar tokens create-key-pair --output-private-key my-private.key --output-public-key my-public.key + +``` + + * `my-private.key` will be stored in a safe location and only used by the administrator to generate + new tokens. + * `my-public.key` will be distributed to all Pulsar brokers. This file can be publicly shared without + any security concern. + +## Generating tokens + +A token is the credential associated with a user. The association is done through the "principal", +or "role". 
In the case of JWT tokens, this field is typically referred to as **subject**, though +it's exactly the same concept. + +The generated token is then required to have a **subject** field set. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user + +``` + +This will print the token string on stdout. + +Similarly, one can create a token by passing the "private" key: + +```shell + +$ bin/pulsar tokens create --private-key file:///path/to/my-private.key \ + --subject test-user + +``` + +Finally, a token can also be created with a pre-defined TTL. After that time, +the token will be automatically invalidated. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user \ + --expiry-time 1y + +``` + +## Authorization + +The token itself doesn't have any permission associated. That will be determined by the +authorization engine. Once the token is created, one can grant permission for this token to do certain +actions. For example: + +```shell + +$ bin/pulsar-admin namespaces grant-permission my-tenant/my-namespace \ + --role test-user \ + --actions produce,consume + +``` + +## Enabling Token Authentication ... + +### ... on Brokers + +To configure brokers to authenticate clients, put the following in `broker.conf`: + +```properties + +# Configuration to enable authentication and authorization +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken + +# If using secret key (Note: key files must be DER-encoded) +tokenSecretKey=file:///path/to/secret.key +# The key can also be passed inline: +# tokenSecretKey=data:;base64,FLFyW0oLJ2Fi22KKCm21J18mbAdztfSHN/lAT5ucEKU= + +# If using public/private (Note: key files must be DER-encoded) +# tokenPublicKey=file:///path/to/public.key + +``` + +### ... 
on Proxies + +To configure proxies to authenticate clients, put the following in `proxy.conf`: + +The proxy will have its own token used when talking to brokers. The role token for this +key pair should be configured in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. + +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken +tokenSecretKey=file:///path/to/secret.key + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Or, alternatively, read token from file +# brokerClientAuthenticationParameters=file:///path/to/proxy-token.txt + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/sql-deployment-configurations.md b/site2/website/versioned_docs/version-2.8.x/sql-deployment-configurations.md new file mode 100644 index 0000000000000..eef65e40f1402 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/sql-deployment-configurations.md @@ -0,0 +1,208 @@ +--- +id: sql-deployment-configurations +title: Pulsar SQL configuration and deployment +sidebar_label: "Configuration and deployment" +original_id: sql-deployment-configurations +--- + +You can configure Presto Pulsar connector and deploy a cluster with the following instruction. + +## Configure Presto Pulsar Connector +You can configure Presto Pulsar Connector in the `${project.root}/conf/presto/catalog/pulsar.properties` properties file. The configuration for the connector and the default values are as follows. 
+ +```properties + +# name of the connector to be displayed in the catalog +connector.name=pulsar + +# the url of Pulsar broker service +pulsar.web-service-url=http://localhost:8080 + +# URI of Zookeeper cluster +pulsar.zookeeper-uri=localhost:2181 + +# minimum number of entries to read at a single time +pulsar.entry-read-batch-size=100 + +# default number of splits to use per query +pulsar.target-num-splits=4 + +# max size of one batch message (default value is 5MB) +pulsar.max-message-size=5242880 + +``` + +:::note + +`pulsar.max-message-size` is only available in 2.8.2 and later versions. + +::: + +You can connect Presto to a Pulsar cluster with multiple hosts. To configure multiple hosts for brokers, add multiple URLs to `pulsar.web-service-url`. To configure multiple hosts for ZooKeeper, add multiple URIs to `pulsar.zookeeper-uri`. The following is an example. + +``` + +pulsar.web-service-url=http://localhost:8080,localhost:8081,localhost:8082 +pulsar.zookeeper-uri=localhost1,localhost2:2181 + +``` + +**Note: by default, Pulsar SQL does not get the last message in a topic**. It is by design and controlled by settings. By default, BookKeeper LAC only advances when subsequent entries are added. If there is no subsequent entry added, the last written entry is not visible to readers until the ledger is closed. This is not a problem for Pulsar which uses managed ledger, but Pulsar SQL directly reads from BookKeeper ledger. + +If you want to get the last message in a topic, set the following configurations: + +1. For the broker configuration, set `bookkeeperExplicitLacIntervalInMills` > 0 in `broker.conf` or `standalone.conf`. + +2. For the Presto configuration, set `pulsar.bookkeeper-explicit-interval` > 0 and `pulsar.bookkeeper-use-v2-protocol=false`. + +However, using BookKeeper V3 protocol introduces additional GC overhead to BK as it uses Protobuf. 
+ +## Query data from existing Presto clusters + +If you already have a Presto cluster, you can copy the Presto Pulsar connector plugin to your existing cluster. Download the archived plugin package with the following command. + +```bash + +$ wget pulsar:binary_release_url + +``` + +## Deploy a new cluster + +Since Pulsar SQL is powered by [Trino (formerly Presto SQL)](https://trino.io), the configuration for deployment is the same for the Pulsar SQL worker. + +:::note + +For how to set up a standalone single node environment, refer to [Query data](sql-getting-started.md). + +::: + +You can use the same CLI args as the Presto launcher. + +```bash + +$ ./bin/pulsar sql-worker --help +Usage: launcher [options] command + +Commands: run, start, stop, restart, kill, status + +Options: + -h, --help show this help message and exit + -v, --verbose Run verbosely + --etc-dir=DIR Defaults to INSTALL_PATH/etc + --launcher-config=FILE + Defaults to INSTALL_PATH/bin/launcher.properties + --node-config=FILE Defaults to ETC_DIR/node.properties + --jvm-config=FILE Defaults to ETC_DIR/jvm.config + --config=FILE Defaults to ETC_DIR/config.properties + --log-levels-file=FILE + Defaults to ETC_DIR/log.properties + --data-dir=DIR Defaults to INSTALL_PATH + --pid-file=FILE Defaults to DATA_DIR/var/run/launcher.pid + --launcher-log-file=FILE + Defaults to DATA_DIR/var/log/launcher.log (only in + daemon mode) + --server-log-file=FILE + Defaults to DATA_DIR/var/log/server.log (only in + daemon mode) + -D NAME=VALUE Set a Java system property + +``` + +The default configuration for the cluster is located in `${project.root}/conf/presto`. You can customize your deployment by modifying the default configuration. + +You can set the worker to read from a different configuration directory, or set a different directory to write data. + +```bash + +$ ./bin/pulsar sql-worker run --etc-dir /tmp/incubator-pulsar/conf/presto --data-dir /tmp/presto-1 + +``` + +You can start the worker as daemon process. 
+ +```bash + +$ ./bin/pulsar sql-worker start + +``` + +### Deploy a cluster on multiple nodes + +You can deploy a Pulsar SQL cluster or Presto cluster on multiple nodes. The following example shows how to deploy a cluster on three-node cluster. + +1. Copy the Pulsar binary distribution to three nodes. + +The first node runs as Presto coordinator. The minimal configuration requirement in the `${project.root}/conf/presto/config.properties` file is as follows. + +```properties + +coordinator=true +node-scheduler.include-coordinator=true +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery-server.enabled=true +discovery.uri= + +``` + +The other two nodes serve as worker nodes, you can use the following configuration for worker nodes. + +```properties + +coordinator=false +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery.uri= + +``` + +2. Modify `pulsar.web-service-url` and `pulsar.zookeeper-uri` configuration in the `${project.root}/conf/presto/catalog/pulsar.properties` file accordingly for the three nodes. + +3. Start the coordinator node. + +``` + +$ ./bin/pulsar sql-worker run + +``` + +4. Start worker nodes. + +``` + +$ ./bin/pulsar sql-worker run + +``` + +5. Start the SQL CLI and check the status of your cluster. + +```bash + +$ ./bin/pulsar sql --server + +``` + +6. Check the status of your nodes. + +```bash + +presto> SELECT * FROM system.runtime.nodes; + node_id | http_uri | node_version | coordinator | state +---------+-------------------------+--------------+-------------+-------- + 1 | http://192.168.2.1:8081 | testversion | true | active + 3 | http://192.168.2.2:8081 | testversion | false | active + 2 | http://192.168.2.3:8081 | testversion | false | active + +``` + +For more information about deployment in Presto, refer to [Presto deployment](https://trino.io/docs/current/installation/deployment.html). 
+ +:::note + +The broker does not advance LAC, so when Pulsar SQL bypasses the broker to query data, it can only read entries up to the LAC that all the bookies learned. You can enable periodic LAC writes on the broker by setting "bookkeeperExplicitLacIntervalInMills" in the broker.conf. + +::: + diff --git a/site2/website/versioned_docs/version-2.8.x/sql-getting-started.md b/site2/website/versioned_docs/version-2.8.x/sql-getting-started.md new file mode 100644 index 0000000000000..8a5cd7199b365 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/sql-getting-started.md @@ -0,0 +1,187 @@ +--- +id: sql-getting-started +title: Query data with Pulsar SQL +sidebar_label: "Query data" +original_id: sql-getting-started +--- + +Before querying data in Pulsar, you need to install Pulsar and built-in connectors. + +## Requirements +1. Install [Pulsar](getting-started-standalone.md#install-pulsar-standalone). +2. Install Pulsar [built-in connectors](getting-started-standalone.md#install-builtin-connectors-optional). + +## Query data in Pulsar +To query data in Pulsar with Pulsar SQL, complete the following steps. + +1. Start a Pulsar standalone cluster. + +```bash + +./bin/pulsar standalone + +``` + +2. Start a Pulsar SQL worker. + +```bash + +./bin/pulsar sql-worker run + +``` + +3. After initializing Pulsar standalone cluster and the SQL worker, run SQL CLI. + +```bash + +./bin/pulsar sql + +``` + +4. Test with SQL commands. 
+ +```bash + +presto> show catalogs; + Catalog +--------- + pulsar + system +(2 rows) + +Query 20180829_211752_00004_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + + +presto> show schemas in pulsar; + Schema +----------------------- + information_schema + public/default + public/functions + sample/standalone/ns1 +(4 rows) + +Query 20180829_211818_00005_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [4 rows, 89B] [21 rows/s, 471B/s] + + +presto> show tables in pulsar."public/default"; + Table +------- +(0 rows) + +Query 20180829_211839_00006_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + +``` + +Since there is no data in Pulsar, no records is returned. + +5. Start the built-in connector _DataGeneratorSource_ and ingest some mock data. + +```bash + +./bin/pulsar-admin sources create --name generator --destinationTopicName generator_test --source-type data-generator + +``` + +And then you can query a topic in the namespace "public/default". + +```bash + +presto> show tables in pulsar."public/default"; + Table +---------------- + generator_test +(1 row) + +Query 20180829_213202_00000_csyeu, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:02 [1 rows, 38B] [0 rows/s, 17B/s] + +``` + +You can now query the data within the topic "generator_test". 
+ +```bash + +presto> select * from pulsar."public/default".generator_test; + + firstname | middlename | lastname | email | username | password | telephonenumber | age | companyemail | nationalidentitycardnumber | +-------------+-------------+-------------+----------------------------------+--------------+----------+-----------------+-----+-----------------------------------------------+----------------------------+ + Genesis | Katherine | Wiley | genesis.wiley@gmail.com | genesisw | y9D2dtU3 | 959-197-1860 | 71 | genesis.wiley@interdemconsulting.eu | 880-58-9247 | + Brayden | | Stanton | brayden.stanton@yahoo.com | braydens | ZnjmhXik | 220-027-867 | 81 | brayden.stanton@supermemo.eu | 604-60-7069 | + Benjamin | Julian | Velasquez | benjamin.velasquez@yahoo.com | benjaminv | 8Bc7m3eb | 298-377-0062 | 21 | benjamin.velasquez@hostesltd.biz | 213-32-5882 | + Michael | Thomas | Donovan | donovan@mail.com | michaeld | OqBm9MLs | 078-134-4685 | 55 | michael.donovan@memortech.eu | 443-30-3442 | + Brooklyn | Avery | Roach | brooklynroach@yahoo.com | broach | IxtBLafO | 387-786-2998 | 68 | brooklyn.roach@warst.biz | 085-88-3973 | + Skylar | | Bradshaw | skylarbradshaw@yahoo.com | skylarb | p6eC6cKy | 210-872-608 | 96 | skylar.bradshaw@flyhigh.eu | 453-46-0334 | +. +. +. + +``` + +You can query the mock data. + +## Query your own data +If you want to query your own data, you need to ingest your own data first. You can write a simple producer and write custom defined data to Pulsar. The following is an example. 
+ +```java + +public class TestProducer { + + public static class Foo { + private int field1 = 1; + private String field2; + private long field3; + + public Foo() { + } + + public int getField1() { + return field1; + } + + public void setField1(int field1) { + this.field1 = field1; + } + + public String getField2() { + return field2; + } + + public void setField2(String field2) { + this.field2 = field2; + } + + public long getField3() { + return field3; + } + + public void setField3(long field3) { + this.field3 = field3; + } + } + + public static void main(String[] args) throws Exception { + PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); + Producer producer = pulsarClient.newProducer(AvroSchema.of(Foo.class)).topic("test_topic").create(); + + for (int i = 0; i < 1000; i++) { + Foo foo = new Foo(); + foo.setField1(i); + foo.setField2("foo" + i); + foo.setField3(System.currentTimeMillis()); + producer.newMessage().value(foo).send(); + } + producer.close(); + pulsarClient.close(); + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/sql-overview.md b/site2/website/versioned_docs/version-2.8.x/sql-overview.md new file mode 100644 index 0000000000000..8ba19d053003d --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/sql-overview.md @@ -0,0 +1,18 @@ +--- +id: sql-overview +title: Pulsar SQL Overview +sidebar_label: "Overview" +original_id: sql-overview +--- + +Apache Pulsar is used to store streams of event data, and the event data is structured with predefined fields. With the implementation of the [Schema Registry](schema-get-started.md), you can store structured data in Pulsar and query the data by using [Trino (formerly Presto SQL)](https://trino.io/). + +As the core of Pulsar SQL, Presto Pulsar connector enables Presto workers within a Presto cluster to query data from Pulsar. 
+ +![The Pulsar consumer and reader interfaces](/assets/pulsar-sql-arch-2.png) + +The query performance is efficient and highly scalable, because Pulsar adopts [two-level, segment-based architecture](concepts-architecture-overview.md#apache-bookkeeper). + +Topics in Pulsar are stored as segments in [Apache BookKeeper](https://bookkeeper.apache.org/). Each topic segment is replicated to some BookKeeper nodes, which enables concurrent reads and high read throughput. You can configure the number of BookKeeper nodes, and the default number is `3`. In Presto Pulsar connector, data is read directly from BookKeeper, so Presto workers can read concurrently from a horizontally scalable number of BookKeeper nodes. + +![The Pulsar consumer and reader interfaces](/assets/pulsar-sql-arch-1.png) diff --git a/site2/website/versioned_docs/version-2.8.x/sql-rest-api.md b/site2/website/versioned_docs/version-2.8.x/sql-rest-api.md new file mode 100644 index 0000000000000..c92fd62f7d870 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/sql-rest-api.md @@ -0,0 +1,192 @@ +--- +id: sql-rest-api +title: Pulsar SQL REST APIs +sidebar_label: "REST APIs" +original_id: sql-rest-api +--- + +This section lists resources that make up the Presto REST API v1. + +## Request for Presto services + +All requests for Presto services should use version v1 of the Presto REST API. + +To request services, use explicit URL `http://presto.service:8081/v1`. You need to update `presto.service:8081` with your real Presto address before sending requests. + +`POST` requests require the `X-Presto-User` header. If you use authentication, you must use the same `username` that is specified in the authentication configuration. If you do not use authentication, you can specify anything for `username`. + +```properties + +X-Presto-User: username + +``` + +For more information about headers, refer to [PrestoHeaders](https://github.com/trinodb/trino). + +## Schema + +You can use a statement in the HTTP body. 
All data is received as JSON document that might contain a `nextUri` link. If the received JSON document contains a `nextUri` link, the request continues with the `nextUri` link until the received data does not contain a `nextUri` link. If no error is returned, the query completes successfully. If an `error` field is displayed in `stats`, it means the query fails. + +The following is an example of `show catalogs`. The query continues until the received JSON document does not contain a `nextUri` link. Since no `error` is displayed in `stats`, it means that the query completes successfully. + +```powershell + +➜ ~ curl --header "X-Presto-User: test-user" --request POST --data 'show catalogs' http://localhost:8081/v1/statement +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "stats" : { + "queued" : true, + "nodes" : 0, + "userTimeMillis" : 0, + "cpuTimeMillis" : 0, + "wallTimeMillis" : 0, + "processedBytes" : 0, + "processedRows" : 0, + "runningSplits" : 0, + "queuedTimeMillis" : 0, + "queuedSplits" : 0, + "completedSplits" : 0, + "totalSplits" : 0, + "scheduled" : false, + "peakMemoryBytes" : 0, + "state" : "QUEUED", + "elapsedTimeMillis" : 0 + }, + "id" : "20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1" +} + +➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1 +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2", + "id" : "20191113_033653_00006_dg6hb", + "stats" : { + "state" : "PLANNING", + "totalSplits" : 0, + "queued" : false, + "userTimeMillis" : 0, + "completedSplits" : 0, + "scheduled" : false, + "wallTimeMillis" : 0, + "runningSplits" : 0, + "queuedSplits" : 0, + "cpuTimeMillis" : 0, + "processedRows" : 0, + "processedBytes" : 0, + "nodes" : 0, + "queuedTimeMillis" : 1, + "elapsedTimeMillis" : 2, + "peakMemoryBytes" : 0 + } +} + 
+➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2 +{ + "id" : "20191113_033653_00006_dg6hb", + "data" : [ + [ + "pulsar" + ], + [ + "system" + ] + ], + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "columns" : [ + { + "typeSignature" : { + "rawType" : "varchar", + "arguments" : [ + { + "kind" : "LONG_LITERAL", + "value" : 6 + } + ], + "literalArguments" : [], + "typeArguments" : [] + }, + "name" : "Catalog", + "type" : "varchar(6)" + } + ], + "stats" : { + "wallTimeMillis" : 104, + "scheduled" : true, + "userTimeMillis" : 14, + "progressPercentage" : 100, + "totalSplits" : 19, + "nodes" : 1, + "cpuTimeMillis" : 16, + "queued" : false, + "queuedTimeMillis" : 1, + "state" : "FINISHED", + "peakMemoryBytes" : 0, + "elapsedTimeMillis" : 111, + "processedBytes" : 0, + "processedRows" : 0, + "queuedSplits" : 0, + "rootStage" : { + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1, + "subStages" : [ + { + "cpuTimeMillis" : 14, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 17, + "subStages" : [ + { + "wallTimeMillis" : 7, + "subStages" : [], + "stageId" : "2", + "done" : true, + "nodes" : 1, + "totalSplits" : 1, + "processedBytes" : 22, + "processedRows" : 2, + "queuedSplits" : 0, + "userTimeMillis" : 1, + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1 + } + ], + "wallTimeMillis" : 92, + "nodes" : 1, + "done" : true, + "stageId" : "1", + "userTimeMillis" : 12, + "processedRows" : 2, + "processedBytes" : 51, + "queuedSplits" : 0, + "totalSplits" : 17 + } + ], + "wallTimeMillis" : 5, + "done" : true, + "nodes" : 1, + "stageId" : "0", + "userTimeMillis" : 1, + "processedRows" : 2, + "processedBytes" : 22, + "totalSplits" : 1, + "queuedSplits" : 0 + }, + "runningSplits" : 0, + "completedSplits" : 19 + } +} + +``` + +:::note + +Since the response data is not in sync with the query state from the perspective of 
clients, you cannot rely on the response data to determine whether the query completes. + +::: + +For more information about Presto REST API, refer to [Presto HTTP Protocol](https://github.com/prestosql/presto/wiki/HTTP-Protocol). diff --git a/site2/website/versioned_docs/version-2.8.x/standalone-docker.md b/site2/website/versioned_docs/version-2.8.x/standalone-docker.md new file mode 100644 index 0000000000000..1afb9bfd3f25f --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/standalone-docker.md @@ -0,0 +1,213 @@ +--- +id: standalone-docker +title: Set up a standalone Pulsar in Docker +sidebar_label: "Run Pulsar in Docker" +original_id: standalone-docker +--- + +For local development and testing, you can run Pulsar in standalone mode on your own machine within a Docker container. + +If you have not installed Docker, download the [Community edition](https://www.docker.com/community-edition) and follow the instructions for your OS. + +## Start Pulsar in Docker + +* For MacOS, Linux, and Windows: + + ```shell + + $ docker run -it -p 6650:6650 -p 8080:8080 --mount source=pulsardata,target=/pulsar/data --mount source=pulsarconf,target=/pulsar/conf apachepulsar/pulsar:@pulsar:version@ bin/pulsar standalone + + ``` + +A few things to note about this command: + * The data, metadata, and configuration are persisted on Docker volumes in order to not start "fresh" every +time the container is restarted. For details on the volumes you can use `docker volume inspect ` + * For Docker on Windows make sure to configure it to use Linux containers + +If you start Pulsar successfully, you will see `INFO`-level log messages like this: + +``` + +08:18:30.970 [main] INFO org.apache.pulsar.broker.web.WebService - HTTP Service started at http://0.0.0.0:8080 +... 
+07:53:37.322 [main] INFO org.apache.pulsar.broker.PulsarService - messaging service is ready, bootstrap service port = 8080, broker url= pulsar://localhost:6650, cluster=standalone, configs=org.apache.pulsar.broker.ServiceConfiguration@98b63c1 +... + +``` + +:::tip + +When you start a local standalone cluster, a `public/default` +namespace is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. +For more information, see [Topics](concepts-messaging.md#topics). + +::: + + +## Use Pulsar in Docker + +Pulsar offers client libraries for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). If you're running a local standalone cluster, you can +use one of these root URLs to interact with your cluster: + +* `pulsar://localhost:6650` +* `http://localhost:8080` + +The following example helps you get started with Pulsar quickly by using the +[Python client API](client-libraries-python.md). 
+ +Install the Pulsar Python client library directly from [PyPI](https://pypi.org/project/pulsar-client/): + +```shell + +$ pip install pulsar-client + +``` + +### Consume a message + +Create a consumer and subscribe to the topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() + +``` + +### Produce a message + +Now start a producer to send some test messages: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('hello-pulsar-%d' % i).encode('utf-8')) + +client.close() + +``` + +## Get the topic statistics + +In Pulsar, you can use REST, Java, or command-line tools to control every aspect of the system. +For details on APIs, refer to [Admin API Overview](admin-api-overview.md). 
+ +In the simplest example, you can use curl to probe the stats for a particular topic: + +```shell + +$ curl http://localhost:8080/admin/v2/persistent/public/default/my-topic/stats | python -m json.tool + +``` + +The output is something like this: + +```json + +{ + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesInCounter": 7097, + "msgInCounter": 143, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "averageMsgSize": 0.0, + "msgChunkPublished": false, + "storageSize": 7097, + "backlogSize": 0, + "offloadedStorageSize": 0, + "publishers": [ + { + "accessMode": "Shared", + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "averageMsgSize": 0.0, + "chunkedMessageRate": 0.0, + "producerId": 0, + "metadata": {}, + "address": "/127.0.0.1:35604", + "connectedSince": "2021-07-04T09:05:43.04788Z", + "clientVersion": "2.8.0", + "producerName": "standalone-2-5" + } + ], + "waitingPublishers": 0, + "subscriptions": { + "my-sub": { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0, + "msgBacklog": 0, + "backlogSize": 0, + "msgBacklogNoDelayed": 0, + "blockedSubscriptionOnUnackedMsgs": false, + "msgDelayed": 0, + "unackedMessages": 0, + "type": "Exclusive", + "activeConsumerName": "3c544f1daa", + "msgRateExpired": 0.0, + "totalMsgExpired": 0, + "lastExpireTimestamp": 0, + "lastConsumedFlowTimestamp": 1625389101290, + "lastConsumedTimestamp": 1625389546070, + "lastAckedTimestamp": 1625389546162, + "lastMarkDeleteAdvancedTimestamp": 1625389546163, + "consumers": [ + { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0.0, + "consumerName": "3c544f1daa", + "availablePermits": 867, + "unackedMessages": 0, + "avgMessagesPerEntry": 6, + 
"blockedConsumerOnUnackedMsgs": false, + "lastAckedTimestamp": 1625389546162, + "lastConsumedTimestamp": 1625389546070, + "metadata": {}, + "address": "/127.0.0.1:35472", + "connectedSince": "2021-07-04T08:58:21.287682Z", + "clientVersion": "2.8.0" + } + ], + "isDurable": true, + "isReplicated": false, + "allowOutOfOrderDelivery": false, + "consumersAfterMarkDeletePosition": {}, + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0, + "durable": true, + "replicated": false + } + }, + "replication": {}, + "deduplicationStatus": "Disabled", + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0 +} + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/standalone.md b/site2/website/versioned_docs/version-2.8.x/standalone.md new file mode 100644 index 0000000000000..e06d49c591fe3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/standalone.md @@ -0,0 +1,271 @@ +--- +id: standalone +title: Set up a standalone Pulsar locally +sidebar_label: "Run Pulsar locally" +original_id: standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. The standalone mode includes a Pulsar broker, the necessary ZooKeeper and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> #### Pulsar in production? +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of the installation process. + +### System requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions, JRE/JDK 11 is recommended. + +:::tip + +By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in `conf/pulsar_env.sh` file under `PULSAR_MEM`. 
This is extra options passed into JVM. + +::: + +:::note + +Broker is only supported on 64-bit JVM. + +::: + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar @pulsar:version@ binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:binary_release_url + + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/). +`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more. +`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) example. +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the`.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by ZooKeeper and BookKeeper. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`logs` | Logs created by the installation. 
+ +:::tip + +If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +* [Install builtin connectors (optional)](#install-builtin-connectors-optional) +* [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing builtin connectors and tiered storage offloaders. + +::: + +### Install builtin connectors (optional) + +Since `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. +To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. +For example, if you download the `pulsar-io-aerospike-@pulsar:version@.nar` connector file, enter the following commands: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure `connectors` tarball is unzipped in every pulsar directory of the broker +(or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. 
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), +you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +::: + +### Install tiered storage offloaders (optional) + +:::tip + +Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +To enable tiered storage feature, follow the instructions below; otherwise skip this section. + +::: + +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. 
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), +you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +::: + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. + +```bash + +$ bin/pulsar standalone + +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash + +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@95] - Configuration Store cache started +2017-06-01 14:46:29,192 - INFO - [main:AuthenticationService@61] - Authentication is disabled +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@108] - Pulsar WebSocket Service started + +``` + +:::tip + +* The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. + +::: + +You can also run the service as a background process using the `pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). 
+ +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. + +### Consume a message + +The following command consumes a message from the `my-topic` topic with the subscription name `first-subscription`: + +```bash + +$ bin/pulsar-client consume my-topic -s "first-subscription" + +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +09:56:55.566 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.MultiTopicsConsumerImpl - [TopicsConsumerFakeTopicNamee2df9] [first-subscription] Success subscribe new topic my-topic in topics consumer, partitions: 4, allTopicPartitionsNumber: 4 + +``` + +:::tip + +As you may have noticed, we did not explicitly create the `my-topic` topic from which we consume the message. When you consume a message from a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +::: + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" + +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +13:09:39.356 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced + +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +:::tip + +If the service runs as a background process using the `pulsar-daemon start standalone` command, then use the `pulsar-daemon stop standalone` command to stop the service. 
+For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). + +::: + diff --git a/site2/website/versioned_docs/version-2.8.x/tiered-storage-aliyun.md b/site2/website/versioned_docs/version-2.8.x/tiered-storage-aliyun.md new file mode 100644 index 0000000000000..5772f162b5e26 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/tiered-storage-aliyun.md @@ -0,0 +1,257 @@ +--- +id: tiered-storage-aliyun +title: Use Aliyun OSS offloader with Pulsar +sidebar_label: "Aliyun OSS offloader" +original_id: tiered-storage-aliyun +--- + +This chapter guides you through every step of installing and configuring the Aliyun Object Storage Service (OSS) offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the Aliyun OSS offloader. + +### Prerequisite + +- Pulsar: 2.8.0 or later versions + +### Step + +This example uses Pulsar 2.8.0. + +1. Download the Pulsar tarball, see [here](https://pulsar.apache.org/docs/en/standalone/#install-pulsar-using-binary-release). + +2. Download and untar the Pulsar offloaders package, then copy the Pulsar offloaders as `offloaders` in the Pulsar directory, see [here](https://pulsar.apache.org/docs/en/standalone/#install-tiered-storage-offloaders-optional). + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/), [GCS](https://cloud.google.com/storage/), [Azure](https://portal.azure.com/#home), and [Aliyun OSS](https://www.aliyun.com/product/oss) for long-term storage. + + ``` + + tiered-storage-file-system-2.8.0.nar + tiered-storage-jcloud-2.8.0.nar + + ``` + + :::note + + * If you are running Pulsar in a bare-metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image. 
The `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to Aliyun OSS, you need to configure some properties of the Aliyun OSS offload driver. + +::: + +Besides, you can also configure the Aliyun OSS offloader to run it automatically or trigger it manually. + +### Configure Aliyun OSS offloader driver + +You can configure the Aliyun OSS offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + | Required configuration | Description | Example value | + | --- | --- |--- | + | `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | aliyun-oss | + | `offloadersDirectory` | Offloader directory | offloaders | + | `managedLedgerOffloadBucket` | Bucket | pulsar-topic-offload | + | `managedLedgerOffloadServiceEndpoint` | Endpoint | http://oss-cn-hongkong.aliyuncs.com | + +- **Optional** configurations are as below. + + | Optional | Description | Example value | + | --- | --- | --- | + | `managedLedgerOffloadReadBufferSizeInBytes` | Size of block read | 1 MB | + | `managedLedgerOffloadMaxBlockSizeInBytes` | Size of block write | 64 MB | + | `managedLedgerMinLedgerRolloverTimeMinutes` | Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment. | 2 | + | `managedLedgerMaxEntriesPerLedger` | Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment. | 5000 | + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in Aliyun OSS must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +managedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Endpoint (required) + +The endpoint is the region where a bucket is located. + +:::tip + +For more information about Aliyun OSS regions and endpoints, see [International website](https://www.alibabacloud.com/help/doc-detail/31837.htm) or [Chinese website](https://help.aliyun.com/document_detail/31837.html). + +::: + + +##### Example + +This example sets the endpoint as _oss-us-west-1-internal_. + +``` + +managedLedgerOffloadServiceEndpoint=http://oss-us-west-1-internal.aliyuncs.com + +``` + +#### Authentication (required) + +To be able to access Aliyun OSS, you need to authenticate with Aliyun OSS. + +Set the environment variables `ALIYUN_OSS_ACCESS_KEY_ID` and `ALIYUN_OSS_ACCESS_KEY_SECRET` in `conf/pulsar_env.sh`. + +"export" is important so that the variables are made available in the environment of spawned processes. + +```bash + +export ALIYUN_OSS_ACCESS_KEY_ID=ABC123456789 +export ALIYUN_OSS_ACCESS_KEY_SECRET=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from Aliyun OSS in the configuration file `broker.conf` or `standalone.conf`. + +| Configuration | Description | Default value | +| --- | --- | --- | +| `managedLedgerOffloadReadBufferSizeInBytes` | Block size for each individual read when reading back data from Aliyun OSS. 
| 1 MB | +| `managedLedgerOffloadMaxBlockSizeInBytes` | Maximum size of a "part" sent during a multipart upload to Aliyun OSS. It **cannot** be smaller than 5 MB. | 64 MB | + +### Run Aliyun OSS offloader automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. + +| Threshold value | Action | +| --- | --- | +| > 0 | It triggers the offloading operation if the topic storage reaches its threshold. | +| = 0 | It causes a broker to offload data as soon as possible. | +| < 0 | It disables automatic offloading operation. | + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace-level offload policies. Each namespace can have its own offload policy. If you want to set an offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the Aliyun OSS offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-threshold-em-). 
+ +::: + +### Run Aliyun OSS offloader manually + +For individual topics, you can trigger the Aliyun OSS offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to Aliyun OSS until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the Aliyun OSS offloader to run manually using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-em-). + + ::: + +- This example checks the Aliyun OSS offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the Aliyun OSS offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. 
+ + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-status-em-). + + ::: + diff --git a/site2/website/versioned_docs/version-2.8.x/tiered-storage-aws.md b/site2/website/versioned_docs/version-2.8.x/tiered-storage-aws.md new file mode 100644 index 0000000000000..a83de62643638 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/tiered-storage-aws.md @@ -0,0 +1,329 @@ +--- +id: tiered-storage-aws +title: Use AWS S3 offloader with Pulsar +sidebar_label: "AWS S3 offloader" +original_id: tiered-storage-aws +--- + +This chapter guides you through every step of installing and configuring the AWS S3 offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the AWS S3 offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or later versions + +### Step + +This example uses Pulsar 2.5.1. + +1. 
Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download from the Pulsar [downloads page](https://pulsar.apache.org/download) + + * Use [wget](https://www.gnu.org/software/wget): + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/) and [GCS](https://cloud.google.com/storage/) for long term storage. + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to AWS S3, you need to configure some properties of the AWS S3 offload driver. + +::: + +Besides, you can also configure the AWS S3 offloader to run it automatically or trigger it manually. 
+ +### Configure AWS S3 offloader driver + +You can configure the AWS S3 offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + Required configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive.

    **Note**: there is a third driver type, S3, which is identical to AWS S3, though S3 requires that you specify an endpoint URL using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if you use an S3-compatible data store other than AWS S3. | aws-s3 + `offloadersDirectory` | Offloader directory | offloaders + `s3ManagedLedgerOffloadBucket` | Bucket | pulsar-topic-offload + +- **Optional** configurations are as below. + + Optional | Description | Example value + |---|---|--- + `s3ManagedLedgerOffloadRegion` | Bucket region

    **Note**: before specifying a value for this parameter, you need to set the following configurations. Otherwise, you might get an error.

    - Set [`s3ManagedLedgerOffloadServiceEndpoint`](https://docs.aws.amazon.com/general/latest/gr/s3.html).

    Example
    `s3ManagedLedgerOffloadServiceEndpoint=https://s3.YOUR_REGION.amazonaws.com`

    - Grant `GetBucketLocation` permission to a user.

    For how to grant `GetBucketLocation` permission to a user, see [here](https://docs.aws.amazon.com/AmazonS3/latest/dev/using-with-s3-actions.html#using-with-s3-actions-related-to-buckets).| eu-west-3 + `s3ManagedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `s3ManagedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in AWS S3 must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directories and folders, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +s3ManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Bucket region + +A bucket region is a region where a bucket is located. If a bucket region is not specified, the **default** region (`US East (N. Virginia)`) is used. + +:::tip + +For more information about AWS regions and endpoints, see [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). + +::: + + +##### Example + +This example sets the bucket region as _eu-west-3_. + +``` + +s3ManagedLedgerOffloadRegion=eu-west-3 + +``` + +#### Authentication (required) + +To be able to access AWS S3, you need to authenticate with AWS S3. + +Pulsar does not provide any direct methods of configuring authentication for AWS S3, +but relies on the mechanisms supported by the [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). + +Once you have created a set of credentials in the AWS IAM console, you can configure credentials using one of the following methods. + +* Use EC2 instance metadata credentials. + + If you are on an AWS instance with an instance profile that provides credentials, Pulsar uses these credentials if no other mechanism is provided. + +* Set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` in `conf/pulsar_env.sh`. + + "export" is important so that the variables are made available in the environment of spawned processes. 
+ + ```bash + + export AWS_ACCESS_KEY_ID=ABC123456789 + export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +* Add the Java system properties `aws.accessKeyId` and `aws.secretKey` to `PULSAR_EXTRA_OPTS` in `conf/pulsar_env.sh`. + + ```bash + + PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacityPerThread=4096" + + ``` + +* Set the access credentials in `~/.aws/credentials`. + + ```conf + + [default] + aws_access_key_id=ABC123456789 + aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +* Assume an IAM role. + + This example uses the `DefaultAWSCredentialsProviderChain` for assuming this role. + + The broker must be rebooted for credentials specified in `pulsar_env` to take effect. + + ```conf + + s3ManagedLedgerOffloadRole= + s3ManagedLedgerOffloadRoleSessionName=pulsar-s3-offload + + ``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from AWS S3 in the configuration file `broker.conf` or `standalone.conf`. + +Configuration|Description|Default value +|---|---|--- +`s3ManagedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from AWS S3.|1 MB +`s3ManagedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to AWS S3. It **cannot** be smaller than 5 MB. |64 MB + +### Configure AWS S3 offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. + +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. 
+= 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the AWS S3 offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-threshold-em-). + +::: + +### Configure AWS S3 offloader to run manually + +For individual topics, you can trigger AWS S3 offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to AWS S3 until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the AWS S3 offloader to run manually using pulsar-admin. 
+ + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-em-). + + ::: + +- This example checks the AWS S3 offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the AWS S3 offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-status-em-). + + ::: + +## Tutorial + +For the complete and step-by-step instructions on how to use the AWS S3 offloader with Pulsar, see [here](https://hub.streamnative.io/offloaders/aws-s3/2.5.1#usage). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/tiered-storage-azure.md b/site2/website/versioned_docs/version-2.8.x/tiered-storage-azure.md new file mode 100644 index 0000000000000..e1485af3984e3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/tiered-storage-azure.md @@ -0,0 +1,264 @@ +--- +id: tiered-storage-azure +title: Use Azure BlobStore offloader with Pulsar +sidebar_label: "Azure BlobStore offloader" +original_id: tiered-storage-azure +--- + +This chapter guides you through every step of installing and configuring the Azure BlobStore offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the Azure BlobStore offloader. + +### Prerequisite + +- Pulsar: 2.6.2 or later versions + +### Step + +This example uses Pulsar 2.6.2. + +1. 
Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.6.2/apache-pulsar-2.6.2-bin.tar.gz) + + * Download from the Pulsar [downloads page](https://pulsar.apache.org/download) + + * Use [wget](https://www.gnu.org/software/wget): + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.6.2/apache-pulsar-2.6.2-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.6.2/apache-pulsar-offloaders-2.6.2-bin.tar.gz + tar xvfz apache-pulsar-offloaders-2.6.2-bin.tar.gz + + ``` + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.6.2/offloaders apache-pulsar-2.6.2/offloaders + + ls offloaders + + ``` + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/), [GCS](https://cloud.google.com/storage/) and [Azure](https://portal.azure.com/#home) for long term storage. + + ``` + + tiered-storage-file-system-2.6.2.nar + tiered-storage-jcloud-2.6.2.nar + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to Azure BlobStore, you need to configure some properties of the Azure BlobStore offload driver. + +::: + +Besides, you can also configure the Azure BlobStore offloader to run it automatically or trigger it manually. 
+ +### Configure Azure BlobStore offloader driver + +You can configure the Azure BlobStore offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + Required configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name | azureblob + `offloadersDirectory` | Offloader directory | offloaders + `managedLedgerOffloadBucket` | Bucket | pulsar-topic-offload + +- **Optional** configurations are as below. + + Optional | Description | Example value + |---|---|--- + `managedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `managedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in Azure BlobStore must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +managedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Authentication (required) + +To be able to access Azure BlobStore, you need to authenticate with Azure BlobStore. + +* Set the environment variables `AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` in `conf/pulsar_env.sh`. + + "export" is important so that the variables are made available in the environment of spawned processes. + + ```bash + + export AZURE_STORAGE_ACCOUNT=ABC123456789 + export AZURE_STORAGE_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from Azure BlobStore in the configuration file `broker.conf` or `standalone.conf`. + +Configuration|Description|Default value +|---|---|--- +`managedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from Azure BlobStore store.|1 MB +`managedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to Azure BlobStore store. It **cannot** be smaller than 5 MB. |64 MB + +### Configure Azure BlobStore offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. 
+ +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the Azure BlobStore offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-threshold-em-). + +::: + +### Configure Azure BlobStore offloader to run manually + +For individual topics, you can trigger Azure BlobStore offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. 
If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to Azure BlobStore until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the Azure BlobStore offloader to run manually using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-em-). + + ::: + +- This example checks the Azure BlobStore offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the Azure BlobStore offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-status-em-). 
+ + ::: + diff --git a/site2/website/versioned_docs/version-2.8.x/tiered-storage-filesystem.md b/site2/website/versioned_docs/version-2.8.x/tiered-storage-filesystem.md new file mode 100644 index 0000000000000..4456b615afaf2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/tiered-storage-filesystem.md @@ -0,0 +1,630 @@ +--- +id: tiered-storage-filesystem +title: Use filesystem offloader with Pulsar +sidebar_label: "Filesystem offloader" +original_id: tiered-storage-filesystem +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This chapter guides you through every step of installing and configuring the filesystem offloader and using it with Pulsar. + +## Installation + +This section describes how to install the filesystem offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or higher versions + +### Step + +This example uses Pulsar 2.5.1. + +1. Download the Pulsar tarball using one of the following ways: + + * Download the Pulsar tarball from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download the Pulsar tarball from the Pulsar [download page](https://pulsar.apache.org/download) + + * Use the [wget](https://www.gnu.org/software/wget) command to dowload the Pulsar tarball. + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + + :::note + + * If you run Pulsar in a bare metal cluster, ensure that the `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you run Pulsar in Docker or deploying Pulsar using a Docker image (such as K8S and DCOS), you can use the `apachepulsar/pulsar-all` image. 
The `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + + :::note + + * If you run Pulsar in a bare metal cluster, ensure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you run Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image. The `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to filesystem, you need to configure some properties of the filesystem offloader driver. + +::: + +Besides, you can also configure the filesystem offloader to run it automatically or trigger it manually. + +### Configure filesystem offloader driver + +You can configure the filesystem offloader driver in the `broker.conf` or `standalone.conf` configuration file. + +````mdx-code-block + + + +- **Required** configurations are as below. + + Parameter | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | filesystem + `fileSystemURI` | Connection address, which is the URI to access the default Hadoop distributed file system. | hdfs://127.0.0.1:9000 + `offloadersDirectory` | Offloader directory | offloaders + `fileSystemProfilePath` | Hadoop profile path. The configuration file is stored in the Hadoop profile path. It contains various settings for Hadoop performance tuning. | ../conf/filesystem_offload_core_site.xml + +- **Optional** configurations are as below. 
+ + Parameter| Description | Example value + |---|---|--- + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic.

    **Note**: it is not recommended to set this parameter in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended to set this parameter in the production environment.|5000 + +
    + + +- **Required** configurations are as below. + + Parameter | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | filesystem + `offloadersDirectory` | Offloader directory | offloaders + `fileSystemProfilePath` | NFS profile path. The configuration file is stored in the NFS profile path. It contains various settings for performance tuning. | ../conf/filesystem_offload_core_site.xml + +- **Optional** configurations are as below. + + Parameter| Description | Example value + |---|---|--- + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic.

    **Note**: it is not recommended to set this parameter in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended to set this parameter in the production environment.|5000 + +
    + +
    +```` + +### Run filesystem offloader automatically + +You can configure the namespace policy to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic storage reaches the threshold, an offload operation is triggered automatically. + +Threshold value|Action +|---|--- +| > 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the filesystem offloader does not work until the current segment is full. + +You can configure the threshold using CLI tools, such as pulsar-admin. + +#### Example + +This example sets the filesystem offloader threshold to 10 MB using pulsar-admin. + +```bash + +pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#set-offload-threshold). + +::: + +### Run filesystem offloader manually + +For individual topics, you can trigger the filesystem offloader manually using one of the following methods: + +- Use the REST endpoint. + +- Use CLI tools (such as pulsar-admin). + +To manually trigger the filesystem offloader via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are offloaded to the filesystem until the threshold is no longer exceeded. Older segments are offloaded first. 
+ +#### Example + +- This example manually runs the filesystem offloader using pulsar-admin. + + ```bash + + pulsar-admin topics offload --size-threshold 10M persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload). + + ::: + +- This example checks the filesystem offloader status using pulsar-admin. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the filesystem offloader to complete the job, add the `-w` flag. + + ```bash + + pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in the offloading operation, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload-status). + + ::: + +## Tutorial + +This section provides step-by-step instructions on how to use the filesystem offloader to move data from Pulsar to Hadoop Distributed File System (HDFS) or Network File system (NFS). + +````mdx-code-block + + + +To move data from Pulsar to HDFS, follow these steps. + +### Step 1: Prepare the HDFS environment + +This tutorial sets up a Hadoop single node cluster and uses Hadoop 3.2.1. + +:::tip + +For details about how to set up a Hadoop single node cluster, see [here](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html). + +::: + +1. Download and uncompress Hadoop 3.2.1. + + ``` + + wget https://mirrors.bfsu.edu.cn/apache/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz + + tar -zxvf hadoop-3.2.1.tar.gz -C $HADOOP_HOME + + ``` + +2. Configure Hadoop. + + ``` + + # $HADOOP_HOME/etc/hadoop/core-site.xml + + + fs.defaultFS + hdfs://localhost:9000 + + + + # $HADOOP_HOME/etc/hadoop/hdfs-site.xml + + + dfs.replication + 1 + + + + ``` + +3. Set passphraseless ssh. + + ``` + + # Now check that you can ssh to the localhost without a passphrase: + $ ssh localhost + # If you cannot ssh to localhost without a passphrase, execute the following commands + $ ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa + $ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + $ chmod 0600 ~/.ssh/authorized_keys + + ``` + +4. Start HDFS. 
+ + ``` + + # don't execute this command repeatedly, repeat execute will cauld the clusterId of the datanode is not consistent with namenode + $HADOOP_HOME/bin/hadoop namenode -format + $HADOOP_HOME/sbin/start-dfs.sh + + ``` + +5. Navigate to the [HDFS website](http://localhost:9870/). + + You can see the **Overview** page. + + ![](/assets/FileSystem-1.png) + + 1. At the top navigation bar, click **Datanodes** to check DataNode information. + + ![](/assets/FileSystem-2.png) + + 2. Click **HTTP Address** to get more detailed information about localhost:9866. + + As can be seen below, the size of **Capacity Used** is 4 KB, which is the initial value. + + ![](/assets/FileSystem-3.png) + +### Step 2: Install the filesystem offloader + +For details, see [installation](#installation). + +### Step 3: Configure the filesystem offloader + +As indicated in the [configuration](#configuration) section, you need to configure some properties for the filesystem offloader driver before using it. This tutorial assumes that you have configured the filesystem offloader driver as below and run Pulsar in **standalone** mode. + +Set the following configurations in the `conf/standalone.conf` file. + +```conf + +managedLedgerOffloadDriver=filesystem +fileSystemURI=hdfs://127.0.0.1:9000 +fileSystemProfilePath=../conf/filesystem_offload_core_site.xml + +``` + +:::note + +For testing purposes, you can set the following two configurations to speed up ledger rollover, but it is not recommended that you set them in the production environment. + +::: + +``` + +managedLedgerMinLedgerRolloverTimeMinutes=1 +managedLedgerMaxEntriesPerLedger=100 + +``` + + + + +:::note + +In this section, it is assumed that you have enabled NFS service and set the shared path of your NFS service. In this section, `/Users/test` is used as the shared path of NFS service. + +::: + +To offload data to NFS, follow these steps. 
+ +### Step 1: Install the filesystem offloader + +For details, see [installation](#installation). + +### Step 2: Mount your NFS to your local filesystem + +This example mounts */Users/pulsar_nfs* to */Users/test*. + +``` + +mount -e 192.168.0.103:/Users/test/Users/pulsar_nfs + +``` + +### Step 3: Configure the filesystem offloader driver + +As indicated in the [configuration](#configuration) section, you need to configure some properties for the filesystem offloader driver before using it. This tutorial assumes that you have configured the filesystem offloader driver as below and run Pulsar in **standalone** mode. + +1. Set the following configurations in the `conf/standalone.conf` file. + + ```conf + + managedLedgerOffloadDriver=filesystem + fileSystemProfilePath=../conf/filesystem_offload_core_site.xml + + ``` + +2. Modify the *filesystem_offload_core_site.xml* as follows. + + ``` + + + fs.defaultFS + file:/// + + + + hadoop.tmp.dir + file:///Users/pulsar_nfs + + + + io.file.buffer.size + 4096 + + + + io.seqfile.compress.blocksize + 1000000 + + + + io.seqfile.compression.type + BLOCK + + + + io.map.index.interval + 128 + + + ``` + + + + +```` + +### Step 4: Offload data from BookKeeper to filesystem + +Execute the following commands in the repository where you download Pulsar tarball. For example, `~/path/to/apache-pulsar-2.5.1`. + +1. Start Pulsar standalone. + + ``` + + bin/pulsar standalone -a 127.0.0.1 + + ``` + +2. To ensure the data generated is not deleted immediately, it is recommended to set the [retention policy](https://pulsar.apache.org/docs/en/next/cookbooks-retention-expiry/#retention-policies), which can be either a **size** limit or a **time** limit. The larger value you set for the retention policy, the longer the data can be retained. 
+ + ``` + + bin/pulsar-admin namespaces set-retention public/default --size 100M --time 2d + + ``` + + :::tip + + For more information about the `pulsarctl namespaces set-retention options` command, including flags, descriptions, default values, and shorthands, see [here](https://docs.streamnative.io/pulsarctl/v2.7.0.6/#-em-set-retention-em-). + + ::: + +3. Produce data using pulsar-client. + + ``` + + bin/pulsar-client produce -m "Hello FileSystem Offloader" -n 1000 public/default/fs-test + + ``` + +4. The offloading operation starts after a ledger rollover is triggered. To ensure offload data successfully, it is recommended that you wait until several ledger rollovers are triggered. In this case, you might need to wait for a second. You can check the ledger status using pulsarctl. + + ``` + + bin/pulsar-admin topics stats-internal public/default/fs-test + + ``` + + **Output** + + The data of the ledger 696 is not offloaded. + + ``` + + { + "version": 1, + "creationDate": "2020-06-16T21:46:25.807+08:00", + "modificationDate": "2020-06-16T21:46:25.821+08:00", + "ledgers": [ + { + "ledgerId": 696, + "isOffloaded": false + } + ], + "cursors": {} + } + + ``` + +5. Wait a second and send more messages to the topic. + + ``` + + bin/pulsar-client produce -m "Hello FileSystem Offloader" -n 1000 public/default/fs-test + + ``` + +6. Check the ledger status using pulsarctl. + + ``` + + bin/pulsar-admin topics stats-internal public/default/fs-test + + ``` + + **Output** + + The ledger 696 is rolled over. + + ``` + + { + "version": 2, + "creationDate": "2020-06-16T21:46:25.807+08:00", + "modificationDate": "2020-06-16T21:48:52.288+08:00", + "ledgers": [ + { + "ledgerId": 696, + "entries": 1001, + "size": 81695, + "isOffloaded": false + }, + { + "ledgerId": 697, + "isOffloaded": false + } + ], + "cursors": {} + } + + ``` + +7. Trigger the offloading operation manually using pulsarctl. 
+ + ``` + + bin/pulsar-admin topics offload -s 0 public/default/fs-test + + ``` + + **Output** + + Data in ledgers before the ledger 697 is offloaded. + + ``` + + # offload info, the ledgers before 697 will be offloaded + Offload triggered for persistent://public/default/fs-test3 for messages before 697:0:-1 + + ``` + +8. Check the ledger status using pulsar-admin. + + ``` + + bin/pulsar-admin topics stats-internal public/default/fs-test + + ``` + + **Output** + + The data of the ledger 696 is offloaded. + + ``` + + { + "version": 4, + "creationDate": "2020-06-16T21:46:25.807+08:00", + "modificationDate": "2020-06-16T21:52:13.25+08:00", + "ledgers": [ + { + "ledgerId": 696, + "entries": 1001, + "size": 81695, + "isOffloaded": true + }, + { + "ledgerId": 697, + "isOffloaded": false + } + ], + "cursors": {} + } + + ``` + + And the **Capacity Used** is changed from 4 KB to 116.46 KB. + + ![](/assets/FileSystem-8.png) \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/tiered-storage-gcs.md b/site2/website/versioned_docs/version-2.8.x/tiered-storage-gcs.md new file mode 100644 index 0000000000000..81e7c5c6e6a44 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/tiered-storage-gcs.md @@ -0,0 +1,319 @@ +--- +id: tiered-storage-gcs +title: Use GCS offloader with Pulsar +sidebar_label: "GCS offloader" +original_id: tiered-storage-gcs +--- + +This chapter guides you through every step of installing and configuring the GCS offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the GCS offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or later versions + +### Step + +This example uses Pulsar 2.5.1. + +1. 
Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download from the Pulsar [download page](https://pulsar.apache.org/download) + + * Use [wget](https://www.gnu.org/software/wget) + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8S and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + As shown in the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support GCS and AWS S3 for long term storage. + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + +## Configuration + +:::note + +Before offloading data from BookKeeper to GCS, you need to configure some properties of the GCS offloader driver. + +::: + +Besides, you can also configure the GCS offloader to run it automatically or trigger it manually. + +### Configure GCS offloader driver + +You can configure GCS offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. 
+ + **Required** configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver`|Offloader driver name, which is case-insensitive.|google-cloud-storage + `offloadersDirectory`|Offloader directory|offloaders + `gcsManagedLedgerOffloadBucket`|Bucket|pulsar-topic-offload + `gcsManagedLedgerOffloadRegion`|Bucket region|europe-west3 + `gcsManagedLedgerOffloadServiceAccountKeyFile`|Authentication |/Users/user-name/Downloads/project-804d5e6a6f33.json + +- **Optional** configurations are as below. + + Optional configuration|Description|Example value + |---|---|--- + `gcsManagedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `gcsManagedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic.|2 + `managedLedgerMaxEntriesPerLedger`|The max number of entries to append to a ledger before triggering a rollover.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in GCS **must** be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you can not nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +gcsManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Bucket region (required) + +Bucket region is the region where a bucket is located. If a bucket region is not specified, the **default** region (`us multi-regional location`) is used. + +:::tip + +For more information about bucket location, see [here](https://cloud.google.com/storage/docs/bucket-locations). + +::: + +##### Example + +This example sets the bucket region as _europe-west3_. 
+ +``` + +gcsManagedLedgerOffloadRegion=europe-west3 + +``` + +#### Authentication (required) + +To enable a broker access GCS, you need to configure `gcsManagedLedgerOffloadServiceAccountKeyFile` in the configuration file `broker.conf`. + +`gcsManagedLedgerOffloadServiceAccountKeyFile` is +a JSON file, containing GCS credentials of a service account. + +##### Example + +To generate service account credentials or view the public credentials that you've already generated, follow the following steps. + +1. Navigate to the [Service accounts page](https://console.developers.google.com/iam-admin/serviceaccounts). + +2. Select a project or create a new one. + +3. Click **Create service account**. + +4. In the **Create service account** window, type a name for the service account and select **Furnish a new private key**. + + If you want to [grant G Suite domain-wide authority](https://developers.google.com/identity/protocols/OAuth2ServiceAccount#delegatingauthority) to the service account, select **Enable G Suite Domain-wide Delegation**. + +5. Click **Create**. + + :::note + + Make sure the service account you create has permission to operate GCS, you need to assign **Storage Admin** permission to your service account [here](https://cloud.google.com/storage/docs/access-control/iam). + + ::: + +6. You can get the following information and set this in `broker.conf`. + + ```conf + + gcsManagedLedgerOffloadServiceAccountKeyFile="/Users/user-name/Downloads/project-804d5e6a6f33.json" + + ``` + + :::tip + + - For more information about how to create `gcsManagedLedgerOffloadServiceAccountKeyFile`, see [here](https://support.google.com/googleapi/answer/6158849). + - For more information about Google Cloud IAM, see [here](https://cloud.google.com/storage/docs/access-control/iam). + + ::: + +#### Size of block read/write + +You can configure the size of a request sent to or read from GCS in the configuration file `broker.conf`. 
+ +Configuration|Description +|---|--- +`gcsManagedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from GCS.

    The **default** value is 1 MB. +`gcsManagedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to GCS.

    It **cannot** be smaller than 5 MB.

    The **default** value is 64 MB. + +### Configure GCS offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offload operation is triggered automatically. + +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the command [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the GCS offloader threshold size to 10 MB using pulsar-admin. + +```bash + +pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#set-offload-threshold). + +::: + +### Configure GCS offloader to run manually + +For individual topics, you can trigger GCS offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). 
+ + To trigger the GCS via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to GCS until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the GCS offloader to run manually using pulsar-admin with the command `pulsar-admin topics offload (topic-name) (threshold)`. + + ```bash + + pulsar-admin topics offload persistent://my-tenant/my-namespace/topic1 10M + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload). + + ::: + +- This example checks the GCS offloader status using pulsar-admin with the command `pulsar-admin topics offload-status options`. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for GCS to complete the job, add the `-w` flag. + + ```bash + + pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload-status). + + ::: + +## Tutorial + +For the complete and step-by-step instructions on how to use the GCS offloader with Pulsar, see [here](https://hub.streamnative.io/offloaders/gcs/2.5.1#usage). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/tiered-storage-overview.md b/site2/website/versioned_docs/version-2.8.x/tiered-storage-overview.md new file mode 100644 index 0000000000000..c635034f463b4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/tiered-storage-overview.md @@ -0,0 +1,52 @@ +--- +id: tiered-storage-overview +title: Overview of tiered storage +sidebar_label: "Overview" +original_id: tiered-storage-overview +--- + +Pulsar's **Tiered Storage** feature allows older backlog data to be moved from BookKeeper to long term and cheaper storage, while still allowing clients to access the backlog as if nothing has changed. + +* Tiered storage uses [Apache jclouds](https://jclouds.apache.org) to support [Amazon S3](https://aws.amazon.com/s3/) and [GCS (Google Cloud Storage)](https://cloud.google.com/storage/) for long term storage. + + With jclouds, it is easy to add support for more [cloud storage providers](https://jclouds.apache.org/reference/providers/#blobstore-providers) in the future. + + :::tip + + - For more information about how to use the AWS S3 offloader with Pulsar, see [here](tiered-storage-aws.md). 
+ + - For more information about how to use the GCS offloader with Pulsar, see [here](tiered-storage-gcs.md). + + ::: + +* Tiered storage uses [Apache Hadoop](http://hadoop.apache.org/) to support filesystems for long term storage. + + With Hadoop, it is easy to add support for more filesystems in the future. + + :::tip + + For more information about how to use the filesystem offloader with Pulsar, see [here](tiered-storage-filesystem.md). + + ::: + +## When to use tiered storage? + +Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. + +For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm, you can rerun it against your full user history. + +## How does tiered storage work? + +A topic in Pulsar is backed by a **log**, known as a **managed ledger**. This log is composed of an ordered list of segments. Pulsar only writes to the final segment of the log. All previous segments are sealed. The data within the segment is immutable. This is known as a **segment oriented architecture**. + +![Tiered storage](/assets/pulsar-tiered-storage.png "Tiered Storage") + +The tiered storage offloading mechanism takes advantage of segment oriented architecture. When offloading is requested, the segments of the log are copied one-by-one to tiered storage. All segments of the log (apart from the current segment) written to tiered storage can be offloaded. + +Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper, it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. 
+ +Before offloading ledgers to long term storage, you need to configure buckets, credentials, and other properties for the cloud storage service. Additionally, Pulsar uses multi-part objects to upload the segment data and brokers may crash while uploading the data. It is recommended that you add a life cycle rule for your bucket to expire incomplete multi-part upload after a day or two days to avoid getting charged for incomplete uploads. Moreover, you can trigger the offloading operation manually (via REST API or CLI) or automatically (via CLI). + +After offloading ledgers to long term storage, you can still query data in the offloaded ledgers with Pulsar SQL. + +For more information about tiered storage for Pulsar topics, see [here](https://github.com/apache/pulsar/wiki/PIP-17:-Tiered-storage-for-Pulsar-topics). diff --git a/site2/website/versioned_docs/version-2.8.x/transaction-api.md b/site2/website/versioned_docs/version-2.8.x/transaction-api.md new file mode 100644 index 0000000000000..fedc314646c93 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/transaction-api.md @@ -0,0 +1,172 @@ +--- +id: transactions-api +title: Transactions API +sidebar_label: "Transactions API" +original_id: transactions-api +--- + +All messages in a transaction are available only to consumers after the transaction has been committed. If a transaction has been aborted, all the writes and acknowledgments in this transaction roll back. + +## Prerequisites + +1. To enable transactions in Pulsar, you need to configure the parameter in `broker.conf` file or `standalone.conf` file. + +``` + +transactionCoordinatorEnabled=true + +``` + +2. Initialize transaction coordinator metadata, so the transaction coordinators can leverage advantages of the partitioned topic, such as load balance. + +``` + +bin/pulsar initialize-transaction-coordinator-metadata -cs 127.0.0.1:2181 -c standalone + +``` + +After initializing transaction coordinator metadata, you can use the transactions API. 
The following APIs are available. + +## Initialize Pulsar client + +You can enable transactions for the client and initialize the transaction coordinator client. + +``` + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .enableTransaction(true) + .build(); + +``` + +## Start transactions +You can start a transaction in the following way. + +``` + +Transaction txn = pulsarClient + .newTransaction() + .withTransactionTimeout(5, TimeUnit.MINUTES) + .build() + .get(); + +``` + +## Produce transaction messages + +A transaction parameter is required when producing new transaction messages. The semantics of the transaction messages in Pulsar are `read-committed`, so the consumer cannot receive the ongoing transaction messages before the transaction is committed. + +``` + +producer.newMessage(txn).value("Hello Pulsar Transaction".getBytes()).sendAsync(); + +``` + +## Acknowledge the messages with the transaction + +The transaction acknowledgement requires a transaction parameter. The transaction acknowledgement changes the message state to the pending-ack state. When the transaction is committed, the pending-ack state becomes ack state. If the transaction is aborted, the pending-ack state becomes unack state. + +``` + +Message message = consumer.receive(); +consumer.acknowledgeAsync(message.getMessageId(), txn); + +``` + +## Commit transactions + +When the transaction is committed, consumers receive the transaction messages and the pending-ack state becomes ack state. + +``` + +txn.commit().get(); + +``` + +## Abort transaction + +When the transaction is aborted, the transaction acknowledgement is canceled and the pending-ack messages are redelivered. + +``` + +txn.abort().get(); + +``` + +### Example +The following example shows how messages are processed in a transaction. 
+ +``` + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl(getPulsarServiceList().get(0).getBrokerServiceUrl()) + .statsInterval(0, TimeUnit.SECONDS) + .enableTransaction(true) + .build(); + +String sourceTopic = "public/default/source-topic"; +String sinkTopic = "public/default/sink-topic"; + +Producer sourceProducer = pulsarClient + .newProducer(Schema.STRING) + .topic(sourceTopic) + .create(); +sourceProducer.newMessage().value("hello pulsar transaction").sendAsync(); + +Consumer sourceConsumer = pulsarClient + .newConsumer(Schema.STRING) + .topic(sourceTopic) + .subscriptionName("test") + .subscriptionType(SubscriptionType.Shared) + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + +Producer sinkProducer = pulsarClient + .newProducer(Schema.STRING) + .topic(sinkTopic) + .create(); + +Transaction txn = pulsarClient + .newTransaction() + .withTransactionTimeout(5, TimeUnit.MINUTES) + .build() + .get(); + +// source message acknowledgement and sink message produce belong to one transaction, +// they are combined into an atomic operation. +Message message = sourceConsumer.receive(); +sourceConsumer.acknowledgeAsync(message.getMessageId(), txn); +sinkProducer.newMessage(txn).value("sink data").sendAsync(); + +txn.commit().get(); + +``` + +## Enable batch messages in transactions + +To enable batch messages in transactions, you need to enable the batch index acknowledgement feature. The transaction acks check whether the batch index acknowledgement conflicts. + +To enable batch index acknowledgement, you need to set `acknowledgmentAtBatchIndexLevelEnabled` to `true` in the `broker.conf` or `standalone.conf` file. + +``` + +acknowledgmentAtBatchIndexLevelEnabled=true + +``` + +And then you need to call the `enableBatchIndexAcknowledgment(true)` method in the consumer builder. 
+ +``` + +Consumer sinkConsumer = pulsarClient + .newConsumer() + .topic(transferTopic) + .subscriptionName("sink-topic") + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscriptionType(SubscriptionType.Shared) + .enableBatchIndexAcknowledgment(true) // enable batch index acknowledgement + .subscribe(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/transaction-guarantee.md b/site2/website/versioned_docs/version-2.8.x/transaction-guarantee.md new file mode 100644 index 0000000000000..9db2d254e159f --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/transaction-guarantee.md @@ -0,0 +1,17 @@ +--- +id: transactions-guarantee +title: Transactions Guarantee +sidebar_label: "Transactions Guarantee" +original_id: transactions-guarantee +--- + +Pulsar transactions support the following guarantees. + +## Atomic multi-partition writes and multi-subscription acknowledges +Transactions enable atomic writes to multiple topics and partitions. A batch of messages in a transaction can be received from, produced to, and acknowledged by many partitions. All the operations involved in a transaction succeed or fail as a single unit. + +## Read transactional message +All the messages in a transaction are available to consumers only after the transaction is committed. + +## Acknowledge transactional message +A message is acknowledged successfully only once by a consumer under the subscription when acknowledging the message with the transaction ID. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/txn-how.md b/site2/website/versioned_docs/version-2.8.x/txn-how.md new file mode 100644 index 0000000000000..add072448aeb3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/txn-how.md @@ -0,0 +1,151 @@ +--- +id: txn-how +title: How transactions work? +sidebar_label: "How transactions work?" 
+original_id: txn-how +--- + +This section describes transaction components and how the components work together. For the complete design details, see [PIP-31: Transactional Streaming](https://docs.google.com/document/d/145VYp09JKTw9jAT-7yNyFU255FptB2_B2Fye100ZXDI/edit#heading=h.bm5ainqxosrx). + +## Key concept + +It is important to know the following key concepts, which is a prerequisite for understanding how transactions work. + +### Transaction coordinator + +The transaction coordinator (TC) is a module running inside a Pulsar broker. + +* It maintains the entire life cycle of transactions and prevents a transaction from getting into an incorrect status. + +* It handles transaction timeout, and ensures that the transaction is aborted after a transaction timeout. + +### Transaction log + +All the transaction metadata persists in the transaction log. The transaction log is backed by a Pulsar topic. If the transaction coordinator crashes, it can restore the transaction metadata from the transaction log. + +The transaction log stores the transaction status rather than actual messages in the transaction (the actual messages are stored in the actual topic partitions). + +### Transaction buffer + +Messages produced to a topic partition within a transaction are stored in the transaction buffer (TB) of that topic partition. The messages in the transaction buffer are not visible to consumers until the transactions are committed. The messages in the transaction buffer are discarded when the transactions are aborted. + +Transaction buffer stores all ongoing and aborted transactions in memory. All messages are sent to the actual partitioned Pulsar topics. After transactions are committed, the messages in the transaction buffer are materialized (visible) to consumers. When the transactions are aborted, the messages in the transaction buffer are discarded. + +### Transaction ID + +Transaction ID (TxnID) identifies a unique transaction in Pulsar. The transaction ID is 128-bit. 
The highest 16 bits are reserved for the ID of the transaction coordinator, and the remaining bits are used for monotonically increasing numbers in each transaction coordinator. It is easy to locate the transaction crash with the TxnID. + +### Pending acknowledge state + +Pending acknowledge state maintains message acknowledgments within a transaction before a transaction completes. If a message is in the pending acknowledge state, the message cannot be acknowledged by other transactions until the message is removed from the pending acknowledge state. + +The pending acknowledge state is persisted to the pending acknowledge log (cursor ledger). A new broker can restore the state from the pending acknowledge log to ensure the acknowledgement is not lost. + +## Data flow + +At a high level, the data flow can be split into several steps: + +1. Begin a transaction. + +2. Publish messages with a transaction. + +3. Acknowledge messages with a transaction. + +4. End a transaction. + +To help you debug or tune the transaction for better performance, review the following diagrams and descriptions. + +### 1. Begin a transaction + +Before introducing the transaction in Pulsar, a producer is created and then messages are sent to brokers and stored in data logs. + +![](/assets/txn-3.png) + +Let’s walk through the steps for _beginning a transaction_. + +| Step | Description | +| --- | --- | +| 1.1 | The first step is that the Pulsar client finds the transaction coordinator. | +| 1.2 | The transaction coordinator allocates a transaction ID for the transaction. In the transaction log, the transaction is logged with its transaction ID and status (OPEN), which ensures the transaction status is persisted regardless of transaction coordinator crashes. | +| 1.3 | The transaction log sends the result of persisting the transaction ID to the transaction coordinator. 
| +| 1.4 | After the transaction status entry is logged, the transaction coordinator brings the transaction ID back to the Pulsar client. | + +### 2. Publish messages with a transaction + +In this stage, the Pulsar client enters a transaction loop, repeating the `consume-process-produce` operation for all the messages that comprise the transaction. This is a long phase and is potentially composed of multiple produce and acknowledgement requests. + +![](/assets/txn-4.png) + +Let’s walk through the steps for _publishing messages with a transaction_. + +| Step | Description | +| --- | --- | +| 2.1.1 | Before the Pulsar client produces messages to a new topic partition, it sends a request to the transaction coordinator to add the partition to the transaction. | +| 2.1.2 | The transaction coordinator logs the partition changes of the transaction into the transaction log for durability, which ensures the transaction coordinator knows all the partitions that a transaction is handling. The transaction coordinator can commit or abort changes on each partition at the end-partition phase. | +| 2.1.3 | The transaction log sends the result of logging the new partition (used for producing messages) to the transaction coordinator. | +| 2.1.4 | The transaction coordinator sends the result of adding a new produced partition to the transaction. | +| 2.2.1 | The Pulsar client starts producing messages to partitions. The flow of this part is the same as the normal flow of producing messages except that the batch of messages produced by a transaction contains transaction IDs. | +| 2.2.2 | The broker writes messages to a partition. | + +### 3. Acknowledge messages with a transaction + +In this phase, the Pulsar client sends a request to the transaction coordinator and a new subscription is acknowledged as a part of a transaction. + +![](/assets/txn-5.png) + +Let’s walk through the steps for _acknowledging messages with a transaction_. 
+ +| Step | Description | +| --- | --- | +| 3.1.1 | The Pulsar client sends a request to add an acknowledged subscription to the transaction coordinator. | +| 3.1.2 | The transaction coordinator logs the addition of subscription, which ensures that it knows all subscriptions handled by a transaction and can commit or abort changes on each subscription at the end phase. | +| 3.1.3 | The transaction log sends the result of logging the new partition (used for acknowledging messages) to the transaction coordinator. | +| 3.1.4 | The transaction coordinator sends the result of adding the new acknowledged partition to the transaction. | +| 3.2 | The Pulsar client acknowledges messages on the subscription. The flow of this part is the same as the normal flow of acknowledging messages except that the acknowledged request carries a transaction ID. | +| 3.3 | The broker receiving the acknowledgement request checks if the acknowledgment belongs to a transaction or not. | + +### 4. End a transaction + +At the end of a transaction, the Pulsar client decides to commit or abort the transaction. The transaction can be aborted when a conflict is detected on acknowledging messages. + +#### 4.1 End transaction request + +When the Pulsar client finishes a transaction, it issues an end transaction request. + +![](/assets/txn-6.png) + +Let’s walk through the steps for _ending the transaction_. + +| Step | Description | +| --- | --- | +| 4.1.1 | The Pulsar client issues an end transaction request (with a field indicating whether the transaction is to be committed or aborted) to the transaction coordinator. | +| 4.1.2 | The transaction coordinator writes a COMMITTING or ABORTING message to its transaction log. | +| 4.1.3 | The transaction log sends the result of logging the committing or aborting status. | + +#### 4.2 Finalize a transaction + +The transaction coordinator starts the process of committing or aborting messages to all the partitions involved in this transaction. 
+ +![](/assets/txn-7.png) + +Let’s walk through the steps for _finalizing a transaction_. + +| Step | Description | +| --- | --- | +| 4.2.1 | The transaction coordinator commits transactions on subscriptions and commits transactions on partitions at the same time. | +| 4.2.2 | The broker (produce) writes produced committed markers to the actual partitions. At the same time, the broker (ack) writes acked committed marks to the subscription pending ack partitions. | +| 4.2.3 | The data log sends the result of writing produced committed marks to the broker. At the same time, pending ack data log sends the result of writing acked committed marks to the broker. The cursor moves to the next position. | + +#### 4.3 Mark a transaction as COMMITTED or ABORTED + +The transaction coordinator writes the final transaction status to the transaction log to complete the transaction. + +![](/assets/txn-8.png) + +Let’s walk through the steps for _marking a transaction as COMMITTED or ABORTED_. + +| Step | Description | +| --- | --- | +| 4.3.1 | After all produced messages and acknowledgements to all partitions involved in this transaction have been successfully committed or aborted, the transaction coordinator writes the final COMMITTED or ABORTED transaction status messages to its transaction log, indicating that the transaction is complete. All the messages associated with the transaction in its transaction log can be safely removed. | +| 4.3.2 | The transaction log sends the result of the committed transaction to the transaction coordinator. | +| 4.3.3 | The transaction coordinator sends the result of the committed transaction to the Pulsar client. 
| diff --git a/site2/website/versioned_docs/version-2.8.x/txn-monitor.md b/site2/website/versioned_docs/version-2.8.x/txn-monitor.md new file mode 100644 index 0000000000000..5b50953772d09 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/txn-monitor.md @@ -0,0 +1,10 @@ +--- +id: txn-monitor +title: How to monitor transactions? +sidebar_label: "How to monitor transactions?" +original_id: txn-monitor +--- + +You can monitor the status of the transactions in Prometheus and Grafana using the [transaction metrics](https://pulsar.apache.org/docs/en/next/reference-metrics/#pulsar-transaction). + +For how to configure Prometheus and Grafana, see [here](https://pulsar.apache.org/docs/en/next/deploy-monitoring). diff --git a/site2/website/versioned_docs/version-2.8.x/txn-use.md b/site2/website/versioned_docs/version-2.8.x/txn-use.md new file mode 100644 index 0000000000000..a16ea7140da76 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/txn-use.md @@ -0,0 +1,105 @@ +--- +id: txn-use +title: How to use transactions? +sidebar_label: "How to use transactions?" +original_id: txn-use +--- + +## Transaction API + +The transaction feature is primarily a server-side and protocol-level feature. You can use the transaction feature via the [transaction API](https://pulsar.apache.org/api/admin/), which is available in **Pulsar 2.8.0 or later**. + +To use the transaction API, you do not need any additional settings in the Pulsar client. **By default**, transactions are **disabled**. + +Currently, the transaction API is only available for **Java** clients. Support for other language clients will be added in future releases. + +## Quick start + +This section provides an example of how to use the transaction API to send and receive messages in a Java client. + +1. Start Pulsar 2.8.0 or later. + +2. Enable transactions. + + Change the configuration in the `broker.conf` file. 
+ + ``` + + transactionCoordinatorEnabled=true + + ``` + + If you want to enable batch messages in transactions, follow the steps below. + + Set `acknowledgmentAtBatchIndexLevelEnabled` to `true` in the `broker.conf` or `standalone.conf` file. + + ``` + + acknowledgmentAtBatchIndexLevelEnabled=true + + ``` + +3. Initialize transaction coordinator metadata. + + The transaction coordinator can leverage the advantages of partitioned topics (such as load balancing). + + **Input** + + ``` + + bin/pulsar initialize-transaction-coordinator-metadata -cs 127.0.0.1:2181 -c standalone + + ``` + + **Output** + + ``` + + Transaction coordinator metadata setup success + + ``` + +4. Initialize a Pulsar client. + + ``` + + PulsarClient client = PulsarClient.builder() + + .serviceUrl("pulsar://localhost:6650") + + .enableTransaction(true) + + .build(); + + ``` + +Now you can start using the transaction API to send and receive messages. Below is an example of a `consume-process-produce` application written in Java. + +![](/assets/txn-9.png) + +Let’s walk through this example step by step. + +| Step | Description | +| --- | --- | +| 1. Start a transaction. | The application opens a new transaction by calling PulsarClient.newTransaction. It specifies the transaction timeout as 1 minute. If the transaction is not committed within 1 minute, the transaction is automatically aborted. | +| 2. Receive messages from topics. | The application creates two normal consumers to receive messages from topic input-topic-1 and input-topic-2 respectively. | +| 3. Publish messages to topics with the transaction. | The application creates two producers to produce the resulting messages to the output topics output-topic-1 and output-topic-2 respectively. The application applies the processing logic and generates two output messages. The application sends those two output messages as part of the transaction opened in the first step via Producer.newMessage(Transaction). | +| 4.
Acknowledge the messages with the transaction. | In the same transaction, the application acknowledges the two input messages. | +| 5. Commit the transaction. | The application commits the transaction by calling Transaction.commit() on the open transaction. The commit operation ensures the two input messages are marked as acknowledged and the two output messages are written successfully to the output topics. | + +[1] Example of enabling batch messages ack in transactions in the consumer builder. + +``` + +Consumer sinkConsumer = pulsarClient + .newConsumer() + .topic(transferTopic) + .subscriptionName("sink-topic") + +.subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscriptionType(SubscriptionType.Shared) + .enableBatchIndexAcknowledgment(true) // enable batch index acknowledgement + .subscribe(); + +``` + diff --git a/site2/website/versioned_docs/version-2.8.x/txn-what.md b/site2/website/versioned_docs/version-2.8.x/txn-what.md new file mode 100644 index 0000000000000..844f19a700f8f --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/txn-what.md @@ -0,0 +1,60 @@ +--- +id: txn-what +title: What are transactions? +sidebar_label: "What are transactions?" +original_id: txn-what +--- + +Transactions strengthen the message delivery semantics of Apache Pulsar and [processing guarantees of Pulsar Functions](https://pulsar.apache.org/docs/en/next/functions-overview/#processing-guarantees). The Pulsar Transaction API supports atomic writes and acknowledgments across multiple topics. + +Transactions allow: + +- A producer to send a batch of messages to multiple topics where all messages in the batch are eventually visible to any consumer, or none are ever visible to consumers. + +- End-to-end exactly-once semantics (execute a `consume-process-produce` operation exactly once). + +## Transaction semantics + +Pulsar transactions have the following semantics: + +* All operations within a transaction are committed as a single unit. 
+ + * Either all messages are committed, or none of them are. + + * Each message is written or processed exactly once, without data loss or duplicates (even in the event of failures). + + * If a transaction is aborted, all the writes and acknowledgments in this transaction rollback. + +* A group of messages in a transaction can be received from, produced to, and acknowledged by multiple partitions. + + * Consumers are only allowed to read committed (acked) messages. In other words, the broker does not deliver transactional messages which are part of an open transaction or messages which are part of an aborted transaction. + + * Message writes across multiple partitions are atomic. + + * Message acks across multiple subscriptions are atomic. A message is acked successfully only once by a consumer under the subscription when acknowledging the message with the transaction ID. + +## Transactions and stream processing + +Stream processing on Pulsar is a `consume-process-produce` operation on Pulsar topics: + +* `Consume`: a source operator that runs a Pulsar consumer reads messages from one or multiple Pulsar topics. + +* `Process`: a processing operator transforms the messages. + +* `Produce`: a sink operator that runs a Pulsar producer writes the resulting messages to one or multiple Pulsar topics. + +![](/assets/txn-2.png) + +Pulsar transactions support end-to-end exactly-once stream processing, which means messages are not lost from a source operator and messages are not duplicated to a sink operator. + +## Use case + +Prior to Pulsar 2.8.0, there was no easy way to build stream processing applications with Pulsar to achieve exactly-once processing guarantees. 
With the transaction introduced in Pulsar 2.8.0, the following services support exactly-once semantics: + +* [Pulsar Flink connector](https://flink.apache.org/2021/01/07/pulsar-flink-connector-270.html) + + Prior to Pulsar 2.8.0, if you wanted to build stream applications using Pulsar and Flink, the Pulsar Flink connector only supported an exactly-once source connector and an at-least-once sink connector. This means the highest end-to-end processing guarantee was at-least-once, and there was a possibility that streaming applications produced duplicated messages to the resulting topics in Pulsar. + + With the transaction introduced in Pulsar 2.8.0, the Pulsar Flink sink connector can support exactly-once semantics by implementing the designated `TwoPhaseCommitSinkFunction` and hooking up the Flink sink message lifecycle with Pulsar transaction API. + +* Support for Pulsar Functions and other connectors will be added in future releases. diff --git a/site2/website/versioned_docs/version-2.8.x/txn-why.md b/site2/website/versioned_docs/version-2.8.x/txn-why.md new file mode 100644 index 0000000000000..1ed8769977654 --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/txn-why.md @@ -0,0 +1,45 @@ +--- +id: txn-why +title: Why transactions? +sidebar_label: "Why transactions?" +original_id: txn-why +--- + +Pulsar transactions (txn) enable event streaming applications to consume, process, and produce messages in one atomic operation. The reason for developing this feature can be summarized as below. + +## Demand of stream processing + +The demand for stream processing applications with stronger processing guarantees has grown along with the rise of stream processing. For example, in the financial industry, financial institutions use stream processing engines to process debits and credits for users. This type of use case requires that every message is processed exactly once, without exception.
+ +In other words, if a stream processing application consumes message A and +produces the result as a message B (B = f(A)), then exactly-once processing +guarantee means that A can be marked as consumed if and only if B is +successfully produced, and vice versa. + +![](/assets/txn-1.png) + +The Pulsar transactions API strengthens the message delivery semantics and the processing guarantees for stream processing. It enables stream processing applications to consume, process, and produce messages in one atomic operation. That means, a batch of messages in a transaction can be received from, produced to and acknowledged by many topic partitions. All the operations involved in a transaction succeed or fail as one single unit. + +## Limitation of idempotent producer + +Avoiding data loss or duplication can be achieved by using the Pulsar idempotent producer, but it does not provide guarantees for writes across multiple partitions. + +In Pulsar, the highest level of message delivery guarantee is using an [idempotent producer](https://pulsar.apache.org/docs/en/next/concepts-messaging/#producer-idempotency) with the exactly once semantic at one single partition, that is, each message is persisted exactly once without data loss and duplication. However, there are some limitations in this solution: + +- Due to the monotonic increasing sequence ID, this solution only works on a single partition and within a single producer session (that is, for producing one message), so there is no atomicity when producing multiple messages to one or multiple partitions.
+ + In this case, if there are some failures (for example, client / broker / bookie crashes, network failure, and more) in the process of producing and receiving messages, messages are re-processed and re-delivered, which may cause data loss or data duplication: + + - For the producer: if the producer retries sending messages, some messages are persisted multiple times; if the producer does not retry sending messages, some messages are persisted once and other messages are lost. + + - For the consumer: since the consumer does not know whether the broker has received messages or not, the consumer may not retry sending acks, which causes it to receive duplicate messages. + +- Similarly, for Pulsar Function, it only guarantees exactly once semantics for an idempotent function on a single event rather than processing multiple events or producing multiple results that can happen exactly once. + + For example, if a function accepts multiple events and produces one result (for example, window function), the function may fail between producing the result and acknowledging the incoming messages, or even between acknowledging individual events, which causes all (or some) incoming messages to be re-delivered and reprocessed, and a new result is generated. + + However, many scenarios need atomic guarantees across multiple partitions and sessions. + +- Consumers need to rely on more mechanisms to acknowledge (ack) messages once. + + For example, consumers are required to store the MessageID along with its acked state. After the topic is unloaded, the subscription can recover the acked state of this MessageID in memory when the topic is loaded again.
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.8.x/window-functions-context.md b/site2/website/versioned_docs/version-2.8.x/window-functions-context.md new file mode 100644 index 0000000000000..f80fea57989ef --- /dev/null +++ b/site2/website/versioned_docs/version-2.8.x/window-functions-context.md @@ -0,0 +1,581 @@ +--- +id: window-functions-context +title: Window Functions Context +sidebar_label: "Window Functions: Context" +original_id: window-functions-context +--- + +Java SDK provides access to a **window context object** that can be used by a window function. This context object provides a wide variety of information and functionality for Pulsar window functions as below. + +- [Spec](#spec) + + * Names of all input topics and the output topic associated with the function. + * Tenant and namespace associated with the function. + * Pulsar window function name, ID, and version. + * ID of the Pulsar function instance running the window function. + * Number of instances that invoke the window function. + * Built-in type or custom class name of the output schema. + +- [Logger](#logger) + + * Logger object used by the window function, which can be used to create window function log messages. + +- [User config](#user-config) + + * Access to arbitrary user configuration values. + +- [Routing](#routing) + + * Routing is supported in Pulsar window functions. Pulsar window functions send messages to arbitrary topics as per the `publish` interface. + +- [Metrics](#metrics) + + * Interface for recording metrics. + +- [State storage](#state-storage) + + * Interface for storing and retrieving state in [state storage](#state-storage). + +## Spec + +Spec contains the basic information of a function. + +### Get input topics + +The `getInputTopics` method gets the **name list** of all input topics. + +This example demonstrates how to get the name list of all input topics in a Java window function. 
+ +```java + +public class GetInputTopicsWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + Collection inputTopics = context.getInputTopics(); + System.out.println(inputTopics); + + return null; + } + +} + +``` + +### Get output topic + +The `getOutputTopic` method gets the **name of a topic** to which the message is sent. + +This example demonstrates how to get the name of an output topic in a Java window function. + +```java + +public class GetOutputTopicWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String outputTopic = context.getOutputTopic(); + System.out.println(outputTopic); + + return null; + } +} + +``` + +### Get tenant + +The `getTenant` method gets the tenant name associated with the window function. + +This example demonstrates how to get the tenant name in a Java window function. + +```java + +public class GetTenantWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String tenant = context.getTenant(); + System.out.println(tenant); + + return null; + } + +} + +``` + +### Get namespace + +The `getNamespace` method gets the namespace associated with the window function. + +This example demonstrates how to get the namespace in a Java window function. + +```java + +public class GetNamespaceWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String ns = context.getNamespace(); + System.out.println(ns); + + return null; + } + +} + +``` + +### Get function name + +The `getFunctionName` method gets the window function name. + +This example demonstrates how to get the function name in a Java window function. 
+ +```java + +public class GetNameOfWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionName = context.getFunctionName(); + System.out.println(functionName); + + return null; + } + +} + +``` + +### Get function ID + +The `getFunctionId` method gets the window function ID. + +This example demonstrates how to get the function ID in a Java window function. + +```java + +public class GetFunctionIDWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionID = context.getFunctionId(); + System.out.println(functionID); + + return null; + } + +} + +``` + +### Get function version + +The `getFunctionVersion` method gets the window function version. + +This example demonstrates how to get the function version of a Java window function. + +```java + +public class GetVersionOfWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionVersion = context.getFunctionVersion(); + System.out.println(functionVersion); + + return null; + } + +} + +``` + +### Get instance ID + +The `getInstanceId` method gets the instance ID of a window function. + +This example demonstrates how to get the instance ID in a Java window function. + +```java + +public class GetInstanceIDWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + int instanceId = context.getInstanceId(); + System.out.println(instanceId); + + return null; + } + +} + +``` + +### Get num instances + +The `getNumInstances` method gets the number of instances that invoke the window function. + +This example demonstrates how to get the number of instances in a Java window function. 
+ +```java + +public class GetNumInstancesWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + int numInstances = context.getNumInstances(); + System.out.println(numInstances); + + return null; + } + +} + +``` + +### Get output schema type + +The `getOutputSchemaType` method gets the built-in type or custom class name of the output schema. + +This example demonstrates how to get the output schema type of a Java window function. + +```java + +public class GetOutputSchemaTypeWindowFunction implements WindowFunction { + + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String schemaType = context.getOutputSchemaType(); + System.out.println(schemaType); + + return null; + } +} + +``` + +## Logger + +Pulsar window functions using Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/apache/log4j/Logger.html) object that can be used to produce logs at the chosen log level. + +This example logs either a `WARNING`-level or `INFO`-level log based on whether the incoming string contains the word `danger` or not in a Java function. + +```java + +import java.util.Collection; +import org.apache.pulsar.functions.api.Record; +import org.apache.pulsar.functions.api.WindowContext; +import org.apache.pulsar.functions.api.WindowFunction; +import org.slf4j.Logger; + +public class LoggingWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + Logger log = context.getLogger(); + for (Record record : inputs) { + log.info(record + "-window-log"); + } + return null; + } + +} + +``` + +If you need your function to produce logs, specify a log topic when creating or running the function.
+ +```bash + +bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +You can access all logs produced by `LoggingFunction` via the `persistent://public/default/logging-function-logs` topic. + +## Metrics + +Pulsar window functions can publish arbitrary metrics to the metrics interface which can be queried. + +:::note + +If a Pulsar window function uses the language-native interface for Java, that function is not able to publish metrics and stats to Pulsar. + +::: + +You can record metrics using the context object on a per-key basis. + +This example sets a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message in a Java function. + +```java + +import java.util.Collection; +import org.apache.pulsar.functions.api.Record; +import org.apache.pulsar.functions.api.WindowContext; +import org.apache.pulsar.functions.api.WindowFunction; + + +/** + * Example function that wants to keep track of + * the event time of each message sent. + */ +public class UserMetricWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + + for (Record record : inputs) { + if (record.getEventTime().isPresent()) { + context.recordMetric("MessageEventTime", record.getEventTime().get().doubleValue()); + } + } + + return null; + } +} + +``` + +## User config + +When you run or update Pulsar Functions that are created using SDK, you can pass arbitrary key/value pairs to them with the `--user-config` flag. Key/value pairs **must** be specified as JSON. + +This example passes a user configured key/value to a function. 
+ +```bash + +bin/pulsar-admin functions create \ + --name word-filter \ + --user-config '{"forbidden-word":"rosebud"}' \ + # Other function configs + +``` + +### API +You can use the following APIs to get user-defined information for window functions. +#### getUserConfigMap + +`getUserConfigMap` API gets a map of all user-defined key/value configurations for the window function. + +```java + +/** + * Get a map of all user-defined key/value configs for the function. + * + * @return The full map of user-defined config values + */ + Map getUserConfigMap(); + +``` + +#### getUserConfigValue + +The `getUserConfigValue` API gets a user-defined key/value. + +```java + +/** + * Get any user-defined key/value. + * + * @param key The key + * @return The Optional value specified by the user for that key. + */ + Optional getUserConfigValue(String key); + +``` + +#### getUserConfigValueOrDefault + +The `getUserConfigValueOrDefault` API gets a user-defined key/value or a default value if none is present. + +```java + +/** + * Get any user-defined key/value or a default value if none is present. + * + * @param key + * @param defaultValue + * @return Either the user config value associated with a given key or a supplied default value + */ + Object getUserConfigValueOrDefault(String key, Object defaultValue); + +``` + +This example demonstrates how to access key/value pairs provided to Pulsar window functions. + +Java SDK context object enables you to access key/value pairs provided to Pulsar window functions via the command line (as JSON). + +:::tip + +For all key/value pairs passed to Java window functions, both the `key` and the `value` are `String`. To set the value to be a different type, you need to deserialize it from the `String` type. + +::: + +This example passes a key/value pair in a Java window function. 
+ +```bash + +bin/pulsar-admin functions create \ + --user-config '{"word-of-the-day":"verdure"}' \ + # Other function configs + +``` + +This example accesses values in a Java window function. + +The `UserConfigFunction` function logs the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The user config of `word-of-the-day` is changed **only** when the function is updated with a new config value via +multiple ways, such as the command line tool or REST API. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigWindowFunction implements WindowFunction { + @Override + public String process(Collection> input, WindowContext context) throws Exception { + Optional whatToWrite = context.getUserConfigValue("WhatToWrite"); + if (whatToWrite.get() != null) { + return (String)whatToWrite.get(); + } else { + return "Not a nice way"; + } + } + +} + +``` + +If no value is provided, you can access the entire user config map or set a default value. + +```java + +// Get the whole config map +Map allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); + +``` + +## Routing + +You can use the `context.publish()` interface to publish as many results as you want. + +This example shows that the `PublishFunction` class uses the built-in function in the context to publish messages to the `publishTopic` in a Java function. 
+ +```java + +public class PublishWindowFunction implements WindowFunction { + @Override + public Void process(Collection> input, WindowContext context) throws Exception { + String publishTopic = (String) context.getUserConfigValueOrDefault("publish-topic", "publishtopic"); + String output = String.format("%s!", input); + context.publish(publishTopic, output); + + return null; + } + +} + +``` + +## State storage + +Pulsar window functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Apache Pulsar installation (including the standalone installation) includes the deployment of BookKeeper bookies. + +Apache Pulsar integrates with Apache BookKeeper `table service` to store the `state` for functions. For example, the `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions state APIs. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data—counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function and shared between instances of that function. + +Currently, Pulsar window functions expose Java API to access, update, and manage states. These APIs are available in the context object when you use Java SDK functions. + +| Java API| Description +|---|--- +|`incrCounter`|Increases a built-in distributed counter referred by key. +|`getCounter`|Gets the counter value for the key. +|`putState`|Updates the state value for the key. + +You can use the following APIs to access, update, and manage states in Java window functions. + +#### incrCounter + +The `incrCounter` API increases a built-in distributed counter referred by key. + +Applications use the `incrCounter` API to change the counter of a given `key` by the given `amount`. If the `key` does not exist, a new key is created. 
+ +```java + + /** + * Increment the builtin distributed counter referred by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); + +``` + +#### getCounter + +The `getCounter` API gets the counter value for the key. + +Applications use the `getCounter` API to retrieve the counter of a given `key` changed by the `incrCounter` API. + +```java + + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); + +``` + +In addition to the counter APIs, Pulsar also exposes a general key/value API (`putState`) for functions to store general key/value state. + +#### putState + +The `putState` API updates the state value for the key. + +```java + + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); + +``` + +This example demonstrates how applications store states in Pulsar window functions. + +The logic of the `WordCountWindowFunction` is simple and straightforward. + +1. The function first splits the received string into multiple words using regex `\\.`. + +2. For each `word`, the function increments the corresponding `counter` by 1 via `incrCounter(key, amount)`.
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + for (Record input : inputs) { + Arrays.asList(input.getValue().split("\\.")).forEach(word -> context.incrCounter(word, 1)); + } + return null; + + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/about.md b/site2/website/versioned_docs/version-2.9.x/about.md new file mode 100644 index 0000000000000..b3b98832a0559 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/about.md @@ -0,0 +1,56 @@ +--- +slug: / +id: about +title: Welcome to the doc portal! +sidebar_label: "About" +--- + +import BlockLinks from "@site/src/components/BlockLinks"; +import BlockLink from "@site/src/components/BlockLink"; +import { docUrl } from "@site/src/utils/index"; + + +# Welcome to the doc portal! +*** + +This portal holds a variety of support documents to help you work with Pulsar . If you’re a beginner, there are tutorials and explainers to help you understand Pulsar and how it works. + +If you’re an experienced coder, review this page to learn the easiest way to access the specific content you’re looking for. + +## Get Started Now + + + + + + + + + +## Navigation +*** + +There are several ways to get around in the doc portal. The index navigation pane is a table of contents for the entire archive. The archive is divided into sections, like chapters in a book. Click the title of the topic to view it. + +In-context links provide an easy way to immediately reference related topics. Click the underlined term to view the topic. + +Links to related topics can be found at the bottom of each topic page. Click the link to view the topic. 
+ +![Page Linking](/assets/page-linking.png) + +## Continuous Improvement +*** +As you probably know, we are working on a new user experience for our documentation portal that will make learning about and building on top of Apache Pulsar a much better experience. Whether you need overview concepts, how-to procedures, curated guides or quick references, we’re building content to support it. This welcome page is just the first step. We will be providing updates every month. + +## Help Improve These Documents +*** + +You’ll notice an Edit button at the bottom and top of each page. Click it to open a landing page with instructions for requesting changes to posted documents. These are your resources. Participation is not only welcomed – it’s essential! + +## Join the Community! +*** + +The Pulsar community on github is active, passionate, and knowledgeable. Join discussions, voice opinions, suggest features, and dive into the code itself. Find your Pulsar family here at [apache/pulsar](https://github.com/apache/pulsar). + +An equally passionate community can be found in the [Pulsar Slack channel](https://apache-pulsar.slack.com/). You’ll need an invitation to join, but many Github Pulsar community members are Slack members too. Join, hang out, learn, and make some new friends. + diff --git a/site2/website/versioned_docs/version-2.9.x/adaptors-kafka.md b/site2/website/versioned_docs/version-2.9.x/adaptors-kafka.md new file mode 100644 index 0000000000000..ea256049710fd --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/adaptors-kafka.md @@ -0,0 +1,276 @@ +--- +id: adaptors-kafka +title: Pulsar adaptor for Apache Kafka +sidebar_label: "Kafka client wrapper" +original_id: adaptors-kafka +--- + + +Pulsar provides an easy option for applications that are currently written using the [Apache Kafka](http://kafka.apache.org) Java client API. 
+ +## Using the Pulsar Kafka compatibility wrapper + +In an existing application, change the regular Kafka client dependency and replace it with the Pulsar Kafka wrapper. Remove the following dependency in `pom.xml`: + +```xml + + + org.apache.kafka + kafka-clients + 0.10.2.1 + + +``` + +Then include this dependency for the Pulsar Kafka wrapper: + +```xml + + + org.apache.pulsar + pulsar-client-kafka + @pulsar:version@ + + +``` + +With the new dependency, the existing code works without any changes. You need to adjust the configuration, and make sure it points the +producers and consumers to Pulsar service rather than Kafka, and uses a particular +Pulsar topic. + +## Using the Pulsar Kafka compatibility wrapper together with existing kafka client + +When migrating from Kafka to Pulsar, the application might use the original kafka client +and the pulsar kafka wrapper together during migration. You should consider using the +unshaded pulsar kafka client wrapper. + +```xml + + + org.apache.pulsar + pulsar-client-kafka-original + @pulsar:version@ + + +``` + +When using this dependency, construct producers using `org.apache.kafka.clients.producer.PulsarKafkaProducer` +instead of `org.apache.kafka.clients.producer.KafkaProducer` and `org.apache.kafka.clients.producer.PulsarKafkaConsumer` for consumers. 
+ +## Producer example + +```java + +// Topic needs to be a regular Pulsar topic +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); + +props.put("key.serializer", IntegerSerializer.class.getName()); +props.put("value.serializer", StringSerializer.class.getName()); + +Producer producer = new KafkaProducer(props); + +for (int i = 0; i < 10; i++) { + producer.send(new ProducerRecord(topic, i, "hello-" + i)); + log.info("Message {} sent successfully", i); +} + +producer.close(); + +``` + +## Consumer example + +```java + +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); +props.put("group.id", "my-subscription-name"); +props.put("enable.auto.commit", "false"); +props.put("key.deserializer", IntegerDeserializer.class.getName()); +props.put("value.deserializer", StringDeserializer.class.getName()); + +Consumer consumer = new KafkaConsumer(props); +consumer.subscribe(Arrays.asList(topic)); + +while (true) { + ConsumerRecords records = consumer.poll(100); + records.forEach(record -> { + log.info("Received record: {}", record); + }); + + // Commit last offset + consumer.commitSync(); +} + +``` + +## Complete Examples + +You can find the complete producer and consumer examples [here](https://github.com/apache/pulsar-adapters/tree/master/pulsar-client-kafka-compat/pulsar-client-kafka-tests/src/test/java/org/apache/pulsar/client/kafka/compat/examples). + +## Compatibility matrix + +Currently the Pulsar Kafka wrapper supports most of the operations offered by the Kafka API. 
+ +### Producer + +APIs: + +| Producer Method | Supported | Notes | +|:------------------------------------------------------------------------------|:----------|:-------------------------------------------------------------------------| +| `Future send(ProducerRecord record)` | Yes | | +| `Future send(ProducerRecord record, Callback callback)` | Yes | | +| `void flush()` | Yes | | +| `List partitionsFor(String topic)` | No | | +| `Map metrics()` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | + +Properties: + +| Config property | Supported | Notes | +|:----------------------------------------|:----------|:------------------------------------------------------------------------------| +| `acks` | Ignored | Durability and quorum writes are configured at the namespace level | +| `auto.offset.reset` | Yes | It uses a default value of `earliest` if you do not give a specific setting. | +| `batch.size` | Ignored | | +| `bootstrap.servers` | Yes | | +| `buffer.memory` | Ignored | | +| `client.id` | Ignored | | +| `compression.type` | Yes | Allows `gzip` and `lz4`. No `snappy`. | +| `connections.max.idle.ms` | Yes | Only support up to 2,147,483,647,000(Integer.MAX_VALUE * 1000) ms of idle time| +| `interceptor.classes` | Yes | | +| `key.serializer` | Yes | | +| `linger.ms` | Yes | Controls the group commit time when batching messages | +| `max.block.ms` | Ignored | | +| `max.in.flight.requests.per.connection` | Ignored | In Pulsar ordering is maintained even with multiple requests in flight | +| `max.request.size` | Ignored | | +| `metric.reporters` | Ignored | | +| `metrics.num.samples` | Ignored | | +| `metrics.sample.window.ms` | Ignored | | +| `partitioner.class` | Yes | | +| `receive.buffer.bytes` | Ignored | | +| `reconnect.backoff.ms` | Ignored | | +| `request.timeout.ms` | Ignored | | +| `retries` | Ignored | Pulsar client retries with exponential backoff until the send timeout expires. 
| +| `send.buffer.bytes` | Ignored | | +| `timeout.ms` | Yes | | +| `value.serializer` | Yes | | + + +### Consumer + +The following table lists consumer APIs. + +| Consumer Method | Supported | Notes | +|:--------------------------------------------------------------------------------------------------------|:----------|:------| +| `Set assignment()` | No | | +| `Set subscription()` | Yes | | +| `void subscribe(Collection topics)` | Yes | | +| `void subscribe(Collection topics, ConsumerRebalanceListener callback)` | No | | +| `void assign(Collection partitions)` | No | | +| `void subscribe(Pattern pattern, ConsumerRebalanceListener callback)` | No | | +| `void unsubscribe()` | Yes | | +| `ConsumerRecords poll(long timeoutMillis)` | Yes | | +| `void commitSync()` | Yes | | +| `void commitSync(Map offsets)` | Yes | | +| `void commitAsync()` | Yes | | +| `void commitAsync(OffsetCommitCallback callback)` | Yes | | +| `void commitAsync(Map offsets, OffsetCommitCallback callback)` | Yes | | +| `void seek(TopicPartition partition, long offset)` | Yes | | +| `void seekToBeginning(Collection partitions)` | Yes | | +| `void seekToEnd(Collection partitions)` | Yes | | +| `long position(TopicPartition partition)` | Yes | | +| `OffsetAndMetadata committed(TopicPartition partition)` | Yes | | +| `Map metrics()` | No | | +| `List partitionsFor(String topic)` | No | | +| `Map> listTopics()` | No | | +| `Set paused()` | No | | +| `void pause(Collection partitions)` | No | | +| `void resume(Collection partitions)` | No | | +| `Map offsetsForTimes(Map timestampsToSearch)` | No | | +| `Map beginningOffsets(Collection partitions)` | No | | +| `Map endOffsets(Collection partitions)` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | +| `void wakeup()` | No | | + +Properties: + +| Config property | Supported | Notes | +|:--------------------------------|:----------|:------------------------------------------------------| +| `group.id` | Yes | Maps 
to a Pulsar subscription name | +| `max.poll.records` | Yes | | +| `max.poll.interval.ms` | Ignored | Messages are "pushed" from broker | +| `session.timeout.ms` | Ignored | | +| `heartbeat.interval.ms` | Ignored | | +| `bootstrap.servers` | Yes | Needs to point to a single Pulsar service URL | +| `enable.auto.commit` | Yes | | +| `auto.commit.interval.ms` | Ignored | With auto-commit, acks are sent immediately to broker | +| `partition.assignment.strategy` | Ignored | | +| `auto.offset.reset` | Yes | Only support earliest and latest. | +| `fetch.min.bytes` | Ignored | | +| `fetch.max.bytes` | Ignored | | +| `fetch.max.wait.ms` | Ignored | | +| `interceptor.classes` | Yes | | +| `metadata.max.age.ms` | Ignored | | +| `max.partition.fetch.bytes` | Ignored | | +| `send.buffer.bytes` | Ignored | | +| `receive.buffer.bytes` | Ignored | | +| `client.id` | Ignored | | + + +## Customize Pulsar configurations + +You can configure Pulsar authentication provider directly from the Kafka properties. + +### Pulsar client properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.authentication.class`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-org.apache.pulsar.client.api.Authentication-) | | Configure to auth provider. For example, `org.apache.pulsar.client.impl.auth.AuthenticationTls`.| +| [`pulsar.authentication.params.map`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.util.Map-) | | Map which represents parameters for the Authentication-Plugin. 
| +| [`pulsar.authentication.params.string`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.lang.String-) | | String which represents parameters for the Authentication-Plugin, for example, `key1:val1,key2:val2`. | +| [`pulsar.use.tls`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTls-boolean-) | `false` | Enable TLS transport encryption. | +| [`pulsar.tls.trust.certs.file.path`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsTrustCertsFilePath-java.lang.String-) | | Path for the TLS trust certificate store. | +| [`pulsar.tls.allow.insecure.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsAllowInsecureConnection-boolean-) | `false` | Accept self-signed certificates from brokers. | +| [`pulsar.operation.timeout.ms`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setOperationTimeout-int-java.util.concurrent.TimeUnit-) | `30000` | General operations timeout. | +| [`pulsar.stats.interval.seconds`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setStatsInterval-long-java.util.concurrent.TimeUnit-) | `60` | Pulsar client lib stats printing interval. | +| [`pulsar.num.io.threads`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setIoThreads-int-) | `1` | The number of Netty IO threads to use. | +| [`pulsar.connections.per.broker`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConnectionsPerBroker-int-) | `1` | The maximum number of connection to each broker. | +| [`pulsar.use.tcp.nodelay`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTcpNoDelay-boolean-) | `true` | TCP no-delay. 
| +| [`pulsar.concurrent.lookup.requests`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConcurrentLookupRequest-int-) | `50000` | The maximum number of concurrent topic lookups. | +| [`pulsar.max.number.rejected.request.per.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setMaxNumberOfRejectedRequestPerConnection-int-) | `50` | The threshold of errors to forcefully close a connection. | +| [`pulsar.keepalive.interval.ms`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientBuilder.html#keepAliveInterval-int-java.util.concurrent.TimeUnit-)| `30000` | Keep alive interval for each client-broker-connection. | + + +### Pulsar producer properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.producer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setProducerName-java.lang.String-) | | Specify the producer name. | +| [`pulsar.producer.initial.sequence.id`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setInitialSequenceId-long-) | | Specify baseline for sequence ID of this producer. | +| [`pulsar.producer.max.pending.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessages-int-) | `1000` | Set the maximum size of the message queue pending to receive an acknowledgment from the broker. | +| [`pulsar.producer.max.pending.messages.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessagesAcrossPartitions-int-) | `50000` | Set the maximum number of pending messages across all the partitions. 
| +| [`pulsar.producer.batching.enabled`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingEnabled-boolean-) | `true` | Control whether automatic batching of messages is enabled for the producer. | +| [`pulsar.producer.batching.max.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingMaxMessages-int-) | `1000` | The maximum number of messages in a batch. | +| [`pulsar.block.if.producer.queue.full`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBlockIfQueueFull-boolean-) | | Specify the block producer if queue is full. | +| [`pulsar.crypto.reader.factory.class.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setCryptoKeyReader-org.apache.pulsar.client.api.CryptoKeyReader-) | | Specify the CryptoReader-Factory(`CryptoKeyReaderFactory`) classname which allows producer to create CryptoKeyReader. | + + +### Pulsar consumer Properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.consumer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setConsumerName-java.lang.String-) | | Specify the consumer name. | +| [`pulsar.consumer.receiver.queue.size`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setReceiverQueueSize-int-) | 1000 | Set the size of the consumer receiver queue. | +| [`pulsar.consumer.acknowledgments.group.time.millis`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#acknowledgmentGroupTime-long-java.util.concurrent.TimeUnit-) | 100 | Set the maximum amount of group time for consumers to send the acknowledgments to the broker. 
| +| [`pulsar.consumer.total.receiver.queue.size.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setMaxTotalReceiverQueueSizeAcrossPartitions-int-) | 50000 | Set the maximum size of the total receiver queue across partitions. | +| [`pulsar.consumer.subscription.topics.mode`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#subscriptionTopicsMode-Mode-) | PersistentOnly | Set the subscription topic mode for consumers. | +| [`pulsar.crypto.reader.factory.class.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setCryptoKeyReader-org.apache.pulsar.client.api.CryptoKeyReader-) | | Specify the CryptoReader-Factory(`CryptoKeyReaderFactory`) classname which allows consumer to create CryptoKeyReader. | diff --git a/site2/website/versioned_docs/version-2.9.x/adaptors-spark.md b/site2/website/versioned_docs/version-2.9.x/adaptors-spark.md new file mode 100644 index 0000000000000..e14f13b5d4b07 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/adaptors-spark.md @@ -0,0 +1,91 @@ +--- +id: adaptors-spark +title: Pulsar adaptor for Apache Spark +sidebar_label: "Apache Spark" +original_id: adaptors-spark +--- + +## Spark Streaming receiver +The Spark Streaming receiver for Pulsar is a custom receiver that enables Apache [Spark Streaming](https://spark.apache.org/streaming/) to receive raw data from Pulsar. + +An application can receive data in [Resilient Distributed Dataset](https://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds) (RDD) format via the Spark Streaming receiver and can process it in a variety of ways. + +### Prerequisites + +To use the receiver, include a dependency for the `pulsar-spark` library in your Java configuration. 
+ +#### Maven + +If you're using Maven, add this to your `pom.xml`: + +```xml + + +@pulsar:version@ + + + + org.apache.pulsar + pulsar-spark + ${pulsar.version} + + +``` + +#### Gradle + +If you're using Gradle, add this to your `build.gradle` file: + +```groovy + +def pulsarVersion = "@pulsar:version@" + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-spark', version: pulsarVersion +} + +``` + +### Usage + +Pass an instance of `SparkStreamingPulsarReceiver` to the `receiverStream` method in `JavaStreamingContext`: + +```java + + String serviceUrl = "pulsar://localhost:6650/"; + String topic = "persistent://public/default/test_src"; + String subs = "test_sub"; + + SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("Pulsar Spark Example"); + + JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(60)); + + ConsumerConfigurationData pulsarConf = new ConsumerConfigurationData(); + + Set set = new HashSet(); + set.add(topic); + pulsarConf.setTopicNames(set); + pulsarConf.setSubscriptionName(subs); + + SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver( + serviceUrl, + pulsarConf, + new AuthenticationDisabled()); + + JavaReceiverInputDStream lineDStream = jsc.receiverStream(pulsarReceiver); + +``` + +For a complete example, click [here](https://github.com/apache/pulsar-adapters/blob/master/examples/spark/src/main/java/org/apache/spark/streaming/receiver/example/SparkStreamingPulsarReceiverExample.java). In this example, the number of messages that contain the string "Pulsar" in received messages is counted. + +Note that if needed, other Pulsar authentication classes can be used. 
For example, in order to use a token during authentication the following parameters for the `SparkStreamingPulsarReceiver` constructor can be set: + +```java + +SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver( + serviceUrl, + pulsarConf, + new AuthenticationToken("token:")); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/adaptors-storm.md b/site2/website/versioned_docs/version-2.9.x/adaptors-storm.md new file mode 100644 index 0000000000000..76d507164777d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/adaptors-storm.md @@ -0,0 +1,96 @@ +--- +id: adaptors-storm +title: Pulsar adaptor for Apache Storm +sidebar_label: "Apache Storm" +original_id: adaptors-storm +--- + +Pulsar Storm is an adaptor for integrating with [Apache Storm](http://storm.apache.org/) topologies. It provides core Storm implementations for sending and receiving data. + +An application can inject data into a Storm topology via a generic Pulsar spout, as well as consume data from a Storm topology via a generic Pulsar bolt. + +## Using the Pulsar Storm Adaptor + +Include dependency for Pulsar Storm Adaptor: + +```xml + + + org.apache.pulsar + pulsar-storm + ${pulsar.version} + + +``` + +## Pulsar Spout + +The Pulsar Spout allows for the data published on a topic to be consumed by a Storm topology. It emits a Storm tuple based on the message received and the `MessageToValuesMapper` provided by the client. + +The tuples that fail to be processed by the downstream bolts will be re-injected by the spout with an exponential backoff, within a configurable timeout (the default is 60 seconds) or a configurable number of retries, whichever comes first, after which it is acknowledged by the consumer. 
Here's an example construction of a spout: + +```java + +MessageToValuesMapper messageToValuesMapper = new MessageToValuesMapper() { + + @Override + public Values toValues(Message msg) { + return new Values(new String(msg.getData())); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + // declare the output fields + declarer.declare(new Fields("string")); + } +}; + +// Configure a Pulsar Spout +PulsarSpoutConfiguration spoutConf = new PulsarSpoutConfiguration(); +spoutConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650"); +spoutConf.setTopic("persistent://my-property/usw/my-ns/my-topic1"); +spoutConf.setSubscriptionName("my-subscriber-name1"); +spoutConf.setMessageToValuesMapper(messageToValuesMapper); + +// Create a Pulsar Spout +PulsarSpout spout = new PulsarSpout(spoutConf); + +``` + +For a complete example, click [here](https://github.com/apache/pulsar-adapters/blob/master/pulsar-storm/src/test/java/org/apache/pulsar/storm/PulsarSpoutTest.java). + +## Pulsar Bolt + +The Pulsar bolt allows data in a Storm topology to be published on a topic. It publishes messages based on the Storm tuple received and the `TupleToMessageMapper` provided by the client. + +A partitioned topic can also be used to publish messages on different topics. In the implementation of the `TupleToMessageMapper`, a "key" will need to be provided in the message which will send the messages with the same key to the same topic. 
Here's an example bolt: + +```java + +TupleToMessageMapper tupleToMessageMapper = new TupleToMessageMapper() { + + @Override + public TypedMessageBuilder toMessage(TypedMessageBuilder msgBuilder, Tuple tuple) { + String receivedMessage = tuple.getString(0); + // message processing + String processedMsg = receivedMessage + "-processed"; + return msgBuilder.value(processedMsg.getBytes()); + } + + @Override + public void declareOutputFields(OutputFieldsDeclarer declarer) { + // declare the output fields + } +}; + +// Configure a Pulsar Bolt +PulsarBoltConfiguration boltConf = new PulsarBoltConfiguration(); +boltConf.setServiceUrl("pulsar://broker.messaging.usw.example.com:6650"); +boltConf.setTopic("persistent://my-property/usw/my-ns/my-topic2"); +boltConf.setTupleToMessageMapper(tupleToMessageMapper); + +// Create a Pulsar Bolt +PulsarBolt bolt = new PulsarBolt(boltConf); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-brokers.md b/site2/website/versioned_docs/version-2.9.x/admin-api-brokers.md new file mode 100644 index 0000000000000..930fe69ecfb0e --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-brokers.md @@ -0,0 +1,286 @@ +--- +id: admin-api-brokers +title: Managing Brokers +sidebar_label: "Brokers" +original_id: admin-api-brokers +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/). +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. 
+> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +Pulsar brokers consist of two components: + +1. An HTTP server exposing a {@inject: rest:REST:/} interface for administration and [topic](reference-terminology.md#topic) lookup. +2. A dispatcher that handles all Pulsar [message](reference-terminology.md#message) transfers. + +[Brokers](reference-terminology.md#broker) can be managed via: + +* The `brokers` command of the [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) tool +* The `/admin/v2/brokers` endpoint of the admin {@inject: rest:REST:/} API +* The `brokers` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md) + +In addition to being configurable when you start them up, brokers can also be [dynamically configured](#dynamic-broker-configuration). + +> See the [Configuration](reference-configuration.md#broker) page for a full listing of broker-specific configuration parameters. + +## Brokers resources + +### List active brokers + +Fetch all active brokers that are serving traffic in the specified cluster. + +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers list use + +``` + +``` + +broker1.use.org.com:8080 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/:cluster|operation/getActiveBrokers?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getActiveBrokers(clusterName) + +``` + + + + +```` + +### Get the information of the leader broker + +Fetch the information of the leader broker, for example, the service URL.
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers leader-broker + +``` + +``` + +BrokerInfo(serviceUrl=broker1.use.org.com:8080) + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/leaderBroker|operation/getLeaderBroker?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getLeaderBroker() + +``` + +For the detail of the code above, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/BrokersImpl.java#L80) + + + + +```` + +#### list of namespaces owned by a given broker + +It finds all namespaces which are owned and served by a given broker. + +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers namespaces use \ + --url broker1.use.org.com:8080 + +``` + +```json + +{ + "my-property/use/my-ns/0x00000000_0xffffffff": { + "broker_assignment": "shared", + "is_controlled": false, + "is_active": true + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/:cluster/:broker/ownedNamespaces|operation/getOwnedNamespaes?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getOwnedNamespaces(cluster,brokerUrl); + +``` + + + + +```` + +### Dynamic broker configuration + +One way to configure a Pulsar [broker](reference-terminology.md#broker) is to supply a [configuration](reference-configuration.md#broker) when the broker is [started up](reference-cli-tools.md#pulsar-broker). + +But since all broker configuration in Pulsar is stored in ZooKeeper, configuration values can also be dynamically updated *while the broker is running*. When you update broker configuration dynamically, ZooKeeper will notify the broker of the change and the broker will then override any existing configuration values. 
+ +* The [`brokers`](reference-pulsar-admin.md#brokers) command for the [`pulsar-admin`](reference-pulsar-admin.md) tool has a variety of subcommands that enable you to manipulate a broker's configuration dynamically, enabling you to [update config values](#update-dynamic-configuration) and more. +* In the Pulsar admin {@inject: rest:REST:/} API, dynamic configuration is managed through the `/admin/v2/brokers/configuration` endpoint. + +### Update dynamic configuration + +````mdx-code-block + + + +The [`update-dynamic-config`](reference-pulsar-admin.md#brokers-update-dynamic-config) subcommand will update existing configuration. It takes two arguments: the name of the parameter and the new value using the `config` and `value` flag respectively. Here's an example for the [`brokerShutdownTimeoutMs`](reference-configuration.md#broker-brokerShutdownTimeoutMs) parameter: + +```shell + +$ pulsar-admin brokers update-dynamic-config --config brokerShutdownTimeoutMs --value 100 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/brokers/configuration/:configName/:configValue|operation/updateDynamicConfiguration?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().updateDynamicConfiguration(configName, configValue); + +``` + + + + +```` + +### List updated values + +Fetch a list of all potentially updatable configuration parameters. +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers list-dynamic-config +brokerShutdownTimeoutMs + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/configuration|operation/getDynamicConfigurationName?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getDynamicConfigurationNames(); + +``` + + + + +```` + +### List all + +Fetch a list of all parameters that have been dynamically updated. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin brokers get-all-dynamic-config +brokerShutdownTimeoutMs:100 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/brokers/configuration/values|operation/getAllDynamicConfigurations?version=@pulsar:version_number@} + + + + +```java + +admin.brokers().getAllDynamicConfigurations(); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-clusters.md b/site2/website/versioned_docs/version-2.9.x/admin-api-clusters.md new file mode 100644 index 0000000000000..1d0c5dc9786f5 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-clusters.md @@ -0,0 +1,318 @@ +--- +id: admin-api-clusters +title: Managing Clusters +sidebar_label: "Clusters" +original_id: admin-api-clusters +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +Pulsar clusters consist of one or more Pulsar [brokers](reference-terminology.md#broker), one or more [BookKeeper](reference-terminology.md#bookkeeper) +servers (aka [bookies](reference-terminology.md#bookie)), and a [ZooKeeper](https://zookeeper.apache.org) cluster that provides configuration and coordination management. 
+ +Clusters can be managed via: + +* The `clusters` command of the [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) tool +* The `/admin/v2/clusters` endpoint of the admin {@inject: rest:REST:/} API +* The `clusters` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md) + +## Clusters resources + +### Provision + +New clusters can be provisioned using the admin interface. + +> Please note that this operation requires superuser privileges. + +````mdx-code-block + + + +You can provision a new cluster using the [`create`](reference-pulsar-admin.md#clusters-create) subcommand. Here's an example: + +```shell + +$ pulsar-admin clusters create cluster-1 \ + --url http://my-cluster.org.com:8080 \ + --broker-url pulsar://my-cluster.org.com:6650 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/clusters/:cluster|operation/createCluster?version=@pulsar:version_number@} + + + + +```java + +ClusterData clusterData = new ClusterData( + serviceUrl, + serviceUrlTls, + brokerServiceUrl, + brokerServiceUrlTls +); +admin.clusters().createCluster(clusterName, clusterData); + +``` + + + + +```` + +### Initialize cluster metadata + +When you provision a new cluster, you need to initialize that cluster's [metadata](concepts-architecture-overview.md#metadata-store). When initializing cluster metadata, you need to specify all of the following: + +* The name of the cluster +* The local ZooKeeper connection string for the cluster +* The configuration store connection string for the entire instance +* The web service URL for the cluster +* A broker service URL enabling interaction with the [brokers](reference-terminology.md#broker) in the cluster + +You must initialize cluster metadata *before* starting up any [brokers](admin-api-brokers.md) that will belong to the cluster.
+ +> **No cluster metadata initialization through the REST API or the Java admin API** +> +> Unlike most other admin functions in Pulsar, cluster metadata initialization cannot be performed via the admin REST API +> or the admin Java client, as metadata initialization involves communicating with ZooKeeper directly. +> Instead, you can use the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool, in particular +> the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command. + +Here's an example cluster metadata initialization command: + +```shell + +bin/pulsar initialize-cluster-metadata \ + --cluster us-west \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2184 \ + --web-service-url http://pulsar.us-west.example.com:8080/ \ + --web-service-url-tls https://pulsar.us-west.example.com:8443/ \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/ + +``` + +You'll need to use `--*-tls` flags only if you're using [TLS authentication](security-tls-authentication.md) in your instance. + +### Get configuration + +You can fetch the [configuration](reference-configuration.md) for an existing cluster at any time. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#clusters-get) subcommand and specify the name of the cluster. Here's an example: + +```shell + +$ pulsar-admin clusters get cluster-1 +{ + "serviceUrl": "http://my-cluster.org.com:8080/", + "serviceUrlTls": null, + "brokerServiceUrl": "pulsar://my-cluster.org.com:6650/", + "brokerServiceUrlTls": null, + "peerClusterNames": null +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/clusters/:cluster|operation/getCluster?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().getCluster(clusterName); + +``` + + + + +```` + +### Update + +You can update the configuration for an existing cluster at any time.
+ +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#clusters-update) subcommand and specify new configuration values using flags. + +```shell + +$ pulsar-admin clusters update cluster-1 \ + --url http://my-cluster.org.com:4081 \ + --broker-url pulsar://my-cluster.org.com:3350 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/clusters/:cluster|operation/updateCluster?version=@pulsar:version_number@} + + + + +```java + +ClusterData clusterData = new ClusterData( + serviceUrl, + serviceUrlTls, + brokerServiceUrl, + brokerServiceUrlTls +); +admin.clusters().updateCluster(clusterName, clusterData); + +``` + + + + +```` + +### Delete + +Clusters can be deleted from a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#clusters-delete) subcommand and specify the name of the cluster. + +``` + +$ pulsar-admin clusters delete cluster-1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/clusters/:cluster|operation/deleteCluster?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().deleteCluster(clusterName); + +``` + + + + +```` + +### List + +You can fetch a list of all clusters in a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#clusters-list) subcommand. + +```shell + +$ pulsar-admin clusters list +cluster-1 +cluster-2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/clusters|operation/getClusters?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().getClusters(); + +``` + + + + +```` + +### Update peer-cluster data + +Peer clusters can be configured for a given cluster in a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`update-peer-clusters`](reference-pulsar-admin.md#clusters-update-peer-clusters) subcommand and specify the list of peer-cluster names. 
+ +``` + +$ pulsar-admin clusters update-peer-clusters cluster-1 --peer-clusters cluster-2 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/clusters/:cluster/peers|operation/setPeerClusterNames?version=@pulsar:version_number@} + + + + +```java + +admin.clusters().updatePeerClusterNames(clusterName, peerClusterList); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-functions.md b/site2/website/versioned_docs/version-2.9.x/admin-api-functions.md new file mode 100644 index 0000000000000..d73386caf9b41 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-functions.md @@ -0,0 +1,830 @@ +--- +id: admin-api-functions +title: Manage Functions +sidebar_label: "Functions" +original_id: admin-api-functions +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/). +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics +* apply a user-supplied processing logic to each message +* publish the results of the computation to another topic + +Functions can be managed via the following methods. + +Method | Description +---|--- +**Admin CLI** | The `functions` command of the [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) tool.
+**REST API** |The `/admin/v3/functions` endpoint of the admin {@inject: rest:REST:/} API. +**Java Admin API**| The `functions` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md). + +## Function resources + +You can perform the following operations on functions. + +### Create a function + +You can create a Pulsar function in cluster mode (deploy it on a Pulsar cluster) using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#functions-create) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --inputs test-input-topic \ + --output persistent://public/default/test-output-topic \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --jar /examples/api-examples.jar + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName|operation/registerFunction?version=@pulsar:version_number@} + + + + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +functionConfig.setProcessingGuarantees(FunctionConfig.ProcessingGuarantees.ATLEAST_ONCE); +functionConfig.setTopicsPattern(sourceTopicPattern); +functionConfig.setSubName(subscriptionName); +functionConfig.setAutoAck(true); +functionConfig.setOutput(sinkTopic); +admin.functions().createFunction(functionConfig, fileName); + +``` + + + + +```` + +### Update a function + +You can update a Pulsar function that has been deployed to a Pulsar cluster using Admin CLI, REST API or Java Admin API. 
+ +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#functions-update) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions update \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --output persistent://public/default/update-output-topic \ + # other options + +``` + + + + +{@inject: endpoint|PUT|/admin/v3/functions/:tenant/:namespace/:functionName|operation/updateFunction?version=@pulsar:version_number@} + + + + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +UpdateOptions updateOptions = new UpdateOptions(); +updateOptions.setUpdateAuthData(updateAuthData); +admin.functions().updateFunction(functionConfig, userCodeFile, updateOptions); + +``` + + + + +```` + +### Start an instance of a function + +You can start a stopped function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. + +```shell + +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/start|operation/startFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().startFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Start all instances of a function + +You can start all stopped function instances using Admin CLI, REST API or Java Admin API. 
+ +````mdx-code-block + + + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/start|operation/startFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().startFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Stop an instance of a function + +You can stop a function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/stop|operation/stopFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().stopFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Stop all instances of a function + +You can stop all function instances using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/stop|operation/stopFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().stopFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Restart an instance of a function + +You can restart a function instance with `instance-id` using Admin CLI, REST API or Java Admin API.
+ +````mdx-code-block + + + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/restart|operation/restartFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().restartFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Restart all instances of a function + +You can restart all function instances using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/restart|operation/restartFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().restartFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### List all functions + +You can list all Pulsar functions running under a specific tenant and namespace using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#functions-list) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions list \ + --tenant public \ + --namespace default + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace|operation/listFunctions?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctions(tenant, namespace); + +``` + + + + +```` + +### Delete a function + +You can delete a Pulsar function that is running on a Pulsar cluster using Admin CLI, REST API or Java Admin API.
+ +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#functions-delete) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions delete \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|DELETE|/admin/v3/functions/:tenant/:namespace/:functionName|operation/deregisterFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().deleteFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Get info about a function + +You can get information about a Pulsar function currently running in cluster mode using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#functions-get) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName|operation/getFunctionInfo?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunction(tenant, namespace, functionName); + +``` + + + + +```` + +### Get status of an instance of a function + +You can get the current status of a Pulsar function instance with `instance-id` using Admin CLI, REST API or Java Admin API. +````mdx-code-block + + + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/status|operation/getFunctionInstanceStatus?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStatus(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Get status of all instances of a function + +You can get the current status of a Pulsar function instance using Admin CLI, REST API or Java Admin API. + +````mdx-code-block + + + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/status|operation/getFunctionStatus?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStatus(tenant, namespace, functionName); + +``` + + + + +```` + +### Get stats of an instance of a function + +You can get the current stats of a Pulsar Function instance with `instance-id` using Admin CLI, REST API or Java admin API. +````mdx-code-block + + + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/:instanceId/stats|operation/getFunctionInstanceStats?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStats(tenant, namespace, functionName, Integer.parseInt(instanceId)); + +``` + + + + +```` + +### Get stats of all instances of a function + +You can get the current stats of a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/stats|operation/getFunctionStats?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionStats(tenant, namespace, functionName); + +``` + + + + +```` + +### Trigger a function + +You can trigger a specified Pulsar function with a supplied value using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`trigger`](reference-pulsar-admin.md#functions-trigger) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --topic (the name of input topic) \ + --trigger-value \"hello pulsar\" + # or --trigger-file (the path of trigger file) + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/trigger|operation/triggerFunction?version=@pulsar:version_number@} + + + + +```java + +admin.functions().triggerFunction(tenant, namespace, functionName, topic, triggerValue, triggerFile); + +``` + + + + +```` + +### Put state associated with a function + +You can put the state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`putstate`](reference-pulsar-admin.md#functions-putstate) subcommand. + +**Example** + +```shell + +$ pulsar-admin functions putstate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --state "{\"key\":\"pulsar\", \"stringValue\":\"hello pulsar\"}" + +``` + + + + +{@inject: endpoint|POST|/admin/v3/functions/:tenant/:namespace/:functionName/state/:key|operation/putFunctionState?version=@pulsar:version_number@} + + + + +```java + +TypeReference typeRef = new TypeReference() {}; +FunctionState stateRepr = ObjectMapperFactory.getThreadLocal().readValue(state, typeRef); +admin.functions().putFunctionState(tenant, namespace, functionName, stateRepr); + +``` + + + + +```` + +### Fetch state associated with a function + +You can fetch the current state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +````mdx-code-block + + + +Use the [`querystate`](reference-pulsar-admin.md#functions-querystate) subcommand. 
+ +**Example** + +```shell + +$ pulsar-admin functions querystate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --key (the key of state) + +``` + + + + +{@inject: endpoint|GET|/admin/v3/functions/:tenant/:namespace/:functionName/state/:key|operation/getFunctionState?version=@pulsar:version_number@} + + + + +```java + +admin.functions().getFunctionState(tenant, namespace, functionName, key); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-namespaces.md b/site2/website/versioned_docs/version-2.9.x/admin-api-namespaces.md new file mode 100644 index 0000000000000..fa6d9efe251ab --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-namespaces.md @@ -0,0 +1,1267 @@ +--- +id: admin-api-namespaces +title: Managing Namespaces +sidebar_label: "Namespaces" +original_id: admin-api-namespaces +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/). +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +Pulsar [namespaces](reference-terminology.md#namespace) are logical groupings of [topics](reference-terminology.md#topic). 
+ +Namespaces can be managed via: + +* The `namespaces` command of the [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) tool +* The `/admin/v2/namespaces` endpoint of the admin {@inject: rest:REST:/} API +* The `namespaces` method of the `PulsarAdmin` object in the [Java API](client-libraries-java.md) + +## Namespaces resources + +### Create namespaces + +You can create new namespaces under a given [tenant](reference-terminology.md#tenant). + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#namespaces-create) subcommand and specify the namespace by name: + +```shell + +$ pulsar-admin namespaces create test-tenant/test-namespace + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace|operation/createNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().createNamespace(namespace); + +``` + + + + +```` + +### Get policies + +You can fetch the current policies associated with a namespace at any time. + +````mdx-code-block + + + +Use the [`policies`](reference-pulsar-admin.md#namespaces-policies) subcommand and specify the namespace: + +```shell + +$ pulsar-admin namespaces policies test-tenant/test-namespace +{ + "auth_policies": { + "namespace_auth": {}, + "destination_auth": {} + }, + "replication_clusters": [], + "bundles_activated": true, + "bundles": { + "boundaries": [ + "0x00000000", + "0xffffffff" + ], + "numBundles": 1 + }, + "backlog_quota_map": {}, + "persistence": null, + "latency_stats_sample_rate": {}, + "message_ttl_in_seconds": 0, + "retention_policies": null, + "deleted": false +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace|operation/getPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPolicies(namespace); + +``` + + + + +```` + +### List namespaces + +You can list all namespaces within a given Pulsar [tenant](reference-terminology.md#tenant). 
+ +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#namespaces-list) subcommand and specify the tenant: + +```shell + +$ pulsar-admin namespaces list test-tenant +test-tenant/ns1 +test-tenant/ns2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant|operation/getTenantNamespaces?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaces(tenant); + +``` + + + + +```` + +### Delete namespaces + +You can delete existing namespaces from a tenant. + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#namespaces-delete) subcommand and specify the namespace: + +```shell + +$ pulsar-admin namespaces delete test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace|operation/deleteNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().deleteNamespace(namespace); + +``` + + + + +```` + +### Configure replication clusters + +#### Set replication cluster + +You can set replication clusters for a namespace to enable Pulsar to internally replicate the published messages from one colocation facility to another. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-clusters test-tenant/ns1 \ + --clusters cl1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/replication|operation/setNamespaceReplicationClusters?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceReplicationClusters(namespace, clusters); + +``` + + + + +```` + +#### Get replication cluster + +You can get the list of replication clusters for a given namespace. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-clusters test-tenant/cl1/ns1 + +``` + +``` + +cl2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/replication|operation/getNamespaceReplicationClusters?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceReplicationClusters(namespace) + +``` + + + + +```` + +### Configure backlog quota policies + +#### Set backlog quota policies + +Backlog quota helps the broker to restrict bandwidth/storage of a namespace once it reaches a certain threshold limit. An admin can set the limit and the corresponding action to take after the limit is reached. + + 1. producer_request_hold: the broker holds but does not persist the produce request payload + + 2. producer_exception: the broker disconnects from the client by throwing an exception + + 3. consumer_backlog_eviction: the broker starts discarding backlog messages + +Backlog quota restriction can be taken care of by defining a restriction of backlog-quota-type: destination_storage. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-backlog-quota --limit 10G --limitTime 36000 --policy producer_request_hold test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/setBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setBacklogQuota(namespace, new BacklogQuota(limit, limitTime, policy)) + +``` + + + + +```` + +#### Get backlog quota policies + +You can get a configured backlog quota for a given namespace.
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-backlog-quotas test-tenant/ns1 + +``` + +```json + +{ + "destination_storage": { + "limit": 10, + "policy": "producer_request_hold" + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getBacklogQuotaMap(namespace); + +``` + + + + +```` + +#### Remove backlog quota policies + +You can remove backlog quota policies for a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-backlog-quota test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/removeBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeBacklogQuota(namespace, backlogQuotaType) + +``` + + + + +```` + +### Configure persistence policies + +#### Set persistence policies + +Persistence policies allow users to configure persistency-level for all topic messages under a given namespace. 
+ + - Bookkeeper-ack-quorum: Number of acks (guaranteed copies) to wait for each entry, default: 0 + + - Bookkeeper-ensemble: Number of bookies to use for a topic, default: 0 + + - Bookkeeper-write-quorum: How many writes to make of each entry, default: 0 + + - Ml-mark-delete-max-rate: Throttling rate of mark-delete operation (0 means no throttle), default: 0.0 + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-persistence --bookkeeper-ack-quorum 2 --bookkeeper-ensemble 3 --bookkeeper-write-quorum 2 --ml-mark-delete-max-rate 0 test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/setPersistence?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setPersistence(namespace,new PersistencePolicies(bookkeeperEnsemble, bookkeeperWriteQuorum,bookkeeperAckQuorum,managedLedgerMaxMarkDeleteRate)) + +``` + + + + +```` + +#### Get persistence policies + +You can get the configured persistence policies of a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-persistence test-tenant/ns1 + +``` + +```json + +{ + "bookkeeperEnsemble": 3, + "bookkeeperWriteQuorum": 2, + "bookkeeperAckQuorum": 2, + "managedLedgerMaxMarkDeleteRate": 0 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/getPersistence?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPersistence(namespace) + +``` + + + + +```` + +### Configure namespace bundles + +#### Unload namespace bundles + +The namespace bundle is a virtual group of topics which belong to the same namespace. If the broker gets overloaded with the number of bundles, this command can help unload a bundle from that broker, so it can be served by some other less-loaded brokers. The namespace bundle ID ranges from 0x00000000 to 0xffffffff. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces unload --bundle 0x00000000_0xffffffff test-tenant/ns1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/:bundle/unload|operation/unloadNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().unloadNamespaceBundle(namespace, bundle) + +``` + + + + +```` + +#### Split namespace bundles + +One namespace bundle can contain multiple topics but can be served by only one broker. If a single bundle is creating an excessive load on a broker, an admin can split the bundle using the command below, permitting one or more of the new bundles to be unloaded, thus balancing the load across the brokers. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces split-bundle --bundle 0x00000000_0xffffffff test-tenant/ns1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/:bundle/split|operation/splitNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().splitNamespaceBundle(namespace, bundle) + +``` + + + + +```` + +### Configure message TTL + +#### Set message-ttl + +You can configure the time-to-live (TTL) duration, in seconds, for messages. In the example below, the message-ttl is set to 100s. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-message-ttl --messageTTL 100 test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/setNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceMessageTTL(namespace, messageTTL) + +``` + + + + +```` + +#### Get message-ttl + +When the message-ttl for a namespace is set, you can use the command below to get the configured value. This example continues the example of the command `set-message-ttl`, so the returned value is 100(s).
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-message-ttl test-tenant/ns1 + +``` + +``` + +100 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/getNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceMessageTTL(namespace) + +``` + +``` + +100 + +``` + + + + +```` + +#### Remove message-ttl + +Remove a message TTL of the configured namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-message-ttl test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/removeNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeNamespaceMessageTTL(namespace) + +``` + + + + +```` + + +### Clear backlog + +#### Clear namespace backlog + +It clears all message backlog for all the topics that belong to a specific namespace. You can also clear backlog for a specific subscription as well. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces clear-backlog --sub my-subscription test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/clearBacklog|operation/clearNamespaceBacklogForSubscription?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().clearNamespaceBacklogForSubscription(namespace, subscription) + +``` + + + + +```` + +#### Clear bundle backlog + +It clears all message backlog for all the topics that belong to a specific NamespaceBundle. You can also clear backlog for a specific subscription as well. 
+ +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces clear-backlog --bundle 0x00000000_0xffffffff --sub my-subscription test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/:bundle/clearBacklog|operation/clearNamespaceBundleBacklogForSubscription?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().clearNamespaceBundleBacklogForSubscription(namespace, bundle, subscription) + +``` + + + + +```` + +### Configure retention + +#### Set retention + +Each namespace contains multiple topics and the retention size (storage size) of each topic should not exceed a specific threshold or it should be stored for a certain period. This command helps configure the retention size and time of topics in a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-retention --size 100 --time 10 test-tenant/ns1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/retention|operation/setRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setRetention(namespace, new RetentionPolicies(retentionTimeInMin, retentionSizeInMB)) + +``` + + + + +```` + +#### Get retention + +It shows retention information of a given namespace. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-retention test-tenant/ns1 + +``` + +```json + +{ + "retentionTimeInMinutes": 10, + "retentionSizeInMB": 100 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/retention|operation/getRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getRetention(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for topics + +#### Set dispatch throttling for topics + +It sets message dispatch rate for all the topics under a given namespace. 
+The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X second (`byte-dispatch-rate`). +dispatch rate is in second and it can be configured with `dispatch-rate-period`. Default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1 which +disables the throttling. + +:::note + +- If neither `clusterDispatchRate` nor `topicDispatchRate` is configured, dispatch throttling is disabled. +- If `topicDispatchRate` is not configured, `clusterDispatchRate` takes effect. +- If `topicDispatchRate` is configured, `topicDispatchRate` takes effect. + +::: + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/dispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for topics + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/dispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getDispatchRate(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for subscription + +#### Set dispatch throttling for subscription + +It sets message dispatch rate for all the subscription of topics under a given namespace. 
+The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X second (`byte-dispatch-rate`). +dispatch rate is in second and it can be configured with `dispatch-rate-period`. Default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1 which +disables the throttling. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-subscription-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/subscriptionDispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setSubscriptionDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for subscription + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-subscription-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/subscriptionDispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getSubscriptionDispatchRate(namespace) + +``` + + + + +```` + +### Configure dispatch throttling for replicator + +#### Set dispatch throttling for replicator + +It sets message dispatch rate for all the replicator between replication clusters under a given namespace. +The dispatch rate can be restricted by the number of messages per X seconds (`msg-dispatch-rate`) or by the number of message-bytes per X second (`byte-dispatch-rate`). 
+dispatch rate is in second and it can be configured with `dispatch-rate-period`. Default value of `msg-dispatch-rate` and `byte-dispatch-rate` is -1 which +disables the throttling. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-replicator-dispatch-rate test-tenant/ns1 \ + --msg-dispatch-rate 1000 \ + --byte-dispatch-rate 1048576 \ + --dispatch-rate-period 1 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/replicatorDispatchRate|operation/setDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setReplicatorDispatchRate(namespace, new DispatchRate(1000, 1048576, 1)) + +``` + + + + +```` + +#### Get configured message-rate for replicator + +It shows configured message-rate for the namespace (topics under this namespace can dispatch this many messages per second) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-replicator-dispatch-rate test-tenant/ns1 + +``` + +```json + +{ + "dispatchThrottlingRatePerTopicInMsg" : 1000, + "dispatchThrottlingRatePerTopicInByte" : 1048576, + "ratePeriodInSecond" : 1 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/replicatorDispatchRate|operation/getDispatchRate?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getReplicatorDispatchRate(namespace) + +``` + + + + +```` + +### Configure deduplication snapshot interval + +#### Get deduplication snapshot interval + +It shows configured `deduplicationSnapshotInterval` for a namespace (Each topic under the namespace will take a deduplication snapshot according to this interval) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces get-deduplication-snapshot-interval test-tenant/ns1 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/getDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + 
+admin.namespaces().getDeduplicationSnapshotInterval(namespace) + +``` + + + + +```` + +#### Set deduplication snapshot interval + +Set configured `deduplicationSnapshotInterval` for a namespace. Each topic under the namespace will take a deduplication snapshot according to this interval. +`brokerDeduplicationEnabled` must be set to `true` for this property to take effect. + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces set-deduplication-snapshot-interval test-tenant/ns1 --interval 1000 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/setDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setDeduplicationSnapshotInterval(namespace, 1000) + +``` + + + + +```` + +#### Remove deduplication snapshot interval + +Remove configured `deduplicationSnapshotInterval` of a namespace (Each topic under the namespace will take a deduplication snapshot according to this interval) + +````mdx-code-block + + + +``` + +$ pulsar-admin namespaces remove-deduplication-snapshot-interval test-tenant/ns1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/deduplicationSnapshotInterval|operation/deleteDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeDeduplicationSnapshotInterval(namespace) + +``` + + + + +```` + +### Namespace isolation + +You can use the [Pulsar isolation policy](administration-isolation.md) to allocate resources (broker and bookie) for a namespace. + +### Unload namespaces from a broker + +You can unload a namespace, or a [namespace bundle](reference-terminology.md#namespace-bundle), from the Pulsar [broker](reference-terminology.md#broker) that is currently responsible for it. + +#### pulsar-admin + +Use the [`unload`](reference-pulsar-admin.md#unload) subcommand of the [`namespaces`](reference-pulsar-admin.md#namespaces) command. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin namespaces unload my-tenant/my-ns + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/namespaces/:tenant/:namespace/unload|operation/unloadNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().unload(namespace) + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-non-partitioned-topics.md b/site2/website/versioned_docs/version-2.9.x/admin-api-non-partitioned-topics.md new file mode 100644 index 0000000000000..e6347bb8c363a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-non-partitioned-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-non-partitioned-topics +title: Managing non-partitioned topics +sidebar_label: "Non-partitioned topics" +original_id: admin-api-non-partitioned-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-non-persistent-topics.md b/site2/website/versioned_docs/version-2.9.x/admin-api-non-persistent-topics.md new file mode 100644 index 0000000000000..3126a6494c715 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-non-persistent-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-non-persistent-topics +title: Managing non-persistent topics +sidebar_label: "Non-Persistent topics" +original_id: admin-api-non-persistent-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-overview.md b/site2/website/versioned_docs/version-2.9.x/admin-api-overview.md new file mode 100644 index 0000000000000..1154c625aff7b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-overview.md @@ -0,0 +1,144 @@ +--- +id: admin-api-overview +title: Pulsar admin interface +sidebar_label: "Overview" +original_id: admin-api-overview +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +The Pulsar admin interface enables you to manage all important entities in a Pulsar instance, such as tenants, topics, and namespaces. + +You can interact with the admin interface via: + +- The `pulsar-admin` CLI tool, which is available in the `bin` folder of your Pulsar installation: + + ```shell + + bin/pulsar-admin + + ``` + + > **Important** + > + > For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/). + +- HTTP calls, which are made against the admin {@inject: rest:REST:/} API provided by Pulsar brokers. For some RESTful APIs, they might be redirected to the owner brokers for serving with [`307 Temporary Redirect`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/307), hence the HTTP callers should handle `307 Temporary Redirect`. If you use `curl` commands, you should specify `-L` to handle redirections. + + > **Important** + > + > For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. + +- A Java client interface. + + > **Important** + > + > For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +> **The REST API is the admin interface**. 
Both the `pulsar-admin` CLI tool and the Java client use the REST API. If you implement your own admin interface client, you should use the REST API. + +## Admin setup + +Each of the three admin interfaces (the `pulsar-admin` CLI tool, the {@inject: rest:REST:/} API, and the [Java admin API](/api/admin)) requires some special setup if you have enabled authentication in your Pulsar instance. + +````mdx-code-block + + + +If you have enabled authentication, you need to provide an auth configuration to use the `pulsar-admin` tool. By default, the configuration for the `pulsar-admin` tool is in the [`conf/client.conf`](reference-configuration.md#client) file. The following are the available parameters: + +|Name|Description|Default| +|----|-----------|-------| +|webServiceUrl|The web URL for the cluster.|http://localhost:8080/| +|brokerServiceUrl|The Pulsar protocol URL for the cluster.|pulsar://localhost:6650/| +|authPlugin|The authentication plugin.| | +|authParams|The authentication parameters for the cluster, as a comma-separated string.| | +|useTls|Whether or not TLS authentication will be enforced in the cluster.|false| +|tlsAllowInsecureConnection|Accept untrusted TLS certificate from client.|false| +|tlsTrustCertsFilePath|Path for the trusted TLS certificate file.| | + + + + +You can find details for the REST API exposed by Pulsar brokers in this {@inject: rest:document:/}. + + + + +To use the Java admin API, instantiate a {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object, and specify a URL for a Pulsar broker and a {@inject: javadoc:PulsarAdminBuilder:/admin/org/apache/pulsar/client/admin/PulsarAdminBuilder}. 
The following is a minimal example using `localhost`: + +```java + +String url = "http://localhost:8080"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); + +``` + +If you use multiple brokers, you can use multi-host like Pulsar service. For example, + +```java + +String url = "http://localhost:8080,localhost:8081,localhost:8082"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); + +``` + + + + +```` + +## How to define Pulsar resource names when running Pulsar in Kubernetes +If you run Pulsar Functions or connectors on Kubernetes, you need to follow Kubernetes naming convention to define the names of your Pulsar resources, whichever admin interface you use. + +Kubernetes requires a name that can be used as a DNS subdomain name as defined in [RFC 1123](https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names). Pulsar supports more legal characters than Kubernetes naming convention. 
If you create a Pulsar resource name with special characters that are not supported by Kubernetes (for example, including colons in a Pulsar namespace name), Kubernetes runtime translates the Pulsar object names into Kubernetes resource labels which are in RFC 1123-compliant forms. Consequently, you can run functions or connectors using Kubernetes runtime. The rules for translating Pulsar object names into Kubernetes resource labels are as below: + +- Truncate to 63 characters + +- Replace the following characters with dashes (-): + + - Non-alphanumeric characters + + - Underscores (_) + + - Dots (.) + +- Replace beginning and ending non-alphanumeric characters with 0 + +:::tip + +- If you get an error in translating Pulsar object names into Kubernetes resource labels (for example, you may have a naming collision if your Pulsar object name is too long) or want to customize the translating rules, see [customize Kubernetes runtime](https://pulsar.apache.org/docs/en/next/functions-runtime/#customize-kubernetes-runtime). +- For how to configure Kubernetes runtime, see [here](https://pulsar.apache.org/docs/en/next/functions-runtime/#configure-kubernetes-runtime). + +::: + diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-packages.md b/site2/website/versioned_docs/version-2.9.x/admin-api-packages.md new file mode 100644 index 0000000000000..2852fb74a02be --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-packages.md @@ -0,0 +1,391 @@ +--- +id: admin-api-packages +title: Manage packages +sidebar_label: "Packages" +original_id: admin-api-packages +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. 
+> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +Package management enables version management and simplifies the upgrade and rollback processes for Functions, Sinks, and Sources. When you use the same function, sink and source in different namespaces, you can upload them to a common package management system. + +## Package name + +A `package` is identified by five parts: `type`, `tenant`, `namespace`, `package name`, and `version`. + +| Part | Description | +|-------|-------------| +|`type` |The type of the package. The following types are supported: `function`, `sink` and `source`. | +| `name`|The fully qualified name of the package: `<tenant>/<namespace>/<package name>`.| +|`version`|The version of the package.| + +The following is a code sample. + +```java + +class PackageName { + private final PackageType type; + private final String namespace; + private final String tenant; + private final String name; + private final String version; +} + +enum PackageType { + FUNCTION("function"), SINK("sink"), SOURCE("source"); +} + +``` + +## Package URL +A package is located using a URL. The package URL is written in the following format: + +```shell + +<type>://<tenant>/<namespace>/<package name>@<version> + +``` + +The following are package URL examples: + +`sink://public/default/mysql-sink@1.0` +`function://my-tenant/my-ns/my-function@0.1` +`source://my-tenant/my-ns/mysql-cdc-source@2.3` + +The package management system stores the data, versions and metadata of each package. The metadata is shown in the following table.
+ +| metadata | Description | +|----------|-------------| +|description|The description of the package.| +|contact |The contact information of a package. For example, team email.| +|create_time| The time when the package is created.| +|modification_time| The time when the package is modified.| +|properties |A key/value map that stores your own information.| + +## Permissions + +The packages are organized by the tenant and namespace, so you can apply the tenant and namespace permissions to packages directly. + +## Package resources +You can use the package management with command line tools, REST API and Java client. + +### Upload a package +You can upload a package to the package management service in the following ways. + +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages upload function://public/default/example@v0.1 --path package-file --description package-description + +``` + + + + +{@inject: endpoint|POST|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/upload?version=@pulsar:version_number@} + + + + +Upload a package to the package management service synchronously. + +```java + + void upload(PackageMetadata metadata, String packageName, String path) throws PulsarAdminException; + +``` + +Upload a package to the package management service asynchronously. + +```java + + CompletableFuture<Void> uploadAsync(PackageMetadata metadata, String packageName, String path); + +``` + + + + +```` + +### Download a package +You can download a package from the package management service in the following ways. + +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages download function://public/default/example@v0.1 --path package-file + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/download?version=@pulsar:version_number@} + + + + +Download a package from the package management service synchronously.
+ +```java + + void download(String packageName, String path) throws PulsarAdminException; + +``` + +Download a package from the package management service asynchronously. + +```java + + CompletableFuture<Void> downloadAsync(String packageName, String path); + +``` + + + + +```` + +### List all versions of a package +You can get a list of all versions of a package in the following ways. +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages list-versions function://public/default/example + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName|operation/listPackageVersion?version=@pulsar:version_number@} + + + + +List all versions of a package synchronously. + +```java + + List<String> listPackageVersions(String packageName) throws PulsarAdminException; + +``` + +List all versions of a package asynchronously. + +```java + + CompletableFuture<List<String>> listPackageVersionsAsync(String packageName); + +``` + + + + +```` + +### List all the specified type packages under a namespace +You can get a list of all the packages with the given type in a namespace in the following ways. +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages list --type function public/default + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace|operation/listPackages?version=@pulsar:version_number@} + + + + +List all the packages with the given type in a namespace synchronously. + +```java + + List<String> listPackages(String type, String namespace) throws PulsarAdminException; + +``` + +List all the packages with the given type in a namespace asynchronously. + +```java + + CompletableFuture<List<String>> listPackagesAsync(String type, String namespace); + +``` + + + + +```` + +### Get the metadata of a package +You can get the metadata of a package in the following ways.
+ +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages get-metadata function://public/default/test@v1 + +``` + + + + +{@inject: endpoint|GET|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version/metadata|operation/getMeta?version=@pulsar:version_number@} + + + + +Get the metadata of a package synchronously. + +```java + + PackageMetadata getMetadata(String packageName) throws PulsarAdminException; + +``` + +Get the metadata of a package asynchronously. + +```java + + CompletableFuture<PackageMetadata> getMetadataAsync(String packageName); + +``` + + + + +```` + +### Update the metadata of a package +You can update the metadata of a package in the following ways. +````mdx-code-block + + + +```shell + +bin/pulsar-admin packages update-metadata function://public/default/example@v0.1 --description update-description + +``` + + + + +{@inject: endpoint|PUT|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version/metadata|operation/updateMeta?version=@pulsar:version_number@} + + + + +Update the metadata of a package synchronously. + +```java + + void updateMetadata(String packageName, PackageMetadata metadata) throws PulsarAdminException; + +``` + +Update the metadata of a package asynchronously. + +```java + + CompletableFuture<Void> updateMetadataAsync(String packageName, PackageMetadata metadata); + +``` + + + + +```` + +### Delete a specified package +You can delete a specified package with its package name in the following ways. + +````mdx-code-block + + + +The following command example deletes a package of version 0.1. + +```shell + +bin/pulsar-admin packages delete function://public/default/example@v0.1 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v3/packages/:type/:tenant/:namespace/:packageName/:version|operation/delete?version=@pulsar:version_number@} + + + + +Delete a specified package synchronously. + +```java + + void delete(String packageName) throws PulsarAdminException; + +``` + +Delete a specified package asynchronously.
+ +```java + + CompletableFuture deleteAsync(String packageName); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-partitioned-topics.md b/site2/website/versioned_docs/version-2.9.x/admin-api-partitioned-topics.md new file mode 100644 index 0000000000000..5ce182282e032 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-partitioned-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-partitioned-topics +title: Managing partitioned topics +sidebar_label: "Partitioned topics" +original_id: admin-api-partitioned-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-permissions.md b/site2/website/versioned_docs/version-2.9.x/admin-api-permissions.md new file mode 100644 index 0000000000000..2496c9be54eb2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-permissions.md @@ -0,0 +1,184 @@ +--- +id: admin-api-permissions +title: Managing permissions +sidebar_label: "Permissions" +original_id: admin-api-permissions +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). 
+ +Permissions in Pulsar are managed at the [namespace](reference-terminology.md#namespace) level +(that is, within [tenants](reference-terminology.md#tenant) and [clusters](reference-terminology.md#cluster)). + +## Grant permissions + +You can grant permissions to specific roles for lists of operations such as `produce` and `consume`. + +````mdx-code-block + + + +Use the [`grant-permission`](reference-pulsar-admin.md#grant-permission) subcommand and specify a namespace, actions using the `--actions` flag, and a role using the `--role` flag: + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role admin10 + +``` + +Wildcard authorization can be performed when `authorizationAllowWildcardsMatching` is set to `true` in `broker.conf`. + +e.g. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role 'my.role.*' + +``` + +Then, roles `my.role.1`, `my.role.2`, `my.role.foo`, `my.role.bar`, etc. can produce and consume. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role '*.role.my' + +``` + +Then, roles `1.role.my`, `2.role.my`, `foo.role.my`, `bar.role.my`, etc. can produce and consume. + +**Note**: A wildcard matching works at **the beginning or end of the role name only**. + +e.g. + +```shell + +$ pulsar-admin namespaces grant-permission test-tenant/ns1 \ + --actions produce,consume \ + --role 'my.*.role' + +``` + +In this case, only the role `my.*.role` has permissions. +Roles `my.1.role`, `my.2.role`, `my.foo.role`, `my.bar.role`, etc. **cannot** produce and consume. 
+ + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/permissions/:role|operation/grantPermissionOnNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().grantPermissionOnNamespace(namespace, role, getAuthActions(actions)); + +``` + + + + +```` + +## Get permissions + +You can see which permissions have been granted to which roles in a namespace. + +````mdx-code-block + + + +Use the [`permissions`](reference-pulsar-admin.md#permissions) subcommand and specify a namespace: + +```shell + +$ pulsar-admin namespaces permissions test-tenant/ns1 +{ + "admin10": [ + "produce", + "consume" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/permissions|operation/getPermissions?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getPermissions(namespace); + +``` + + + + +```` + +## Revoke permissions + +You can revoke permissions from specific roles, which means that those roles will no longer have access to the specified namespace.
+ +````mdx-code-block + + + +Use the [`revoke-permission`](reference-pulsar-admin.md#revoke-permission) subcommand and specify a namespace and a role using the `--role` flag: + +```shell + +$ pulsar-admin namespaces revoke-permission test-tenant/ns1 \ + --role admin10 + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/permissions/:role|operation/revokePermissionsOnNamespace?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().revokePermissionsOnNamespace(namespace, role); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-persistent-topics.md b/site2/website/versioned_docs/version-2.9.x/admin-api-persistent-topics.md new file mode 100644 index 0000000000000..50d135b72f542 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-persistent-topics.md @@ -0,0 +1,8 @@ +--- +id: admin-api-persistent-topics +title: Managing persistent topics +sidebar_label: "Persistent topics" +original_id: admin-api-persistent-topics +--- + +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-schemas.md b/site2/website/versioned_docs/version-2.9.x/admin-api-schemas.md new file mode 100644 index 0000000000000..9ffe21f5b0f75 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-schemas.md @@ -0,0 +1,7 @@ +--- +id: admin-api-schemas +title: Managing Schemas +sidebar_label: "Schemas" +original_id: admin-api-schemas +--- + diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-tenants.md b/site2/website/versioned_docs/version-2.9.x/admin-api-tenants.md new file mode 100644 index 0000000000000..3e13e54a68b2c --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-tenants.md @@ -0,0 +1,238 @@ +--- +id: admin-api-tenants +title: Managing Tenants +sidebar_label: "Tenants" +original_id: admin-api-tenants +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +Tenants, like namespaces, can be managed using the [admin API](admin-api-overview.md). There are currently two configurable aspects of tenants: + +* Admin roles +* Allowed clusters + +## Tenant resources + +### List + +You can list all of the tenants associated with an [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`list`](reference-pulsar-admin.md#tenants-list) subcommand. 
+ +```shell + +$ pulsar-admin tenants list +my-tenant-1 +my-tenant-2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/tenants|operation/getTenants?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().getTenants(); + +``` + + + + +```` + +### Create + +You can create a new tenant. + +````mdx-code-block + + + +Use the [`create`](reference-pulsar-admin.md#tenants-create) subcommand: + +```shell + +$ pulsar-admin tenants create my-tenant + +``` + +When creating a tenant, you can assign admin roles using the `-r`/`--admin-roles` flag. You can specify multiple roles as a comma-separated list. Here are some examples: + +```shell + +$ pulsar-admin tenants create my-tenant \ + --admin-roles role1,role2,role3 + +$ pulsar-admin tenants create my-tenant \ + -r role1 + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/tenants/:tenant|operation/createTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().createTenant(tenantName, tenantInfo); + +``` + + + + +```` + +### Get configuration + +You can fetch the [configuration](reference-configuration.md) for an existing tenant at any time. + +````mdx-code-block + + + +Use the [`get`](reference-pulsar-admin.md#tenants-get) subcommand and specify the name of the tenant. Here's an example: + +```shell + +$ pulsar-admin tenants get my-tenant +{ + "adminRoles": [ + "admin1", + "admin2" + ], + "allowedClusters": [ + "cl1", + "cl2" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/tenants/:tenant|operation/getTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().getTenantInfo(tenantName); + +``` + + + + +```` + +### Delete + +Tenants can be deleted from a Pulsar [instance](reference-terminology.md#instance). + +````mdx-code-block + + + +Use the [`delete`](reference-pulsar-admin.md#tenants-delete) subcommand and specify the name of the tenant. 
+ +```shell + +$ pulsar-admin tenants delete my-tenant + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/tenants/:tenant|operation/deleteTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().deleteTenant(tenantName); + +``` + + + + +```` + +### Update + +You can update a tenant's configuration. + +````mdx-code-block + + + +Use the [`update`](reference-pulsar-admin.md#tenants-update) subcommand. + +```shell + +$ pulsar-admin tenants update my-tenant + +``` + + + + +{@inject: endpoint|POST|/admin/v2/tenants/:tenant|operation/updateTenant?version=@pulsar:version_number@} + + + + +```java + +admin.tenants().updateTenant(tenantName, tenantInfo); + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/admin-api-topics.md b/site2/website/versioned_docs/version-2.9.x/admin-api-topics.md new file mode 100644 index 0000000000000..7a7316fce8a9b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/admin-api-topics.md @@ -0,0 +1,2334 @@ +--- +id: admin-api-topics +title: Manage topics +sidebar_label: "Topics" +original_id: admin-api-topics +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +> **Important** +> +> This page only shows **some frequently used operations**. +> +> - For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/) +> +> - For the latest and complete information about `REST API`, including parameters, responses, samples, and more, see {@inject: rest:REST:/} API doc. +> +> - For the latest and complete information about `Java admin API`, including classes, methods, descriptions, and more, see [Java admin API doc](https://pulsar.apache.org/api/admin/). + +Pulsar has persistent and non-persistent topics. A persistent topic is a logical endpoint for publishing and consuming messages. 
The topic name structure for persistent topics is: + +```shell + +persistent://tenant/namespace/topic + +``` + +Non-persistent topics are used in applications that only consume real-time published messages and do not need persistent guarantee. In this way, it reduces message-publish latency by removing overhead of persisting messages. The topic name structure for non-persistent topics is: + +```shell + +non-persistent://tenant/namespace/topic + +``` + +## Manage topic resources +Whether it is persistent or non-persistent topic, you can obtain the topic resources through `pulsar-admin` tool, REST API and Java. + +:::note + +In REST API, `:schema` stands for persistent or non-persistent. `:tenant`, `:namespace`, `:x` are variables, replace them with the real tenant, namespace, and `x` names when using them. +Take {@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} as an example, to get the list of persistent topics in REST API, use `https://pulsar.apache.org/admin/v2/persistent/my-tenant/my-namespace`. To get the list of non-persistent topics in REST API, use `https://pulsar.apache.org/admin/v2/non-persistent/my-tenant/my-namespace`. + +::: + +### List of topics + +You can get the list of topics under a given namespace in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list \ + my-tenant/my-namespace + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} + + + + +```java + +String namespace = "my-tenant/my-namespace"; +admin.topics().getList(namespace); + +``` + + + + +```` + +### Grant permission + +You can grant permissions on a client role to perform specific actions on a given topic in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics grant-permission \ + --actions produce,consume --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/permissions/:role|operation/grantPermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +Set actions = Sets.newHashSet(AuthAction.produce, AuthAction.consume); +admin.topics().grantPermission(topic, role, actions); + +``` + + + + +```` + +### Get permission + +You can fetch permission in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics permissions \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/permissions|operation/getPermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getPermissions(topic); + +``` + + + + +```` + +### Revoke permission + +You can revoke a permission granted on a client role in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics revoke-permission \ + --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic/permissions/:role|operation/revokePermissionsOnTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +admin.topics().revokePermissions(topic, role); + +``` + + + + +```` + +### Delete topic + +You can delete a topic in the following ways. You cannot delete a topic if any active subscription or producers is connected to the topic. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics delete \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic|operation/deleteTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().delete(topic); + +``` + + + + +```` + +### Unload topic + +You can unload a topic in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics unload \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/unload|operation/unloadTopic?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().unload(topic); + +``` + + + + +```` + +### Get stats + +You can check the following statistics of a given non-partitioned topic. + + - **msgRateIn**: The sum of all local and replication publishers' publish rates (msg/s). + + - **msgThroughputIn**: The sum of all local and replication publishers' publish rates (bytes/s). + + - **msgRateOut**: The sum of all local and replication consumers' dispatch rates(msg/s). + + - **msgThroughputOut**: The sum of all local and replication consumers' dispatch rates (bytes/s). + + - **averageMsgSize**: The average size (in bytes) of messages published within the last interval. + + - **storageSize**: The sum of the ledgers' storage size for this topic. The space used to store the messages for the topic. + + - **bytesInCounter**: Total bytes published to the topic. + + - **msgInCounter**: Total messages published to the topic. + + - **bytesOutCounter**: Total bytes delivered to consumers. + + - **msgOutCounter**: Total messages delivered to consumers. + + - **msgChunkPublished**: Topic has chunked message published on it. + + - **backlogSize**: Estimated total unconsumed or backlog size (in bytes). 
+ + - **offloadedStorageSize**: Space used to store the offloaded messages for the topic (in bytes). + + - **waitingPublishers**: The number of publishers waiting in a queue in exclusive access mode. + + - **deduplicationStatus**: The status of message deduplication for the topic. + + - **topicEpoch**: The topic epoch or empty if not set. + + - **nonContiguousDeletedMessagesRanges**: The number of non-contiguous deleted messages ranges. + + - **nonContiguousDeletedMessagesRangesSerializedSize**: The serialized size of non-contiguous deleted messages ranges. + + - **publishers**: The list of all local publishers into the topic. The list ranges from zero to thousands. + + - **accessMode**: The type of access to the topic that the producer requires. + + - **msgRateIn**: The total rate of messages (msg/s) published by this publisher. + + - **msgThroughputIn**: The total throughput (bytes/s) of the messages published by this publisher. + + - **averageMsgSize**: The average message size in bytes from this publisher within the last interval. + + - **chunkedMessageRate**: The total rate of chunked messages published by this publisher. + + - **producerId**: The internal identifier for this producer on this topic. + + - **producerName**: The internal identifier for this producer, generated by the client library. + + - **address**: The IP address and source port for the connection of this producer. + + - **connectedSince**: The timestamp when this producer is created or reconnected last time. + + - **clientVersion**: The client library version of this producer. + + - **metadata**: Metadata (key/value strings) associated with this publisher. + + - **subscriptions**: The list of all local subscriptions to the topic. + + - **my-subscription**: The name of this subscription. It is defined by the client. + + - **msgRateOut**: The total rate of messages (msg/s) delivered on this subscription. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered on this subscription. 
+ + - **msgBacklog**: The number of messages in the subscription backlog. + + - **type**: The subscription type. + + - **msgRateExpired**: The rate at which messages were discarded instead of dispatched from this subscription due to TTL. + + - **lastExpireTimestamp**: The timestamp of the last message expire execution. + + - **lastConsumedFlowTimestamp**: The timestamp of the last flow command received. + + - **lastConsumedTimestamp**: The latest timestamp of all the consumed timestamp of the consumers. + + - **lastAckedTimestamp**: The latest timestamp of all the acked timestamp of the consumers. + + - **bytesOutCounter**: Total bytes delivered to consumer. + + - **msgOutCounter**: Total messages delivered to consumer. + + - **msgRateRedeliver**: Total rate of messages redelivered on this subscription (msg/s). + + - **chunkedMessageRate**: Chunked message dispatch rate. + + - **backlogSize**: Size of backlog for this subscription (in bytes). + + - **msgBacklogNoDelayed**: Number of messages in the subscription backlog that do not contain the delay messages. + + - **blockedSubscriptionOnUnackedMsgs**: Flag to verify if a subscription is blocked due to reaching threshold of unacked messages. + + - **msgDelayed**: Number of delayed messages currently being tracked. + + - **unackedMessages**: Number of unacknowledged messages for the subscription, where an unacknowledged message is one that has been sent to a consumer but not yet acknowledged. This field is only meaningful when using a subscription that tracks individual message acknowledgement. + + - **activeConsumerName**: The name of the consumer that is active for single active consumer subscriptions. For example, failover or exclusive. + + - **totalMsgExpired**: Total messages expired on this subscription. + + - **lastMarkDeleteAdvancedTimestamp**: Last MarkDelete position advanced timestamp. + + - **durable**: Whether the subscription is durable or ephemeral (for example, from a reader). 
+ + - **replicated**: Mark that the subscription state is kept in sync across different regions. + + - **allowOutOfOrderDelivery**: Whether out of order delivery is allowed on the Key_Shared subscription. + + - **keySharedMode**: Whether the Key_Shared subscription mode is AUTO_SPLIT or STICKY. + + - **consumersAfterMarkDeletePosition**: This is for Key_Shared subscription to get the recentJoinedConsumers in the Key_Shared subscription. + + - **nonContiguousDeletedMessagesRanges**: The number of non-contiguous deleted messages ranges. + + - **nonContiguousDeletedMessagesRangesSerializedSize**: The serialized size of non-contiguous deleted messages ranges. + + - **consumers**: The list of connected consumers for this subscription. + + - **msgRateOut**: The total rate of messages (msg/s) delivered to the consumer. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered to the consumer. + + - **consumerName**: The internal identifier for this consumer, generated by the client library. + + - **availablePermits**: The number of messages that the consumer has space for in the client library's listen queue. `0` means the client library's queue is full and `receive()` isn't being called. A non-zero value means this consumer is ready for dispatched messages. + + - **unackedMessages**: The number of unacknowledged messages for the consumer, where an unacknowledged message is one that has been sent to the consumer but not yet acknowledged. This field is only meaningful when using a subscription that tracks individual message acknowledgement. + + - **blockedConsumerOnUnackedMsgs**: The flag used to verify if the consumer is blocked due to reaching threshold of the unacknowledged messages. + + - **lastConsumedTimestamp**: The timestamp when the consumer reads a message the last time. + + - **lastAckedTimestamp**: The timestamp when the consumer acknowledges a message the last time. 
+ + - **address**: The IP address and source port for the connection of this consumer. + + - **connectedSince**: The timestamp when this consumer is created or reconnected last time. + + - **clientVersion**: The client library version of this consumer. + + - **bytesOutCounter**: Total bytes delivered to consumer. + + - **msgOutCounter**: Total messages delivered to consumer. + + - **msgRateRedeliver**: Total rate of messages redelivered by this consumer (msg/s). + + - **chunkedMessageRate**: The total rate of chunked messages delivered to this consumer. + + - **avgMessagesPerEntry**: Number of average messages per entry for the consumer consumed. + + - **readPositionWhenJoining**: The read position of the cursor when the consumer joining. + + - **keyHashRanges**: Hash ranges assigned to this consumer if is Key_Shared sub mode. + + - **metadata**: Metadata (key/value strings) associated with this consumer. + + - **replication**: This section gives the stats for cross-colo replication of this topic + + - **msgRateIn**: The total rate (msg/s) of messages received from the remote cluster. + + - **msgThroughputIn**: The total throughput (bytes/s) received from the remote cluster. + + - **msgRateOut**: The total rate of messages (msg/s) delivered to the replication-subscriber. + + - **msgThroughputOut**: The total throughput (bytes/s) delivered to the replication-subscriber. + + - **msgRateExpired**: The total rate of messages (msg/s) expired. + + - **replicationBacklog**: The number of messages pending to be replicated to remote cluster. + + - **connected**: Whether the outbound replicator is connected. + + - **replicationDelayInSeconds**: How long the oldest message has been waiting to be sent through the connection, if connected is `true`. + + - **inboundConnection**: The IP and port of the broker in the remote cluster's publisher connection to this broker. + + - **inboundConnectedSince**: The TCP connection being used to publish messages to the remote cluster. 
If there are no local publishers connected, this connection is automatically closed after a minute. + + - **outboundConnection**: The address of the outbound replication connection. + + - **outboundConnectedSince**: The timestamp of establishing outbound connection. + +The following is an example of a topic status. + +```json + +{ + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesInCounter" : 504, + "msgInCounter" : 9, + "bytesOutCounter" : 2296, + "msgOutCounter" : 41, + "averageMsgSize" : 0.0, + "msgChunkPublished" : false, + "storageSize" : 504, + "backlogSize" : 0, + "offloadedStorageSize" : 0, + "publishers" : [ { + "accessMode" : "Shared", + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "averageMsgSize" : 0.0, + "chunkedMessageRate" : 0.0, + "producerId" : 0, + "metadata" : { }, + "address" : "/127.0.0.1:65402", + "connectedSince" : "2021-06-09T17:22:55.913+08:00", + "clientVersion" : "2.9.0-SNAPSHOT", + "producerName" : "standalone-1-0" + } ], + "waitingPublishers" : 0, + "subscriptions" : { + "sub-demo" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesOutCounter" : 2296, + "msgOutCounter" : 41, + "msgRateRedeliver" : 0.0, + "chunkedMessageRate" : 0, + "msgBacklog" : 0, + "backlogSize" : 0, + "msgBacklogNoDelayed" : 0, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "type" : "Exclusive", + "activeConsumerName" : "20b81", + "msgRateExpired" : 0.0, + "totalMsgExpired" : 0, + "lastExpireTimestamp" : 0, + "lastConsumedFlowTimestamp" : 1623230565356, + "lastConsumedTimestamp" : 1623230583946, + "lastAckedTimestamp" : 1623230584033, + "lastMarkDeleteAdvancedTimestamp" : 1623230584033, + "consumers" : [ { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesOutCounter" : 2296, + "msgOutCounter" : 41, + "msgRateRedeliver" : 0.0, + "chunkedMessageRate" : 0.0, + "consumerName" : "20b81", + "availablePermits" : 959, + "unackedMessages" : 0, + 
"avgMessagesPerEntry" : 314, + "blockedConsumerOnUnackedMsgs" : false, + "lastAckedTimestamp" : 1623230584033, + "lastConsumedTimestamp" : 1623230583946, + "metadata" : { }, + "address" : "/127.0.0.1:65172", + "connectedSince" : "2021-06-09T17:22:45.353+08:00", + "clientVersion" : "2.9.0-SNAPSHOT" + } ], + "allowOutOfOrderDelivery": false, + "consumersAfterMarkDeletePosition" : { }, + "nonContiguousDeletedMessagesRanges" : 0, + "nonContiguousDeletedMessagesRangesSerializedSize" : 0, + "durable" : true, + "replicated" : false + } + }, + "replication" : { }, + "deduplicationStatus" : "Disabled", + "nonContiguousDeletedMessagesRanges" : 0, + "nonContiguousDeletedMessagesRangesSerializedSize" : 0 +} + +``` + +To get the status of a topic, you can use the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/stats|operation/getStats?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getStats(topic); + +``` + + + + +```` + +### Get internal stats + +You can get the detailed statistics of a topic. + + - **entriesAddedCounter**: Messages published since this broker loaded this topic. + + - **numberOfEntries**: The total number of messages being tracked. + + - **totalSize**: The total storage size in bytes of all messages. + + - **currentLedgerEntries**: The count of messages written to the ledger that is currently open for writing. + + - **currentLedgerSize**: The size in bytes of messages written to the ledger that is currently open for writing. + + - **lastLedgerCreatedTimestamp**: The time when the last ledger is created. + + - **lastLedgerCreationFailureTimestamp:** The time when the last ledger failed. + + - **waitingCursorsCount**: The number of cursors that are "caught up" and waiting for a new message to be published. 
+ + - **pendingAddEntriesCount**: The number of messages whose (asynchronous) write requests are still waiting for completion. + + - **lastConfirmedEntry**: The ledgerid:entryid of the last message that is written successfully. If the entryid is `-1`, then the ledger is open, yet no entries are written. + + - **state**: The state of this ledger for writing. The state `LedgerOpened` means that a ledger is open for saving published messages. + + - **ledgers**: The ordered list of all ledgers for this topic holding messages. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. + + - **metadata**: The ledger metadata. + + - **schemaLedgers**: The ordered list of all ledgers for this topic schema. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. + + - **metadata**: The ledger metadata. + + - **compactedLedger**: The ledgers holding un-acked messages after topic compaction. + + - **ledgerId**: The ID of this ledger. + + - **entries**: The total number of entries that belong to this ledger. + + - **size**: The size of messages written to this ledger (in bytes). + + - **offloaded**: Whether this ledger is offloaded. The value is `false` for the compacted topic ledger. + + - **cursors**: The list of all cursors on this topic. Each subscription in the topic stats has a cursor. + + - **markDeletePosition**: All messages before the markDeletePosition are acknowledged by the subscriber. + + - **readPosition**: The latest position of subscriber for reading message. + + - **waitingReadOp**: This is true when the subscription has read the latest message published to the topic and is waiting for new messages to be published. 
+ + - **pendingReadOps**: The counter for how many outstanding read requests to the BookKeepers in progress. + + - **messagesConsumedCounter**: The number of messages this cursor has acked since this broker loaded this topic. + + - **cursorLedger**: The ledger being used to persistently store the current markDeletePosition. + + - **cursorLedgerLastEntry**: The last entryid used to persistently store the current markDeletePosition. + + - **individuallyDeletedMessages**: If acknowledges are being done out of order, the ranges of messages acknowledged between the markDeletePosition and the read-position shows. + + - **lastLedgerSwitchTimestamp**: The last time the cursor ledger is rolled over. + + - **state**: The state of the cursor ledger: `Open` means you have a cursor ledger for saving updates of the markDeletePosition. + +The following is an example of the detailed statistics of a topic. + +```json + +{ + "entriesAddedCounter":0, + "numberOfEntries":0, + "totalSize":0, + "currentLedgerEntries":0, + "currentLedgerSize":0, + "lastLedgerCreatedTimestamp":"2021-01-22T21:12:14.868+08:00", + "lastLedgerCreationFailureTimestamp":null, + "waitingCursorsCount":0, + "pendingAddEntriesCount":0, + "lastConfirmedEntry":"3:-1", + "state":"LedgerOpened", + "ledgers":[ + { + "ledgerId":3, + "entries":0, + "size":0, + "offloaded":false, + "metadata":null + } + ], + "cursors":{ + "test":{ + "markDeletePosition":"3:-1", + "readPosition":"3:-1", + "waitingReadOp":false, + "pendingReadOps":0, + "messagesConsumedCounter":0, + "cursorLedger":4, + "cursorLedgerLastEntry":1, + "individuallyDeletedMessages":"[]", + "lastLedgerSwitchTimestamp":"2021-01-22T21:12:14.966+08:00", + "state":"Open", + "numberOfEntriesSinceFirstNotAckedMessage":0, + "totalNonContiguousDeletedMessagesRange":0, + "properties":{ + + } + } + }, + "schemaLedgers":[ + { + "ledgerId":1, + "entries":11, + "size":10, + "offloaded":false, + "metadata":null + } + ], + "compactedLedger":{ + "ledgerId":-1, + "entries":-1, + 
"size":-1, + "offloaded":false, + "metadata":null + } +} + +``` + +To get the internal status of a topic, you can use the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/internalStats|operation/getInternalStats?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getInternalStats(topic); + +``` + + + + +```` + +### Peek messages + +You can peek a number of messages for a specific subscription of a given topic in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics peek-messages \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +Message ID: 315674752:0 +Properties: { "X-Pulsar-publish-time" : "2015-07-13 17:40:28.451" } +msg-payload + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/position/:messagePosition|operation/peekNthMessage?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.topics().peekMessages(topic, subName, numMessages); + +``` + + + + +```` + +### Get message by ID + +You can fetch the message with the given ledger ID and entry ID in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ ./bin/pulsar-admin topics get-message-by-id \ + persistent://public/default/my-topic \ + -l 10 -e 0 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/ledger/:ledgerId/entry/:entryId|operation/getMessageById?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +long ledgerId = 10; +long entryId = 10; +admin.topics().getMessageById(topic, ledgerId, entryId); + +``` + + + + +```` + +### Examine messages + +You can examine a specific message on a topic by position relative to the earliest or the latest message. + +````mdx-code-block + + + +```shell + +./bin/pulsar-admin topics examine-messages \ + persistent://public/default/my-topic \ + -i latest -m 1 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/examinemessage?initialPosition=:initialPosition&messagePosition=:messagePosition|operation/examineMessage?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().examineMessage(topic, "latest", 1); + +``` + + + + +```` + +### Get message ID + +You can get the ID of the message published at or just after the given datetime. + +````mdx-code-block + + + +```shell + +./bin/pulsar-admin topics get-message-id \ + persistent://public/default/my-topic \ + -d 2021-06-28T19:01:17Z + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/messageid/:timestamp|operation/getMessageIdByTimestamp?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +long timestamp = System.currentTimeMillis(); +admin.topics().getMessageIdByTimestamp(topic, timestamp); + +``` + + + + +```` + + +### Skip messages + +You can skip a number of messages for a specific subscription of a given topic in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics skip \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/skip/:numMessages|operation/skipMessages?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.topics().skipMessages(topic, subName, numMessages); + +``` + + + + +```` + +### Skip all messages + +You can skip all the old messages for a specific subscription of a given topic. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics skip-all \ + --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/skip_all|operation/skipAllMessages?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +admin.topics().skipAllMessages(topic, subName); + +``` + + + + +```` + +### Reset cursor + +You can reset a subscription cursor position back to the position which is recorded X minutes before. It essentially calculates time and position of cursor at X minutes before and resets it at that position. You can reset the cursor in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics reset-cursor \ + --subscription my-subscription --time 10 \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic/subscription/:subName/resetcursor/:timestamp|operation/resetCursor?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +long timestamp = 2342343L; +admin.topics().resetCursor(topic, subName, timestamp); + +``` + + + + +```` + +### Look up topic's owner broker + +You can locate the owner broker of the given topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics lookup \ + persistent://test-tenant/ns1/tp1 \ + + "pulsar://broker1.org.com:4480" + +``` + + + + +{@inject: endpoint|GET|/lookup/v2/topic/:topic-domain/:tenant/:namespace/:topic|operation/lookupTopicAsync?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().lookupDestination(topic); + +``` + + + + +```` + +### Get bundle + +You can get the range of the bundle that the given topic belongs to in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics bundle-range \ + persistent://test-tenant/ns1/tp1 \ + + "0x00000000_0xffffffff" + +``` + + + + +{@inject: endpoint|GET|/lookup/v2/topic/:topic_domain/:tenant/:namespace/:topic/bundle|operation/getNamespaceBundle?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().getBundleRange(topic); + +``` + + + + +```` + +### Get subscriptions + +You can check all subscription names for a given topic in the following ways. 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics subscriptions \ + persistent://test-tenant/ns1/tp1 \ + + my-subscription + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/subscriptions|operation/getSubscriptions?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getSubscriptions(topic); + +``` + + + + +```` + +### Unsubscribe + +When a subscription does not process messages any more, you can unsubscribe it in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics unsubscribe \ + --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/:topic/subscription/:subscription|operation/deleteSubscription?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subscriptionName = "my-subscription"; +admin.topics().deleteSubscription(topic, subscriptionName); + +``` + + + + +```` + +### Last Message Id + +You can get the last committed message ID for a persistent topic. It is available since the 2.3.0 release. + +````mdx-code-block + + + +```shell + +pulsar-admin topics last-message-id topic-name + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/lastMessageId|operation/getLastMessageId?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getLastMessageId(topic); + +``` + + + + +```` + + +### Configure deduplication snapshot interval + +#### Get deduplication snapshot interval + +To get the topic-level deduplication snapshot interval, use one of the following methods. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics get-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/getDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getDeduplicationSnapshotInterval(topic) + +``` + + + + +```` + +#### Set deduplication snapshot interval + +To set the topic-level deduplication snapshot interval, use one of the following methods. + +> **Prerequisite** `brokerDeduplicationEnabled` must be set to `true`. + +````mdx-code-block + + + +``` + +pulsar-admin topics set-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/setDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setDeduplicationSnapshotInterval(topic, 1000) + +``` + + + + +```` + +#### Remove deduplication snapshot interval + +To remove the topic-level deduplication snapshot interval, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics remove-deduplication-snapshot-interval options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/deduplicationSnapshotInterval|operation/deleteDeduplicationSnapshotInterval?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeDeduplicationSnapshotInterval(topic) + +``` + + + + +```` + + +### Configure inactive topic policies + +#### Get inactive topic policies + +To get the topic-level inactive topic policies, use one of the following methods. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics get-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/getInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getInactiveTopicPolicies(topic) + +``` + + + + +```` + +#### Set inactive topic policies + +To set the topic-level inactive topic policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics set-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/setInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setInactiveTopicPolicies(topic, inactiveTopicPolicies) + +``` + + + + +```` + +#### Remove inactive topic policies + +To remove the topic-level inactive topic policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics remove-inactive-topic-policies options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/inactiveTopicPolicies|operation/removeInactiveTopicPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeInactiveTopicPolicies(topic) + +``` + + + + +```` + + +### Configure offload policies + +#### Get offload policies + +To get the topic-level offload policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics get-offload-policies options + +``` + + + + +{@inject: endpoint|GET|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/getOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getOffloadPolicies(topic) + +``` + + + + +```` + +#### Set offload policies + +To set the topic-level offload policies, use one of the following methods. 
+ +````mdx-code-block + + + +``` + +pulsar-admin topics set-offload-policies options + +``` + + + + +{@inject: endpoint|POST|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/setOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().setOffloadPolicies(topic, offloadPolicies) + +``` + + + + +```` + +#### Remove offload policies + +To remove the topic-level offload policies, use one of the following methods. + +````mdx-code-block + + + +``` + +pulsar-admin topics remove-offload-policies options + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/topics/:tenant/:namespace/:topic/offloadPolicies|operation/removeOffloadPolicies?version=@pulsar:version_number@} + + + + +```java + +admin.topics().removeOffloadPolicies(topic) + +``` + + + + +```` + + +## Manage non-partitioned topics +You can use Pulsar [admin API](admin-api-overview.md) to create, delete and check status of non-partitioned topics. + +### Create +Non-partitioned topics must be explicitly created. When creating a new non-partitioned topic, you need to provide a name for the topic. + +By default, 60 seconds after creation, topics are considered inactive and deleted automatically to avoid generating trash data. To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to a specific value. + +For more information about the two parameters, see [here](reference-configuration.md#broker). + +You can create non-partitioned topics in the following ways. +````mdx-code-block + + + +When you create non-partitioned topics with the [`create`](reference-pulsar-admin.md#create-3) command, you need to specify the topic name as an argument. 
+ +```shell + +$ bin/pulsar-admin topics create \ + persistent://my-tenant/my-namespace/my-topic + +``` + +:::note + +When you create a non-partitioned topic with the suffix '-partition-' followed by numeric value like 'xyz-topic-partition-x' for the topic name, if a partitioned topic with same suffix 'xyz-topic-partition-y' exists, then the numeric value(x) for the non-partitioned topic must be larger than the number of partitions(y) of the partitioned topic. Otherwise, you cannot create such a non-partitioned topic. + +::: + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic|operation/createNonPartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createNonPartitionedTopic(topicName); + +``` + + + + +```` + +### Delete +You can delete non-partitioned topics in the following ways. +````mdx-code-block + + + +```shell + +$ bin/pulsar-admin topics delete \ + persistent://my-tenant/my-namespace/my-topic + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:tenant/:namespace/:topic|operation/deleteTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().delete(topic); + +``` + + + + +```` + +### List + +You can get the list of topics under a given namespace in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getList?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getList(namespace); + +``` + + + + +```` + +### Stats + +You can check the current statistics of a given topic. The following is an example. For description of each stats, refer to [get stats](#get-stats). 
+ +```json + +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers": [] + } + }, + "replication": {} +} + +``` + +You can check the current statistics of a given topic and its connected producers and consumers in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats \ + persistent://test-tenant/namespace/topic \ + --get-precise-backlog + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/stats|operation/getStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getStats(topic, false /* is precise backlog */); + +``` + + + + +```` + +## Manage partitioned topics +You can use Pulsar [admin API](admin-api-overview.md) to create, update, delete and check status of partitioned topics. + +### Create + +Partitioned topics must be explicitly created. When creating a new partitioned topic, you need to provide a name and the number of partitions for the topic. + +By default, 60 seconds after creation, topics are considered inactive and deleted automatically to avoid generating trash data. To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to a specific value. + +For more information about the two parameters, see [here](reference-configuration.md#broker). + +You can create partitioned topics in the following ways. 
+````mdx-code-block + + + +When you create partitioned topics with the [`create-partitioned-topic`](reference-pulsar-admin.md#create-partitioned-topic) +command, you need to specify the topic name as an argument and the number of partitions using the `-p` or `--partitions` flag. + +```shell + +$ bin/pulsar-admin topics create-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 4 + +``` + +:::note + +If a non-partitioned topic with the suffix '-partition-' followed by a numeric value, like 'xyz-topic-partition-10', already exists, you cannot create a partitioned topic with the name 'xyz-topic', because the partitions of the partitioned topic could override the existing non-partitioned topic. To create such a partitioned topic, you have to delete that non-partitioned topic first. + +::: + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/partitions|operation/createPartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +int numPartitions = 4; +admin.topics().createPartitionedTopic(topicName, numPartitions); + +``` + + + + +```` + +### Create missed partitions + +When topic auto-creation is disabled, and you have a partitioned topic without any partitions, you can use the [`create-missed-partitions`](reference-pulsar-admin.md#create-missed-partitions) command to create partitions for the topic. + +````mdx-code-block + + + +You can create missed partitions with the [`create-missed-partitions`](reference-pulsar-admin.md#create-missed-partitions) command and specify the topic name as an argument. 
+ +```shell + +$ bin/pulsar-admin topics create-missed-partitions \ + persistent://my-tenant/my-namespace/my-topic \ + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:namespace/:topic|operation/createMissedPartitions?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createMissedPartitions(topicName); + +``` + + + + +```` + +### Get metadata + +Partitioned topics are associated with metadata, which you can view as a JSON object. The following metadata field is available. + +Field | Description +:-----|:------- +`partitions` | The number of partitions into which the topic is divided. + +````mdx-code-block + + + +You can check the number of partitions in a partitioned topic with the [`get-partitioned-topic-metadata`](reference-pulsar-admin.md#get-partitioned-topic-metadata) subcommand. + +```shell + +$ pulsar-admin topics get-partitioned-topic-metadata \ + persistent://my-tenant/my-namespace/my-topic +{ + "partitions": 4 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/partitions|operation/getPartitionedMetadata?version=@pulsar:version_number@} + + + + +```java + +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().getPartitionedTopicMetadata(topicName); + +``` + + + + +```` + +### Update + +You can update the number of partitions for an existing partitioned topic *if* the topic is non-global. However, you can only increase the number of partitions. Decreasing the number of partitions would delete the topic, which is not supported in Pulsar. + +Producers and consumers can find the newly created partitions automatically. + +````mdx-code-block + + + +You can update partitioned topics with the [`update-partitioned-topic`](reference-pulsar-admin.md#update-partitioned-topic) command. 
+ +```shell + +$ pulsar-admin topics update-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 8 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/:schema/:tenant/:cluster/:namespace/:destination/partitions|operation/updatePartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().updatePartitionedTopic(topic, numPartitions); + +``` + + + + +```` + +### Delete +You can delete partitioned topics with the [`delete-partitioned-topic`](reference-pulsar-admin.md#delete-partitioned-topic) command, REST API and Java. + +````mdx-code-block + + + +```shell + +$ bin/pulsar-admin topics delete-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/:schema/:topic/:namespace/:destination/partitions|operation/deletePartitionedTopic?version=@pulsar:version_number@} + + + + +```java + +admin.topics().delete(topic); + +``` + + + + +```` + +### List +You can get the list of topics under a given namespace in the following ways. +````mdx-code-block + + + +```shell + +$ pulsar-admin topics list tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace|operation/getPartitionedTopicList?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getList(namespace); + +``` + + + + +```` + +### Stats + +You can check the current statistics of a given partitioned topic. The following is an example. For description of each stats, refer to [get stats](#get-stats). + +Note that in the subscription JSON object, `chuckedMessageRate` is deprecated. Please use `chunkedMessageRate`. Both will be sent in the JSON for now. 
+ +```json + +{ + "msgRateIn" : 999.992947159793, + "msgThroughputIn" : 1070918.4635439808, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesInCounter" : 270318763, + "msgInCounter" : 252489, + "bytesOutCounter" : 0, + "msgOutCounter" : 0, + "averageMsgSize" : 1070.926056966454, + "msgChunkPublished" : false, + "storageSize" : 270316646, + "backlogSize" : 200921133, + "publishers" : [ { + "msgRateIn" : 999.992947159793, + "msgThroughputIn" : 1070918.4635439808, + "averageMsgSize" : 1070.3333333333333, + "chunkedMessageRate" : 0.0, + "producerId" : 0 + } ], + "subscriptions" : { + "test" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "bytesOutCounter" : 0, + "msgOutCounter" : 0, + "msgRateRedeliver" : 0.0, + "chuckedMessageRate" : 0, + "chunkedMessageRate" : 0, + "msgBacklog" : 144318, + "msgBacklogNoDelayed" : 144318, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "msgRateExpired" : 0.0, + "lastExpireTimestamp" : 0, + "lastConsumedFlowTimestamp" : 0, + "lastConsumedTimestamp" : 0, + "lastAckedTimestamp" : 0, + "consumers" : [ ], + "isDurable" : true, + "isReplicated" : false + } + }, + "replication" : { }, + "metadata" : { + "partitions" : 3 + }, + "partitions" : { } +} + +``` + +You can check the current statistics of a given partitioned topic and its connected producers and consumers in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics partitioned-stats \ + persistent://test-tenant/namespace/topic \ + --per-partition + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/partitioned-stats|operation/getPartitionedStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getPartitionedStats(topic, true /* per partition */, false /* is precise backlog */); + +``` + + + + +```` + +### Internal stats + +You can check the detailed statistics of a topic. The following is an example. 
For description of each stats, refer to [get internal stats](#get-internal-stats). + +```json + +{ + "entriesAddedCounter": 20449518, + "numberOfEntries": 3233, + "totalSize": 331482, + "currentLedgerEntries": 3233, + "currentLedgerSize": 331482, + "lastLedgerCreatedTimestamp": "2016-06-29 03:00:23.825", + "lastLedgerCreationFailureTimestamp": null, + "waitingCursorsCount": 1, + "pendingAddEntriesCount": 0, + "lastConfirmedEntry": "324711539:3232", + "state": "LedgerOpened", + "ledgers": [ + { + "ledgerId": 324711539, + "entries": 0, + "size": 0 + } + ], + "cursors": { + "my-subscription": { + "markDeletePosition": "324711539:3133", + "readPosition": "324711539:3233", + "waitingReadOp": true, + "pendingReadOps": 0, + "messagesConsumedCounter": 20449501, + "cursorLedger": 324702104, + "cursorLedgerLastEntry": 21, + "individuallyDeletedMessages": "[(324711539:3134‥324711539:3136], (324711539:3137‥324711539:3140], ]", + "lastLedgerSwitchTimestamp": "2016-06-29 01:30:19.313", + "state": "Open" + } + } +} + +``` + +You can get the internal stats for the partitioned topic in the following ways. + +````mdx-code-block + + + +```shell + +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/namespace/topic + +``` + + + + +{@inject: endpoint|GET|/admin/v2/:schema/:tenant/:namespace/:topic/internalStats|operation/getInternalStats?version=@pulsar:version_number@} + + + + +```java + +admin.topics().getInternalStats(topic); + +``` + + + + +```` + +### Get backlog size + +You can get backlog size of a single topic partition or a nonpartitioned topic given a message ID (in bytes). 
+ +````mdx-code-block + + + +```shell + +$ pulsar-admin topics get-backlog-size \ + -m 1:1 \ + persistent://test-tenant/ns1/tp1-partition-0 \ + +``` + + + + +{@inject: endpoint|PUT|/admin/v2/:schema/:tenant/:namespace/:topic/backlogSize|operation/getBacklogSizeByMessageId?version=@pulsar:version_number@} + + + + +```java + +String topic = "persistent://my-tenant/my-namespace/my-topic"; +MessageId messageId = MessageId.earliest; +admin.topics().getBacklogSizeByMessageId(topic, messageId); + +``` + + + + +```` + +## Publish to partitioned topics + +By default, Pulsar topics are served by a single broker, which limits the maximum throughput of a topic. *Partitioned topics* can span multiple brokers and thus allow for higher throughput. + +You can publish to partitioned topics using Pulsar client libraries. When publishing to partitioned topics, you must specify a routing mode. If you do not specify any routing mode when you create a new producer, the round robin routing mode is used. + +### Routing mode + +You can specify the routing mode in the ProducerConfiguration object that you use to configure your producer. The routing mode determines which partition(internal topic) that each message should be published to. + +The following {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} options are available. + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer publishes messages across all partitions in round-robin policy to achieve the maximum throughput. Round-robin is not done per individual message, round-robin is set to the same boundary of batching delay to ensure that batching is effective. If a key is specified on the message, the partitioned producer hashes the key and assigns message to a particular partition. This is the default mode. 
+`SinglePartition` | If no key is provided, the producer picks a single partition randomly and publishes all messages into that partition. If a key is specified on the message, the partitioned producer hashes the key and assigns message to a particular partition. +`CustomPartition` | Use custom message router implementation that is called to determine the partition for a particular message. You can create a custom routing mode by using the Java client and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface. + +The following is an example: + +```java + +String pulsarBrokerRootUrl = "pulsar://localhost:6650"; +String topic = "persistent://my-tenant/my-namespace/my-topic"; + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl(pulsarBrokerRootUrl).build(); +Producer producer = pulsarClient.newProducer() + .topic(topic) + .messageRoutingMode(MessageRoutingMode.SinglePartition) + .create(); +producer.send("Partitioned topic message".getBytes()); + +``` + +### Custom message router + +To use a custom message router, you need to provide an implementation of the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface, which has just one `choosePartition` method: + +```java + +public interface MessageRouter extends Serializable { + int choosePartition(Message msg); +} + +``` + +The following router routes every message to partition 10: + +```java + +public class AlwaysTenRouter implements MessageRouter { + public int choosePartition(Message msg) { + return 10; + } +} + +``` + +With that implementation, you can send messages to a partitioned topic as shown below: + +```java + +String pulsarBrokerRootUrl = "pulsar://localhost:6650"; +String topic = "persistent://my-tenant/my-cluster-my-namespace/my-topic"; + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl(pulsarBrokerRootUrl).build(); +Producer producer = pulsarClient.newProducer() + .topic(topic) + .messageRouter(new AlwaysTenRouter()) + .create(); 
+producer.send("Partitioned topic message".getBytes()); + +``` + +### How to choose partitions when using a key +If a message has a key, it supersedes the round robin routing policy. The following example illustrates how to choose the partition when using a key. + +```java + +// If the message has a key, it supersedes the round robin routing policy + if (msg.hasKey()) { + return signSafeMod(hash.makeHash(msg.getKey()), topicMetadata.numPartitions()); + } + + if (isBatchingEnabled) { // if batching is enabled, choose partition on `partitionSwitchMs` boundary. + long currentMs = clock.millis(); + return signSafeMod(currentMs / partitionSwitchMs + startPtnIdx, topicMetadata.numPartitions()); + } else { + return signSafeMod(PARTITION_INDEX_UPDATER.getAndIncrement(this), topicMetadata.numPartitions()); + } + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/administration-dashboard.md b/site2/website/versioned_docs/version-2.9.x/administration-dashboard.md new file mode 100644 index 0000000000000..92bd7e17869d7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-dashboard.md @@ -0,0 +1,76 @@ +--- +id: administration-dashboard +title: Pulsar dashboard +sidebar_label: "Dashboard" +original_id: administration-dashboard +--- + +:::note + +Pulsar dashboard is deprecated. We recommend you use [Pulsar Manager](administration-pulsar-manager.md) to manage and monitor the stats of your topics. + +::: + +Pulsar dashboard is a web application that enables users to monitor current stats for all [topics](reference-terminology.md#topic) in tabular form. + +The dashboard is a data collector that polls stats from all the brokers in a Pulsar instance (across multiple clusters) and stores all the information in a [PostgreSQL](https://www.postgresql.org/) database. + +You can use the [Django](https://www.djangoproject.com) web app to render the collected data. 
+ +## Install + +The easiest way to use the dashboard is to run it inside a [Docker](https://www.docker.com/products/docker) container. + +```shell + +$ SERVICE_URL=http://broker.example.com:8080/ +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + apachepulsar/pulsar-dashboard:@pulsar:version@ + +``` + +You can find the {@inject: github:Dockerfile:/dashboard/Dockerfile} in the `dashboard` directory and build an image from scratch as well: + +```shell + +$ docker build -t apachepulsar/pulsar-dashboard dashboard + +``` + +If token authentication is enabled: +> Provided token should have super-user access. + +```shell + +$ SERVICE_URL=http://broker.example.com:8080/ +$ JWT_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + -e JWT_TOKEN=$JWT_TOKEN \ + apachepulsar/pulsar-dashboard + +``` + + +You need to specify only one service URL for a Pulsar cluster. Internally, the collector figures out all the existing clusters and the brokers from where it needs to pull the metrics. If you connect the dashboard to Pulsar running in standalone mode, the URL is `http://<broker-ip-address>:8080` by default. `<broker-ip-address>` is the IP address or hostname of the machine that runs Pulsar standalone. The IP address or hostname should be accessible from the running dashboard in the docker instance. + +Once the Docker container starts, the web dashboard is accessible via `localhost` or whichever host Docker uses. + +> The `SERVICE_URL` that the dashboard uses needs to be reachable from inside the Docker container. + +If the Pulsar service runs in standalone mode in `localhost`, the `SERVICE_URL` has to +be the IP address of the machine. + +Similarly, given the Pulsar standalone advertises itself with localhost by default, you need to +explicitly set the advertised address to the host IP address. 
For example: + +```shell + +$ bin/pulsar standalone --advertised-address 1.2.3.4 + +``` + +### Known issues + +Currently, only Pulsar Token [authentication](security-overview.md#authentication-providers) is supported. diff --git a/site2/website/versioned_docs/version-2.9.x/administration-geo.md b/site2/website/versioned_docs/version-2.9.x/administration-geo.md new file mode 100644 index 0000000000000..f1b988dd5f13b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-geo.md @@ -0,0 +1,214 @@ +--- +id: administration-geo +title: Pulsar geo-replication +sidebar_label: "Geo-replication" +original_id: administration-geo +--- + +*Geo-replication* is the replication of persistently stored message data across multiple clusters of a Pulsar instance. + +## How geo-replication works + +The diagram below illustrates the process of geo-replication across Pulsar clusters: + +![Replication Diagram](/assets/geo-replication.png) + +In this diagram, whenever **P1**, **P2**, and **P3** producers publish messages to the **T1** topic on **Cluster-A**, **Cluster-B**, and **Cluster-C** clusters respectively, those messages are instantly replicated across clusters. Once the messages are replicated, **C1** and **C2** consumers can consume those messages from their respective clusters. + +Without geo-replication, **C1** and **C2** consumers are not able to consume messages that **P3** producer publishes. + +## Geo-replication and Pulsar properties + +You must enable geo-replication on a per-tenant basis in Pulsar. You can enable geo-replication between clusters only when a tenant is created that allows access to both clusters. + +Although geo-replication must be enabled between two clusters, actually geo-replication is managed at the namespace level. 
You must complete the following tasks to enable geo-replication for a namespace: + +* [Enable geo-replication namespaces](#enable-geo-replication-namespaces) +* Configure that namespace to replicate across two or more provisioned clusters + +Any message published on *any* topic in that namespace is replicated to all clusters in the specified set. + +## Local persistence and forwarding + +When messages are produced on a Pulsar topic, messages are first persisted in the local cluster, and then forwarded asynchronously to the remote clusters. + +In normal cases, when there are no connectivity issues, messages are replicated immediately, at the same time as they are dispatched to local consumers. Typically, the network [round-trip time](https://en.wikipedia.org/wiki/Round-trip_delay_time) (RTT) between the remote regions defines end-to-end delivery latency. + +Applications can create producers and consumers in any of the clusters, even when the remote clusters are not reachable (like during a network partition). + +Producers and consumers can publish messages to and consume messages from any cluster in a Pulsar instance. Subscriptions are not limited to the cluster where they are created: after replicated subscription is enabled, they can also be transferred between clusters. Once replicated subscription is enabled, you can keep subscription state in synchronization. Therefore, a topic can be asynchronously replicated across multiple geographical regions. In case of failover, a consumer can restart consuming messages from the failure point in a different cluster. + +In the aforementioned example, the **T1** topic is replicated among three clusters, **Cluster-A**, **Cluster-B**, and **Cluster-C**. + +All messages produced in any of the three clusters are delivered to all subscriptions in other clusters. In this case, **C1** and **C2** consumers receive all messages that **P1**, **P2**, and **P3** producers publish. 
Ordering is still guaranteed on a per-producer basis. + +## Configure replication + +As stated in [Geo-replication and Pulsar properties](#geo-replication-and-pulsar-properties) section, geo-replication in Pulsar is managed at the [tenant](reference-terminology.md#tenant) level. + +The following example connects three clusters: **us-east**, **us-west**, and **us-cent**. + +### Connect replication clusters + +To replicate data among clusters, you need to configure each cluster to connect to the other. You can use the [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) tool to create a connection. + +**Example** + +Suppose that you have 3 replication clusters: `us-west`, `us-cent`, and `us-east`. + +1. Configure the connection from `us-west` to `us-east`. + + Run the following command on `us-west`. + +```shell + +$ bin/pulsar-admin clusters create \ + --broker-url pulsar://: \ + --url http://: \ + us-east + +``` + + :::tip + + - If you want to use a secure connection for a cluster, you can use the flags `--broker-url-secure` and `--url-secure`. For more information, see [pulsar-admin clusters create](https://pulsar.apache.org/tools/pulsar-admin/). + - Different clusters may have different authentications. You can use the authentication flag `--auth-plugin` and `--auth-parameters` together to set cluster authentication, which overrides `brokerClientAuthenticationPlugin` and `brokerClientAuthenticationParameters` if `authenticationEnabled` sets to `true` in `broker.conf` and `standalone.conf`. For more information, see [authentication and authorization](concepts-authentication.md). + + ::: + +2. Configure the connection from `us-west` to `us-cent`. + + Run the following command on `us-west`. + +```shell + +$ bin/pulsar-admin clusters create \ + --broker-url pulsar://: \ + --url http://: \ + us-cent + +``` + +3. Run similar commands on `us-east` and `us-cent` to create connections among clusters. 
+ +### Grant permissions to properties + +To replicate to a cluster, the tenant needs permission to use that cluster. You can grant permission to the tenant when you create the tenant or grant it later. + +Specify all the intended clusters when you create a tenant: + +```shell + +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east,us-cent + +``` + +To update permissions of an existing tenant, use `update` instead of `create`. + +### Enable geo-replication namespaces + +You can create a namespace with the following command sample. + +```shell + +$ bin/pulsar-admin namespaces create my-tenant/my-namespace + +``` + +Initially, the namespace is not assigned to any cluster. You can assign the namespace to clusters using the `set-clusters` subcommand: + +```shell + +$ bin/pulsar-admin namespaces set-clusters my-tenant/my-namespace \ + --clusters us-west,us-east,us-cent + +``` + +You can change the replication clusters for a namespace at any time, without disruption to ongoing traffic. Replication channels are immediately set up or stopped in all clusters as soon as the configuration changes. + +### Use topics with geo-replication + +Once you create a geo-replication namespace, any topics that producers or consumers create within that namespace are replicated across clusters. Typically, each application uses the `serviceUrl` for the local cluster. + +#### Selective replication + +By default, messages are replicated to all clusters configured for the namespace. You can restrict replication selectively by specifying a replication list for a message, and then that message is replicated only to the subset in the replication list. + +The following is an example for the [Java API](client-libraries-java.md).
Note the use of the `setReplicationClusters` method when you construct the {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} object: + +```java + +List restrictReplicationTo = Arrays.asList( + "us-west", + "us-east" +); + +Producer producer = client.newProducer() + .topic("some-topic") + .create(); + +producer.newMessage() + .value("my-payload".getBytes()) + .setReplicationClusters(restrictReplicationTo) + .send(); + +``` + +#### Topic stats + +Topic-specific statistics for geo-replication topics are available via the [`pulsar-admin`](reference-pulsar-admin.md) tool and {@inject: rest:REST:/} API: + +```shell + +$ bin/pulsar-admin persistent stats persistent://my-tenant/my-namespace/my-topic + +``` + +Each cluster reports its own local stats, including the incoming and outgoing replication rates and backlogs. + +#### Delete a geo-replication topic + +Given that geo-replication topics exist in multiple regions, directly deleting a geo-replication topic is not possible. Instead, you should rely on automatic topic garbage collection. + +In Pulsar, a topic is automatically deleted when the topic meets the following three conditions: +- no producers or consumers are connected to it; +- no subscriptions to it; +- no more messages are kept for retention. +For geo-replication topics, each region uses a fault-tolerant mechanism to decide when deleting the topic locally is safe. + +You can explicitly disable topic garbage collection by setting `brokerDeleteInactiveTopicsEnabled` to `false` in your [broker configuration](reference-configuration.md#broker). + +To delete a geo-replication topic, close all producers and consumers on the topic, and delete all of its local subscriptions in every replication cluster. When Pulsar determines that no valid subscription for the topic remains across the system, it will garbage collect the topic. 
+ +## Replicated subscriptions + +Pulsar supports replicated subscriptions, so you can keep subscription state in sync, within a sub-second timeframe, in the context of a topic that is being asynchronously replicated across multiple geographical regions. + +In case of failover, a consumer can restart consuming from the failure point in a different cluster. + +### Enable replicated subscription + +Replicated subscription is disabled by default. You can enable replicated subscription when creating a consumer. + +```java + +Consumer consumer = client.newConsumer(Schema.STRING) + .topic("my-topic") + .subscriptionName("my-subscription") + .replicateSubscriptionState(true) + .subscribe(); + +``` + +### Advantages + + * It is easy to implement the logic. + * You can choose to enable or disable replicated subscription. + * When you enable it, the overhead is low, and it is easy to configure. + * When you disable it, the overhead is zero. + +### Limitations + +When you enable replicated subscription, you're creating a consistent distributed snapshot to establish an association between message ids from different clusters. The snapshots are taken periodically. The default value is `1 second`. It means that a consumer failing over to a different cluster can potentially receive 1 second of duplicates. You can also configure the frequency of the snapshot in the `broker.conf` file. 
diff --git a/site2/website/versioned_docs/version-2.9.x/administration-isolation.md b/site2/website/versioned_docs/version-2.9.x/administration-isolation.md new file mode 100644 index 0000000000000..d2de042a2e741 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-isolation.md @@ -0,0 +1,115 @@ +--- +id: administration-isolation +title: Pulsar isolation +sidebar_label: "Pulsar isolation" +original_id: administration-isolation +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +In an organization, a Pulsar instance provides services to multiple teams. When organizing the resources across multiple teams, you want to make a suitable isolation plan to avoid the resource competition between different teams and applications and provide high-quality messaging service. In this case, you need to take resource isolation into consideration and weigh your intended actions against expected and unexpected consequences. + +To enforce resource isolation, you can use the Pulsar isolation policy, which allows you to allocate resources (**broker** and **bookie**) for the namespace. + +## Broker isolation + +In Pulsar, when namespaces (more specifically, namespace bundles) are assigned dynamically to brokers, the namespace isolation policy limits the set of brokers that can be used for assignment. Before topics are assigned to brokers, you can set the namespace isolation policy with a primary or a secondary regex to select desired brokers. + +You can set a namespace isolation policy for a cluster using one of the following methods. + +````mdx-code-block + + + + +``` + +pulsar-admin ns-isolation-policy set options + +``` + +For more information about the command `pulsar-admin ns-isolation-policy set options`, see [here](https://pulsar.apache.org/tools/pulsar-admin/). 
+ +**Example** + +```shell + +bin/pulsar-admin ns-isolation-policy set \ +--auto-failover-policy-type min_available \ +--auto-failover-policy-params min_limit=1,usage_threshold=80 \ +--namespaces my-tenant/my-namespace \ +--primary 10.193.216.* my-cluster policy-name + +``` + + + + +[PUT /admin/v2/namespaces/{tenant}/{namespace}](https://pulsar.apache.org/admin-rest-api/?version=master&apiversion=v2#operation/createNamespace) + + + + +For how to set namespace isolation policy using Java admin API, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/NamespacesImpl.java#L251). + + + + +```` + +## Bookie isolation + +A namespace can be isolated into user-defined groups of bookies, which guarantees all the data that belongs to the namespace is stored in desired bookies. The bookie affinity group uses the BookKeeper [rack-aware placement policy](https://bookkeeper.apache.org/docs/latest/api/javadoc/org/apache/bookkeeper/client/EnsemblePlacementPolicy.html) and it is a way to feed rack information which is stored as JSON format in znode. + +You can set a bookie affinity group using one of the following methods. + +````mdx-code-block + + + + +``` + +pulsar-admin namespaces set-bookie-affinity-group options + +``` + +For more information about the command `pulsar-admin namespaces set-bookie-affinity-group options`, see [here](https://pulsar.apache.org/tools/pulsar-admin/). 
+ +**Example** + +```shell + +bin/pulsar-admin bookies set-bookie-rack \ +--bookie 127.0.0.1:3181 \ +--hostname 127.0.0.1:3181 \ +--group group-bookie1 \ +--rack rack1 + +bin/pulsar-admin namespaces set-bookie-affinity-group public/default \ +--primary-group group-bookie1 + +``` + + + + +[POST /admin/v2/namespaces/{tenant}/{namespace}/persistence/bookieAffinity](https://pulsar.apache.org/admin-rest-api/?version=master&apiversion=v2#operation/setBookieAffinityGroup) + + + + +For how to set bookie affinity group for a namespace using Java admin API, see [here](https://github.com/apache/pulsar/blob/master/pulsar-client-admin/src/main/java/org/apache/pulsar/client/admin/internal/NamespacesImpl.java#L1164). + + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/administration-load-balance.md b/site2/website/versioned_docs/version-2.9.x/administration-load-balance.md new file mode 100644 index 0000000000000..788c84a59317b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-load-balance.md @@ -0,0 +1,250 @@ +--- +id: administration-load-balance +title: Pulsar load balance +sidebar_label: "Load balance" +original_id: administration-load-balance +--- + +## Load balance across Pulsar brokers + +Pulsar is an horizontally scalable messaging system, so the traffic in a logical cluster must be balanced across all the available Pulsar brokers as evenly as possible, which is a core requirement. + +You can use multiple settings and tools to control the traffic distribution which require a bit of context to understand how the traffic is managed in Pulsar. Though, in most cases, the core requirement mentioned above is true out of the box and you should not worry about it. + +## Pulsar load manager architecture + +The following part introduces the basic architecture of the Pulsar load manager. + +### Assign topics to brokers dynamically + +Topics are dynamically assigned to brokers based on the load conditions of all brokers in the cluster. 
+ +When a client starts using new topics that are not assigned to any broker, a process is triggered to choose the best suited broker to acquire ownership of these topics according to the load conditions. + +In case of partitioned topics, different partitions are assigned to different brokers. Here "topic" means either a non-partitioned topic or one partition of a topic. + +The assignment is "dynamic" because the assignment changes quickly. For example, if the broker owning the topic crashes, the topic is reassigned immediately to another broker. Another scenario is that the broker owning the topic becomes overloaded. In this case, the topic is reassigned to a less loaded broker. + +The stateless nature of brokers makes the dynamic assignment possible, so you can quickly expand or shrink the cluster based on usage. + +#### Assignment granularity + +The assignment of topics or partitions to brokers is not done at the topics or partitions level, but done at the Bundle level (a higher level). The reason is to amortize the amount of information that you need to keep track of. Based on CPU, memory, traffic load and other indexes, topics are assigned to a particular broker dynamically. + +Instead of individual topic or partition assignment, each broker takes ownership of a subset of the topics for a namespace. This subset is called a "*bundle*" and effectively this subset is a sharding mechanism. + +The namespace is the "administrative" unit: many config knobs or operations are done at the namespace level. + +For assignment, a namespace is sharded into a list of "bundles", with each bundle comprising a portion of the overall hash range of the namespace. + +Topics are assigned to a particular bundle by taking the hash of the topic name and checking which bundle the hash falls into. + +Each bundle is independent of the others and thus is independently assigned to different brokers.
+ +### Create namespaces and bundles + +When you create a new namespace, the new namespace is set to use the default number of bundles. You can set this in `conf/broker.conf`: + +```properties + +# When a namespace is created without specifying the number of bundle, this +# value will be used as the default +defaultNumberOfNamespaceBundles=4 + +``` + +You can either change the system default, or override it when you create a new namespace: + +```shell + +$ bin/pulsar-admin namespaces create my-tenant/my-namespace --clusters us-west --bundles 16 + +``` + +With this command, you create a namespace with 16 initial bundles. Therefore the topics for this namespace can immediately be spread across up to 16 brokers. + +In general, if you know the expected traffic and number of topics in advance, you had better start with a reasonable number of bundles instead of waiting for the system to auto-correct the distribution. + +On the same note, it is beneficial to start with more bundles than the number of brokers, because of the hashing nature of the distribution of topics into bundles. For example, for a namespace with 1000 topics, using something like 64 bundles achieves a good distribution of traffic across 16 brokers. + +### Unload topics and bundles + +You can "unload" a topic in Pulsar with admin operation. Unloading means to close the topics, release ownership and reassign the topics to a new broker, based on current load. + +When unloading happens, the client experiences a small latency blip, typically in the order of tens of milliseconds, while the topic is reassigned. + +Unloading is the mechanism that the load-manager uses to perform the load shedding, but you can also trigger the unloading manually, for example to correct the assignments and redistribute traffic even before having any broker overloaded.
+ +Unloading a topic has no effect on the assignment, but just closes and reopens the particular topic: + +```shell + +pulsar-admin topics unload persistent://tenant/namespace/topic + +``` + +To unload all topics for a namespace and trigger reassignments: + +```shell + +pulsar-admin namespaces unload tenant/namespace + +``` + +### Split namespace bundles + +Since the load for the topics in a bundle might change over time and predicting the load might be hard, bundle split is designed to deal with these issues. The broker splits a bundle into two and the new smaller bundles can be reassigned to different brokers. + +The splitting is based on some tunable thresholds. Any existing bundle that exceeds any of the thresholds is a candidate to be split. By default the newly split bundles are also immediately offloaded to other brokers, to facilitate the traffic distribution. + +You can split namespace bundles in two ways, by setting `supportedNamespaceBundleSplitAlgorithms` to `range_equally_divide` or `topic_count_equally_divide` in the `broker.conf` file. The former splits the bundle into two parts with the same hash range size; the latter splits the bundle into two parts with the same number of topics. You can also configure other parameters for namespace bundles.
+ +```properties + +# enable/disable namespace bundle auto split +loadBalancerAutoBundleSplitEnabled=true + +# enable/disable automatic unloading of split bundles +loadBalancerAutoUnloadSplitBundlesEnabled=true + +# maximum topics in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxTopics=1000 + +# maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxSessions=1000 + +# maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxMsgRate=30000 + +# maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxBandwidthMbytes=100 + +# maximum number of bundles in a namespace (for auto-split) +loadBalancerNamespaceMaximumBundles=128 + +``` + +### Shed load automatically + +The support for automatic load shedding is available in the load manager of Pulsar. This means that whenever the system recognizes a particular broker is overloaded, the system forces some traffic to be reassigned to less loaded brokers. + +When a broker is identified as overloaded, the broker forces to "unload" a subset of the bundles, the ones with higher traffic, that make up for the overload percentage. + +For example, the default threshold is 85% and if a broker is over quota at 95% CPU usage, then the broker unloads the percent difference plus a 5% margin: `(95% - 85%) + 5% = 15%`. + +Given the selection of bundles to offload is based on traffic (as a proxy measure for cpu, network and memory), broker unloads bundles for at least 15% of traffic. + +The automatic load shedding is enabled by default and you can disable the automatic load shedding with this setting: + +```properties + +# Enable/disable automatic bundle unloading for load-shedding +loadBalancerSheddingEnabled=true + +``` + +Additional settings that apply to shedding: + +```properties + +# Load shedding interval. 
Broker periodically checks whether some traffic should be offloaded from +# some over-loaded broker to other under-loaded brokers +loadBalancerSheddingIntervalMinutes=1 + +# Prevent the same topics to be shed and moved to other brokers more than once within this timeframe +loadBalancerSheddingGracePeriodMinutes=30 + +``` + +#### Broker overload thresholds + +The determination of when a broker is overloaded is based on thresholds of CPU, network and memory usage. Whenever any of those metrics reaches the threshold, the system triggers the shedding (if enabled). + +By default, overload threshold is set at 85%: + +```properties + +# Usage threshold to determine a broker as over-loaded +loadBalancerBrokerOverloadedThresholdPercentage=85 + +``` + +Pulsar gathers the usage stats from the system metrics. + +In case of network utilization, in some cases the network interface speed that Linux reports is not correct and needs to be manually overridden. This is the case in AWS EC2 instances with 1Gbps NIC speed for which the OS reports 10Gbps speed. + +Because of the incorrect max speed, the Pulsar load manager might think the broker has not reached the NIC capacity, while in fact the broker already uses all the bandwidth and the traffic is slowed down. + +You can use the following setting to correct the max NIC speed: + +```properties + +# Override the auto-detection of the network interfaces max speed. +# This option is useful in some environments (eg: EC2 VMs) where the max speed +# reported by Linux is not reflecting the real bandwidth available to the broker. +# Since the network usage is employed by the load manager to decide when a broker +# is overloaded, it is important to make sure the info is correct or override it +# with the right value here. The configured value can be a double (eg: 0.8) and that +# can be used to trigger load-shedding even before hitting on NIC limits.
+loadBalancerOverrideBrokerNicSpeedGbps= + +``` + +When the value is empty, Pulsar uses the value that the OS reports. + +### Distribute anti-affinity namespaces across failure domains + +When your application has multiple namespaces and you want one of them available all the time to avoid any downtime, you can group these namespaces and distribute them across different [failure domains](reference-terminology.md#failure-domain) and different brokers. Thus, if one of the failure domains is down (due to release rollout or brokers restart), it only disrupts namespaces owned by that specific failure domain and the rest of the namespaces owned by other domains remain available without any impact. + +Such a group of namespaces has anti-affinity to each other, that is, all the namespaces in this group are [anti-affinity namespaces](reference-terminology.md#anti-affinity-namespaces) and are distributed to different failure domains in a load-balanced manner. + +As illustrated in the following figure, Pulsar has 2 failure domains (Domain1 and Domain2) and each domain has 2 brokers in it. You can create an anti-affinity namespace group that has 4 namespaces in it, and all the 4 namespaces have anti-affinity to each other. The load manager tries to distribute namespaces evenly across all the brokers in the same domain. Since each domain has 2 brokers, every broker owns one namespace from this anti-affinity namespace group, and you can see each domain owns 2 namespaces, and each broker owns 1 namespace. + +![Distribute anti-affinity namespaces across failure domains](/assets/anti-affinity-namespaces-across-failure-domains.svg) + +The load manager follows an even distribution policy across failure domains to assign anti-affinity namespaces. The following table outlines the even-distributed assignment sequence illustrated in the above figure. 
+ +| Assignment sequence | Namespace | Failure domain candidates | Broker candidates | Selected broker | +|:---|:------------|:------------------|:------------------------------------|:-----------------| +| 1 | Namespace1 | Domain1, Domain2 | Broker1, Broker2, Broker3, Broker4 | Domain1:Broker1 | +| 2 | Namespace2 | Domain2 | Broker3, Broker4 | Domain2:Broker3 | +| 3 | Namespace3 | Domain1, Domain2 | Broker2, Broker4 | Domain1:Broker2 | +| 4 | Namespace4 | Domain2 | Broker4 | Domain2:Broker4 | + +:::tip + +* Each namespace belongs to only one anti-affinity group. If a namespace with an existing anti-affinity assignment is assigned to another anti-affinity group, the original assignment is dropped. + +* If there are more anti-affinity namespaces than failure domains, the load manager distributes namespaces evenly across all the domains, and also every domain distributes namespaces evenly across all the brokers under that domain. + +::: + +#### Create a failure domain and register brokers + +:::note + +One broker can only be registered to a single failure domain. + +::: + +To create a domain under a specific cluster and register brokers, run the following command: + +```bash + +pulsar-admin clusters create-failure-domain --domain-name --broker-list + +``` + +You can also view, update, and delete domains under a specific cluster. For more information, refer to [Pulsar admin doc](/tools/pulsar-admin/). + +#### Create an anti-affinity namespace group + +An anti-affinity group is created automatically when the first namespace is assigned to the group. To assign a namespace to an anti-affinity group, run the following command. It sets an anti-affinity group name for a namespace. + +```bash + +pulsar-admin namespaces set-anti-affinity-group --group + +``` + +For more information about `anti-affinity-group` related commands, refer to [Pulsar admin doc](/tools/pulsar-admin/). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/administration-proxy.md b/site2/website/versioned_docs/version-2.9.x/administration-proxy.md new file mode 100644 index 0000000000000..1657e4f88ce82 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-proxy.md @@ -0,0 +1,86 @@ +--- +id: administration-proxy +title: Pulsar proxy +sidebar_label: "Pulsar proxy" +original_id: administration-proxy +--- + +Pulsar proxy is an optional gateway. Pulsar proxy is used when direct connections between clients and Pulsar brokers are either infeasible or undesirable. For example, when you run Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform, you can run Pulsar proxy. + +## Configure the proxy + +Before using the proxy, you need to configure it with the broker addresses in the cluster. You can configure the proxy to connect directly to service discovery, or specify a broker URL in the configuration. + +### Use service discovery + +Pulsar uses [ZooKeeper](https://zookeeper.apache.org) for service discovery. To connect the proxy to ZooKeeper, specify the following in `conf/proxy.conf`. + +```properties + +zookeeperServers=zk-0,zk-1,zk-2 +configurationStoreServers=zk-0:2184,zk-remote:2184 + +``` + +> To use service discovery, you need to open the network ACLs, so the proxy can connect to the ZooKeeper nodes through the ZooKeeper client port (port `2181`) and the configuration store client port (port `2184`). + +> However, it is not secure to use service discovery, because if the network ACL is open and someone compromises a proxy, they have full access to ZooKeeper. + +### Use broker URLs + +It is more secure to specify a URL to connect to the brokers. + +Proxy authorization requires access to ZooKeeper, so if you use these broker URLs to connect to the brokers, you need to disable authorization at the Proxy level.
Brokers still authorize requests after the proxy forwards them. + +You can configure the broker URLs in `conf/proxy.conf` as follows. + +```properties + +brokerServiceURL=pulsar://brokers.example.com:6650 +brokerWebServiceURL=http://brokers.example.com:8080 +functionWorkerWebServiceURL=http://function-workers.example.com:8080 + +``` + +If you use TLS, configure the broker URLs in the following way: + +```properties + +brokerServiceURLTLS=pulsar+ssl://brokers.example.com:6651 +brokerWebServiceURLTLS=https://brokers.example.com:8443 +functionWorkerWebServiceURL=https://function-workers.example.com:8443 + +``` + +The hostname in the URLs provided should be a DNS entry which points to multiple brokers or a virtual IP address, which is backed by multiple broker IP addresses, so that the proxy does not lose connectivity to Pulsar cluster if a single broker becomes unavailable. + +The ports to connect to the brokers (6650 and 8080, or in the case of TLS, 6651 and 8443) should be open in the network ACLs. + +Note that if you do not use functions, you do not need to configure `functionWorkerWebServiceURL`. + +## Start the proxy + +To start the proxy: + +```bash + +$ cd /path/to/pulsar/directory +$ bin/pulsar proxy + +``` + +> You can run multiple instances of the Pulsar proxy in a cluster. + +## Stop the proxy + +Pulsar proxy runs in the foreground by default. To stop the proxy, simply stop the process in which the proxy is running. + +## Proxy frontends + +You can run Pulsar proxy behind some kind of load-distributing frontend, such as an [HAProxy](https://www.digitalocean.com/community/tutorials/an-introduction-to-haproxy-and-load-balancing-concepts) load balancer. + +## Use Pulsar clients with the proxy + +Once your Pulsar proxy is up and running, preferably behind a load-distributing [frontend](#proxy-frontends), clients can connect to the proxy via whichever address that the frontend uses. 
If the address is the DNS address `pulsar.cluster.default`, for example, the connection URL for clients is `pulsar://pulsar.cluster.default:6650`. + +For more information on Proxy configuration, refer to [Pulsar proxy](reference-configuration.md#pulsar-proxy). diff --git a/site2/website/versioned_docs/version-2.9.x/administration-pulsar-manager.md b/site2/website/versioned_docs/version-2.9.x/administration-pulsar-manager.md new file mode 100644 index 0000000000000..d877cce723e6a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-pulsar-manager.md @@ -0,0 +1,205 @@ +--- +id: administration-pulsar-manager +title: Pulsar Manager +sidebar_label: "Pulsar Manager" +original_id: administration-pulsar-manager +--- + +Pulsar Manager is a web-based GUI management and monitoring tool that helps administrators and users manage and monitor tenants, namespaces, topics, subscriptions, brokers, clusters, and so on, and supports dynamic configuration of multiple environments. + +:::note + +If you are monitoring your current stats with Pulsar dashboard, we recommend you use Pulsar Manager instead. Pulsar dashboard is deprecated. + +::: + +## Install + +The easiest way to use the Pulsar Manager is to run it inside a [Docker](https://www.docker.com/products/docker) container. + +```shell + +docker pull apachepulsar/pulsar-manager:v0.2.0 +docker run -it \ + -p 9527:9527 -p 7750:7750 \ + -e SPRING_CONFIGURATION_FILE=/pulsar-manager/pulsar-manager/application.properties \ + apachepulsar/pulsar-manager:v0.2.0 + +``` + +* `SPRING_CONFIGURATION_FILE`: Default configuration file for spring. 
+ +### Set administrator account and password + + ```shell + + CSRF_TOKEN=$(curl http://localhost:7750/pulsar-manager/csrf-token) + curl \ + -H 'X-XSRF-TOKEN: $CSRF_TOKEN' \ + -H 'Cookie: XSRF-TOKEN=$CSRF_TOKEN;' \ + -H "Content-Type: application/json" \ + -X PUT http://localhost:7750/pulsar-manager/users/superuser \ + -d '{"name": "admin", "password": "apachepulsar", "description": "test", "email": "username@test.org"}' + + ``` + +You can find the docker image in the [Docker Hub](https://github.com/apache/pulsar-manager/tree/master/docker) directory and build an image from the source code as well: + +``` + +git clone https://github.com/apache/pulsar-manager +cd pulsar-manager/front-end +npm install --save +npm run build:prod +cd .. +./gradlew build -x test +cd .. +docker build -f docker/Dockerfile --build-arg BUILD_DATE=`date -u +"%Y-%m-%dT%H:%M:%SZ"` --build-arg VCS_REF=`latest` --build-arg VERSION=`latest` -t apachepulsar/pulsar-manager . + +``` + +### Use custom databases + +If you have a large amount of data, you can use a custom database. The following is an example of PostgreSQL. + +1. Initialize database and table structures using the [file](https://github.com/apache/pulsar-manager/tree/master/src/main/resources/META-INF/sql/postgresql-schema.sql). + +2. Modify the [configuration file](https://github.com/apache/pulsar-manager/blob/master/src/main/resources/application.properties) and add PostgreSQL configuration. + +``` + +spring.datasource.driver-class-name=org.postgresql.Driver +spring.datasource.url=jdbc:postgresql://127.0.0.1:5432/pulsar_manager +spring.datasource.username=postgres +spring.datasource.password=postgres + +``` + +3. Compile to generate a new executable jar package. + +``` + +./gradlew build -x test + +``` + +### Enable JWT authentication + +If you want to turn on JWT authentication, configure the following parameters: + +* `backend.jwt.token`: token for the superuser. You need to configure this parameter during cluster initialization. 
+* `jwt.broker.token.mode`: multiple modes of generating token, including PUBLIC, PRIVATE, and SECRET. +* `jwt.broker.public.key`: configure this option if you use the PUBLIC mode. +* `jwt.broker.private.key`: configure this option if you use the PRIVATE mode. +* `jwt.broker.secret.key`: configure this option if you use the SECRET mode. + +For more information, see [Token Authentication Admin of Pulsar](http://pulsar.apache.org/docs/en/security-token-admin/). + + +If you want to enable JWT authentication, use one of the following methods. + + +* Method 1: use command-line tool + +``` + +wget https://dist.apache.org/repos/dist/release/pulsar/pulsar-manager/pulsar-manager-0.2.0/apache-pulsar-manager-0.2.0-bin.tar.gz +tar -zxvf apache-pulsar-manager-0.2.0-bin.tar.gz +cd pulsar-manager +tar -zxvf pulsar-manager.tar +cd pulsar-manager +cp -r ../dist ui +./bin/pulsar-manager --redirect.host=http://localhost --redirect.port=9527 insert.stats.interval=600000 --backend.jwt.token=token --jwt.broker.token.mode=PRIVATE --jwt.broker.private.key=file:///path/broker-private.key --jwt.broker.public.key=file:///path/broker-public.key + +``` + +Firstly, [set the administrator account and password](#set-administrator-account-and-password) + +Secondly, log in to Pulsar manager through http://localhost:7750/ui/index.html. + +* Method 2: configure the application.properties file + +``` + +backend.jwt.token=token + +jwt.broker.token.mode=PRIVATE +jwt.broker.public.key=file:///path/broker-public.key +jwt.broker.private.key=file:///path/broker-private.key + +or +jwt.broker.token.mode=SECRET +jwt.broker.secret.key=file:///path/broker-secret.key + +``` + +* Method 3: use Docker and enable token authentication. 
+ +``` + +export JWT_TOKEN="your-token" +docker run -it -p 9527:9527 -p 7750:7750 -e REDIRECT_HOST=http://localhost -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -v $PWD:/data apachepulsar/pulsar-manager:v0.2.0 /bin/sh + +``` + +* `JWT_TOKEN`: the token of superuser configured for the broker. It is generated by the `bin/pulsar tokens create --secret-key` or `bin/pulsar tokens create --private-key` command. +* `REDIRECT_HOST`: the IP address of the front-end server. +* `REDIRECT_PORT`: the port of the front-end server. +* `DRIVER_CLASS_NAME`: the driver class name of the PostgreSQL database. +* `URL`: the JDBC URL of your PostgreSQL database, such as jdbc:postgresql://127.0.0.1:5432/pulsar_manager. The docker image automatically start a local instance of the PostgreSQL database. +* `USERNAME`: the username of PostgreSQL. +* `PASSWORD`: the password of PostgreSQL. +* `LOG_LEVEL`: the level of log. + +* Method 4: use Docker and turn on **token authentication** and **token management** by private key and public key. + +``` + +export JWT_TOKEN="your-token" +export PRIVATE_KEY="file:///pulsar-manager/secret/my-private.key" +export PUBLIC_KEY="file:///pulsar-manager/secret/my-public.key" +docker run -it -p 9527:9527 -p 7750:7750 -e REDIRECT_HOST=http://localhost -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -e PRIVATE_KEY=$PRIVATE_KEY -e PUBLIC_KEY=$PUBLIC_KEY -v $PWD:/data -v $PWD/secret:/pulsar-manager/secret apachepulsar/pulsar-manager:v0.2.0 /bin/sh + +``` + +* `JWT_TOKEN`: the token of superuser configured for the broker. It is generated by the `bin/pulsar tokens create --private-key` command. 
+* `PRIVATE_KEY`: private key path mounted in container, generated by `bin/pulsar tokens create-key-pair` command. +* `PUBLIC_KEY`: public key path mounted in container, generated by `bin/pulsar tokens create-key-pair` command. +* `$PWD/secret`: the folder where the private key and public key generated by the `bin/pulsar tokens create-key-pair` command are placed locally +* `REDIRECT_HOST`: the IP address of the front-end server. +* `REDIRECT_PORT`: the port of the front-end server. +* `DRIVER_CLASS_NAME`: the driver class name of the PostgreSQL database. +* `URL`: the JDBC URL of your PostgreSQL database, such as jdbc:postgresql://127.0.0.1:5432/pulsar_manager. The docker image automatically start a local instance of the PostgreSQL database. +* `USERNAME`: the username of PostgreSQL. +* `PASSWORD`: the password of PostgreSQL. +* `LOG_LEVEL`: the level of log. + +* Method 5: use Docker and turn on **token authentication** and **token management** by secret key. + +``` + +export JWT_TOKEN="your-token" +export SECRET_KEY="file:///pulsar-manager/secret/my-secret.key" +docker run -it -p 9527:9527 -p 7750:7750 -e REDIRECT_HOST=http://localhost -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -e SECRET_KEY=$SECRET_KEY -v $PWD:/data -v $PWD/secret:/pulsar-manager/secret apachepulsar/pulsar-manager:v0.2.0 /bin/sh + +``` + +* `JWT_TOKEN`: the token of superuser configured for the broker. It is generated by the `bin/pulsar tokens create --secret-key` command. +* `SECRET_KEY`: secret key path mounted in container, generated by `bin/pulsar tokens create-secret-key` command. +* `$PWD/secret`: the folder where the secret key generated by the `bin/pulsar tokens create-secret-key` command are placed locally +* `REDIRECT_HOST`: the IP address of the front-end server. +* `REDIRECT_PORT`: the port of the front-end server. 
+* `DRIVER_CLASS_NAME`: the driver class name of the PostgreSQL database. +* `URL`: the JDBC URL of your PostgreSQL database, such as jdbc:postgresql://127.0.0.1:5432/pulsar_manager. The docker image automatically start a local instance of the PostgreSQL database. +* `USERNAME`: the username of PostgreSQL. +* `PASSWORD`: the password of PostgreSQL. +* `LOG_LEVEL`: the level of log. + +* For more information about backend configurations, see [here](https://github.com/apache/pulsar-manager/blob/master/src/README). +* For more information about frontend configurations, see [here](https://github.com/apache/pulsar-manager/tree/master/front-end). + +## Log in + +[Set the administrator account and password](#set-administrator-account-and-password). + +Visit http://localhost:9527 to log in. diff --git a/site2/website/versioned_docs/version-2.9.x/administration-stats.md b/site2/website/versioned_docs/version-2.9.x/administration-stats.md new file mode 100644 index 0000000000000..ac0c03602f36d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-stats.md @@ -0,0 +1,64 @@ +--- +id: administration-stats +title: Pulsar stats +sidebar_label: "Pulsar statistics" +original_id: administration-stats +--- + +## Partitioned topics + +|Stat|Description| +|---|---| +|msgRateIn| The sum of publish rates of all local and replication publishers in messages per second.| +|msgThroughputIn| Same as msgRateIn but in bytes per second instead of messages per second.| +|msgRateOut| The sum of dispatch rates of all local and replication consumers in messages per second.| +|msgThroughputOut| Same as msgRateOut but in bytes per second instead of messages per second.| +|averageMsgSize| Average message size, in bytes, from this publisher within the last interval.| +|storageSize| The sum of storage size of the ledgers for this topic.| +|publishers| The list of all local publishers into the topic. 
Publishers can be anywhere from zero to thousands.| +|producerId| Internal identifier for this producer on this topic.| +|producerName| Internal identifier for this producer, generated by the client library.| +|address| IP address and source port for the connection of this producer.| +|connectedSince| Timestamp this producer is created or last reconnected.| +|subscriptions| The list of all local subscriptions to the topic.| +|my-subscription| The name of this subscription (client defined).| +|msgBacklog| The count of messages in backlog for this subscription.| +|type| This subscription type.| +|msgRateExpired| The rate at which messages are discarded instead of dispatched from this subscription due to TTL.| +|consumers| The list of connected consumers for this subscription.| +|consumerName| Internal identifier for this consumer, generated by the client library.| +|availablePermits| The number of messages this consumer has space for in the listen queue of client library. A value of 0 means the queue of client library is full and receive() is not being called. A nonzero value means this consumer is ready to be dispatched messages.| +|replication| This section gives the stats for cross-colo replication of this topic.| +|replicationBacklog| The outbound replication backlog in messages.| +|connected| Whether the outbound replicator is connected.| +|replicationDelayInSeconds| How long the oldest message has been waiting to be sent through the connection, if connected is true.| +|inboundConnection| The IP and port of the broker in the publisher connection of remote cluster to this broker. | +|inboundConnectedSince| The TCP connection being used to publish messages to the remote cluster. 
If no local publishers are connected, this connection is automatically closed after a minute.| + + +## Topics + +|Stat|Description| +|---|---| +|entriesAddedCounter| Messages published since this broker loads this topic.| +|numberOfEntries| Total number of messages being tracked.| +|totalSize| Total storage size in bytes of all messages.| +|currentLedgerEntries| Count of messages written to the ledger currently open for writing.| +|currentLedgerSize| Size in bytes of messages written to ledger currently open for writing.| +|lastLedgerCreatedTimestamp| Time when last ledger is created.| +|lastLedgerCreationFailureTimestamp| Time when last ledger is failed.| +|waitingCursorsCount| How many cursors are caught up and waiting for a new message to be published.| +|pendingAddEntriesCount| How many messages have (asynchronous) write requests you are waiting on completion.| +|lastConfirmedEntry| The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger is opened or is being currently opened but has no entries written yet.| +|state| The state of the cursor ledger. Open means you have a cursor ledger for saving updates of the markDeletePosition.| +|ledgers| The ordered list of all ledgers for this topic holding its messages.| +|cursors| The list of all cursors on this topic. 
Every subscription you saw in the topic stats has one.| +|markDeletePosition| The ack position: the last message the subscriber acknowledges receiving.| +|readPosition| The latest position of subscriber for reading message.| +|waitingReadOp| This is true when the subscription reads the latest message that is published to the topic and waits on new messages to be published.| +|pendingReadOps| The counter for how many outstanding read requests to the BookKeepers you have in progress.| +|messagesConsumedCounter| Number of messages this cursor acks since this broker loads this topic.| +|cursorLedger| The ledger used to persistently store the current markDeletePosition.| +|cursorLedgerLastEntry| The last entryid used to persistently store the current markDeletePosition.| +|individuallyDeletedMessages| If Acks are done out of order, shows the ranges of messages Acked between the markDeletePosition and the read-position.| +|lastLedgerSwitchTimestamp| The last time the cursor ledger is rolled over.| diff --git a/site2/website/versioned_docs/version-2.9.x/administration-upgrade.md b/site2/website/versioned_docs/version-2.9.x/administration-upgrade.md new file mode 100644 index 0000000000000..72d136b6460f6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-upgrade.md @@ -0,0 +1,168 @@ +--- +id: administration-upgrade +title: Upgrade Guide +sidebar_label: "Upgrade" +original_id: administration-upgrade +--- + +## Upgrade guidelines + +Apache Pulsar is comprised of multiple components, ZooKeeper, bookies, and brokers. These components are either stateful or stateless. You do not have to upgrade ZooKeeper nodes unless you have special requirement. While you upgrade, you need to pay attention to bookies (stateful), brokers and proxies (stateless). + +The following are some guidelines on upgrading a Pulsar cluster. Read the guidelines before upgrading. + +- Backup all your configuration files before upgrading. 
+- Read guide entirely, make a plan, and then execute the plan. When you make upgrade plan, you need to take your specific requirements and environment into consideration. +- Pay attention to the upgrading order of components. In general, you do not need to upgrade your ZooKeeper or configuration store cluster (the global ZooKeeper cluster). You need to upgrade bookies first, and then upgrade brokers, proxies, and your clients. +- If `autorecovery` is enabled, you need to disable `autorecovery` in the upgrade process, and re-enable it after completing the process. +- Read the release notes carefully for each release. Release notes contain features, configuration changes that might impact your upgrade. +- Upgrade a small subset of nodes of each type to canary test the new version before upgrading all nodes of that type in the cluster. When you have upgraded the canary nodes, run for a while to ensure that they work correctly. +- Upgrade one data center to verify new version before upgrading all data centers if your cluster runs in multi-cluster replicated mode. + +> Note: Currently, Apache Pulsar is compatible between versions. + +## Upgrade sequence + +To upgrade an Apache Pulsar cluster, follow the upgrade sequence. + +1. Upgrade ZooKeeper (optional) +- Canary test: test an upgraded version in one or a small set of ZooKeeper nodes. +- Rolling upgrade: rollout the upgraded version to all ZooKeeper servers incrementally, one at a time. Monitor your dashboard during the whole rolling upgrade process. +2. Upgrade bookies +- Canary test: test an upgraded version in one or a small set of bookies. +- Rolling upgrade: + - a. Disable `autorecovery` with the following command. + + ```shell + + bin/bookkeeper shell autorecovery -disable + + ``` + + + - b. Rollout the upgraded version to all bookies in the cluster after you determine that a version is safe after canary. + - c. After you upgrade all bookies, re-enable `autorecovery` with the following command. 
+ + ```shell + + bin/bookkeeper shell autorecovery -enable + + ``` + +3. Upgrade brokers +- Canary test: test an upgraded version in one or a small set of brokers. +- Rolling upgrade: rollout the upgraded version to all brokers in the cluster after you determine that a version is safe after canary. +4. Upgrade proxies +- Canary test: test an upgraded version in one or a small set of proxies. +- Rolling upgrade: rollout the upgraded version to all proxies in the cluster after you determine that a version is safe after canary. + +## Upgrade ZooKeeper (optional) +While you upgrade ZooKeeper servers, you can do canary test first, and then upgrade all ZooKeeper servers in the cluster. + +### Canary test + +You can test an upgraded version in one of ZooKeeper servers before upgrading all ZooKeeper servers in your cluster. + +To upgrade ZooKeeper server to a new version, complete the following steps: + +1. Stop a ZooKeeper server. +2. Upgrade the binary and configuration files. +3. Start the ZooKeeper server with the new binary files. +4. Use `pulsar zookeeper-shell` to connect to the newly upgraded ZooKeeper server and run a few commands to verify if it works as expected. +5. Run the ZooKeeper server for a few days, observe and make sure the ZooKeeper cluster runs well. + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic ZooKeeper node, revert the binary and configuration, and restart the ZooKeeper with the reverted binary. + +### Upgrade all ZooKeeper servers + +After canary test to upgrade one ZooKeeper in your cluster, you can upgrade all ZooKeeper servers in your cluster. + +You can upgrade all ZooKeeper servers one by one by following steps in canary test. + +## Upgrade bookies + +While you upgrade bookies, you can do canary test first, and then upgrade all bookies in the cluster. +For more details, you can read Apache BookKeeper [Upgrade guide](http://bookkeeper.apache.org/docs/latest/admin/upgrade). 
+ +### Canary test + +You can test an upgraded version in one or a small set of bookies before upgrading all bookies in your cluster. + +To upgrade bookie to a new version, complete the following steps: + +1. Stop a bookie. +2. Upgrade the binary and configuration files. +3. Start the bookie in `ReadOnly` mode to verify if the bookie of this new version runs well for read workload. + + ```shell + + bin/pulsar bookie --readOnly + + ``` + +4. When the bookie runs successfully in `ReadOnly` mode, stop the bookie and restart it in `Write/Read` mode. + + ```shell + + bin/pulsar bookie + + ``` + +5. Observe and make sure the cluster serves both write and read traffic. + +#### Canary rollback + +If issues occur during the canary test, you can shut down the problematic bookie node. Other bookies in the cluster replace this problematic bookie node with autorecovery. + +### Upgrade all bookies + +After canary test to upgrade some bookies in your cluster, you can upgrade all bookies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, upgrade one bookie at a time. In a downtime upgrade scenario, shut down the entire cluster, upgrade each bookie, and then start the cluster. + +While you upgrade in both scenarios, the procedure is the same for each bookie. + +1. Stop the bookie. +2. Upgrade the software (either new binary or new configuration files). +3. Start the bookie. + +> **Advanced operations** +> When you upgrade a large BookKeeper cluster in a rolling upgrade scenario, upgrading one bookie at a time is slow. If you configure rack-aware or region-aware placement policy, you can upgrade bookies rack by rack or region by region, which speeds up the whole upgrade process. + +## Upgrade brokers and proxies + +The upgrade procedure for brokers and proxies is the same. Brokers and proxies are `stateless`, so upgrading the two services is easy.
+ +### Canary test + +You can test an upgraded version in one or a small set of nodes before upgrading all nodes in your cluster. + +To upgrade to a new version, complete the following steps: + +1. Stop a broker (or proxy). +2. Upgrade the binary and configuration file. +3. Start a broker (or proxy). + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic broker (or proxy) node. Revert to the old version and restart the broker (or proxy). + +### Upgrade all brokers or proxies + +After canary test to upgrade some brokers or proxies in your cluster, you can upgrade all brokers or proxies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, you can upgrade one broker or one proxy at a time if the size of the cluster is small. If your cluster is large, you can upgrade brokers or proxies in batches. When you upgrade a batch of brokers or proxies, make sure the remaining brokers and proxies in the cluster have enough capacity to handle the traffic during upgrade. + +In a downtime upgrade scenario, shut down the entire cluster, upgrade each broker or proxy, and then start the cluster. + +While you upgrade in both scenarios, the procedure is the same for each broker or proxy. + +1. Stop the broker or proxy. +2. Upgrade the software (either new binary or new configuration files). +3. Start the broker or proxy. 
diff --git a/site2/website/versioned_docs/version-2.9.x/administration-zk-bk.md b/site2/website/versioned_docs/version-2.9.x/administration-zk-bk.md new file mode 100644 index 0000000000000..8f66fd23a678f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/administration-zk-bk.md @@ -0,0 +1,386 @@ +--- +id: administration-zk-bk +title: ZooKeeper and BookKeeper administration +sidebar_label: "ZooKeeper and BookKeeper" +original_id: administration-zk-bk +--- + +Pulsar relies on two external systems for essential tasks: + +* [ZooKeeper](https://zookeeper.apache.org/) is responsible for a wide variety of configuration-related and coordination-related tasks. +* [BookKeeper](http://bookkeeper.apache.org/) is responsible for [persistent storage](concepts-architecture-overview.md#persistent-storage) of message data. + +ZooKeeper and BookKeeper are both open-source [Apache](https://www.apache.org/) projects. + +> Skip to the [How Pulsar uses ZooKeeper and BookKeeper](#how-pulsar-uses-zookeeper-and-bookkeeper) section below for a more schematic explanation of the role of these two systems in Pulsar. + + +## ZooKeeper + +Each Pulsar instance relies on two separate ZooKeeper quorums. + +* [Local ZooKeeper](#deploy-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster. +* [Configuration Store](#deploy-configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +To deploy a Pulsar instance, you need to stand up one local ZooKeeper cluster *per Pulsar cluster*. 
+ +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +On each host, you need to specify the node ID in `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you can set the `myid` value like this: + +```shell + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com` the command is `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start zookeeper + +``` + +### Deploy configuration store + +The ZooKeeper cluster configured and started up in the section above is a *local* ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. 
+ +If you deploy a [single-cluster](#single-cluster-pulsar-instance) instance, you do not need a separate cluster for the configuration store. If, however, you deploy a [multi-cluster](#multi-cluster-pulsar-instance) instance, you need to stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but run on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorum uses to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method for [local ZooKeeper](#local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper). The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 + +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions and that other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can share the same hosts used for the local ZooKeeper quorum. + +For example, you can assume a Pulsar instance with the following clusters `us-west`, `us-east`, `us-central`, `eu-central`, `ap-south`. 
Also you can assume, each cluster has its own local ZK servers named such as + +``` + +zk[1-3].${CLUSTER}.example.com + +``` + +In this scenario you want to pick the quorum participants from few clusters and let all the others be ZK observers. For example, to form a 7 servers quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This guarantees that writes to configuration store is possible even if one of these regions is unreachable. + +The ZK configuration in all the servers looks like: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer + +``` + +Additionally, ZK observers need to have: + +```properties + +peerType=observer + +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) + +```shell + +$ bin/pulsar-daemon start configuration-store + +``` + +### ZooKeeper configuration + +In Pulsar, ZooKeeper configuration is handled by two separate configuration files in the `conf` directory of your Pulsar installation: `conf/zookeeper.conf` for [local ZooKeeper](#local-zookeeper) and `conf/global-zookeeper.conf` for [configuration store](#configuration-store). 
+ +#### Local ZooKeeper + +The [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file handles the configuration for local ZooKeeper. The table below shows the available parameters: + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5| +|dataDir| The location where ZooKeeper stores in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server listens for connections. |2181| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3| +|autopurge.purgeInterval| The time interval, in hours, which triggers the ZooKeeper database purge task. Setting to a non-zero number enables auto purge; setting to 0 disables. Read this guide before enabling auto purge. |1| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + +#### Configuration Store + +The [`conf/global-zookeeper.conf`](reference-configuration.md#configuration-store) file handles the configuration for configuration store. The table below shows the available parameters: + + +## BookKeeper + +BookKeeper stores all durable messages in Pulsar. 
BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that guarantees read consistency of independent message logs called ledgers. Individual BookKeeper servers are also called *bookies*. + +> To manage message persistence, retention, and expiry in Pulsar, refer to [cookbook](cookbooks-retention-expiry.md). + +### Hardware requirements + +Bookie hosts store message data on disk. To provide optimal performance, ensure that the bookies have a suitable hardware configuration. The following are two key dimensions of bookie hardware capacity: + +- Disk I/O capacity read/write +- Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker by default. To ensure low write latency, BookKeeper is designed to use multiple devices: + +- A **journal** to ensure durability. For sequential writes, it is critical to have fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms. +- A **ledger storage device** stores data. Writes happen in the background, so write I/O is not a big concern. Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + +### Configure BookKeeper + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file.
When you configure each bookie, ensure that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for local ZooKeeper of the Pulsar cluster. + +The minimum configuration changes required in `conf/bookkeeper.conf` are as follows: + +:::note + +Set `journalDirectory` and `ledgerDirectories` carefully. It is difficult to change them later. + +::: + +```properties + +# Change to point to journal disk mount point +journalDirectory=data/bookkeeper/journal + +# Point to ledger storage disk mount point +ledgerDirectories=data/bookkeeper/ledgers + +# Point to local ZK quorum +zkServers=zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181 + +#It is recommended to set this parameter. Otherwise, BookKeeper can't start normally in certain environments (for example, Huawei Cloud). +advertisedAddress= + +``` + +To change the ZooKeeper root path that BookKeeper uses, use `zkLedgersRootPath=/MY-PREFIX/ledgers` instead of `zkServers=localhost:2181/MY-PREFIX`. + +> For more information about BookKeeper, refer to the official [BookKeeper docs](http://bookkeeper.apache.org). + +### Deploy BookKeeper + +BookKeeper provides [persistent message storage](concepts-architecture-overview.md#persistent-storage) for Pulsar. Each Pulsar broker has its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster. + +### Start bookies manually + +You can start a bookie in the foreground or as a background daemon.
+ +To start a bookie in the foreground, use the [`bookkeeper`](reference-cli-tools.md#bookkeeper) CLI tool: + +```bash + +$ bin/bookkeeper bookie + +``` + +To start a bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +You can verify whether the bookie works properly with the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell): + +```shell + +$ bin/bookkeeper shell bookiesanity + +``` + +When you use this command, you create a new ledger on the local bookie, write a few entries, read them back and finally delete the ledger. + +### Decommission bookies cleanly + +Before you decommission a bookie, you need to check your environment and meet the following requirements. + +1. Ensure the state of your cluster supports decommissioning the target bookie. Check if `EnsembleSize >= Write Quorum >= Ack Quorum` is `true` with one less bookie. + +2. Ensure the target bookie is listed after using the `listbookies` command. + +3. Ensure that no other process is ongoing (upgrade etc). + +And then you can decommission bookies safely. To decommission bookies, complete the following steps. + +1. Log in to the bookie node, check if there are underreplicated ledgers. The decommission command forces the underreplicated ledgers to be replicated. +`$ bin/bookkeeper shell listunderreplicated` + +2. Stop the bookie by killing the bookie process. Make sure that no liveness/readiness probes are set up for the bookies to spin them back up if you deploy it in a Kubernetes environment. + +3. Run the decommission command. + - If you have logged in to the node to be decommissioned, you do not need to provide `-bookieid`.
+ - If you are running the decommission command for the target bookie node from another bookie node, you should mention the target bookie ID in the arguments for `-bookieid` + `$ bin/bookkeeper shell decommissionbookie` + or + `$ bin/bookkeeper shell decommissionbookie -bookieid ` + +4. Validate that no ledgers are on the decommissioned bookie. +`$ bin/bookkeeper shell listledgers -bookieid ` + +You can run the following command to check if the bookie you have decommissioned is listed in the bookies list: + +```bash + +./bookkeeper shell listbookies -rw -h +./bookkeeper shell listbookies -ro -h + +``` + +## BookKeeper persistence policies + +In Pulsar, you can set *persistence policies* at the namespace level, which determines how BookKeeper handles persistent storage of messages. Policies determine four things: + +* The number of acks (guaranteed copies) to wait for each ledger entry. +* The number of bookies to use for a topic. +* The number of writes to make for each ledger entry. +* The throttling rate for mark-delete operations. + +### Set persistence policies + +You can set persistence policies for BookKeeper at the [namespace](reference-terminology.md#namespace) level. + +#### Pulsar-admin + +Use the [`set-persistence`](reference-pulsar-admin.md#namespaces-set-persistence) subcommand and specify a namespace as well as any policies that you want to apply. 
The available flags are: + +Flag | Description | Default +:----|:------------|:------- +`-a`, `--bookkeeper-ack-quorum` | The number of acks (guaranteed copies) to wait on for each entry | 0 +`-e`, `--bookkeeper-ensemble` | The number of [bookies](reference-terminology.md#bookie) to use for topics in the namespace | 0 +`-w`, `--bookkeeper-write-quorum` | The number of writes to make for each entry | 0 +`-r`, `--ml-mark-delete-max-rate` | Throttling rate for mark-delete operations (0 means no throttle) | 0 + +The following is an example: + +```shell + +$ pulsar-admin namespaces set-persistence my-tenant/my-ns \ + --bookkeeper-ack-quorum 3 \ + --bookkeeper-ensemble 2 + +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/setPersistence?version=@pulsar:version_number@} + +#### Java + +```java + +int bkEnsemble = 2; +int bkQuorum = 3; +int bkAckQuorum = 2; +double markDeleteRate = 0.7; +PersistencePolicies policies = + new PersistencePolicies(bkEnsemble, bkQuorum, bkAckQuorum, markDeleteRate); +admin.namespaces().setPersistence(namespace, policies); + +``` + +### List persistence policies + +You can see which persistence policy currently applies to a namespace. + +#### Pulsar-admin + +Use the [`get-persistence`](reference-pulsar-admin.md#namespaces-get-persistence) subcommand and specify the namespace.
+ +The following is an example: + +```shell + +$ pulsar-admin namespaces get-persistence my-tenant/my-ns +{ + "bookkeeperEnsemble": 1, + "bookkeeperWriteQuorum": 1, + "bookkeeperAckQuorum": 1, + "managedLedgerMaxMarkDeleteRate": 0 +} + +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/getPersistence?version=@pulsar:version_number@} + +#### Java + +```java + +PersistencePolicies policies = admin.namespaces().getPersistence(namespace); + +``` + +## How Pulsar uses ZooKeeper and BookKeeper + +This diagram illustrates the role of ZooKeeper and BookKeeper in a Pulsar cluster: + +![ZooKeeper and BookKeeper](/assets/pulsar-system-architecture.png) + +Each Pulsar cluster consists of one or more message brokers. Each broker relies on an ensemble of bookies. diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-cgo.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-cgo.md new file mode 100644 index 0000000000000..f352f942b7714 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-cgo.md @@ -0,0 +1,579 @@ +--- +id: client-libraries-cgo +title: Pulsar CGo client +sidebar_label: "CGo(deprecated)" +original_id: client-libraries-cgo +--- + +You can use Pulsar Go client to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang). + +All the methods in [producers](#producers), [consumers](#consumers), and [readers](#readers) of a Go client are thread-safe. + +Currently, the following Go clients are maintained in two repositories.
+ +| Language | Project | Maintainer | License | Description | +|----------|---------|------------|---------|-------------| +| CGo | [pulsar-client-go](https://github.com/apache/pulsar/tree/master/pulsar-client-go) | [Apache Pulsar](https://github.com/apache/pulsar) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | CGo client that depends on C++ client library | +| Go | [pulsar-client-go](https://github.com/apache/pulsar-client-go) | [Apache Pulsar](https://github.com/apache/pulsar) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | A native golang client | + +> **API docs available as well** +> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/pulsar/pulsar-client-go/pulsar). + +## Installation + +### Requirements + +Pulsar Go client library is based on the C++ client library. Follow +the instructions for [C++ library](client-libraries-cpp.md) for installing the binaries through [RPM](client-libraries-cpp.md#rpm), [Deb](client-libraries-cpp.md#deb) or [Homebrew packages](client-libraries-cpp.md#macos). + +### Install go package + +> **Compatibility Warning** +> The version number of the Go client **must match** the version number of the Pulsar C++ client library. + +You can install the `pulsar` library locally using `go get`. Note that `go get` doesn't support fetching a specific tag - it will always pull in master's version of the Go client. You'll need a C++ client library that matches master. + +```bash + +$ go get -u github.com/apache/pulsar/pulsar-client-go/pulsar + +``` + +Or you can use [dep](https://github.com/golang/dep) for managing the dependencies. 
+ +```bash + +$ dep ensure -add github.com/apache/pulsar/pulsar-client-go/pulsar@v@pulsar:version@ + +``` + +Once installed locally, you can import it into your project: + +```go + +import "github.com/apache/pulsar/pulsar-client-go/pulsar" + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look something like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). Here's an example: + +```go + +import ( + "log" + "runtime" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + OperationTimeoutSeconds: 5, + MessageListenerThreads: runtime.NumCPU(), + }) + + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } +} + +``` + +The following configurable parameters are available for Pulsar clients: + +Parameter | Description | Default +:---------|:------------|:------- +`URL` | The connection URL for the Pulsar cluster.
See [above](#connection-urls) for more info | +`IOThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker) | 1 +`OperationTimeoutSeconds` | The timeout for some Go client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries will occur until this threshold is reached, at which point the operation will fail. | 30 +`MessageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)) | 1 +`ConcurrentLookupRequests` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 5000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 5000 +`Logger` | A custom logger implementation for the client (as a function that takes a log level, file path, line number, and message). All info, warn, and error messages will be routed to this function. | `nil` +`TLSTrustCertsFilePath` | The file path for the trusted TLS certificate | +`TLSAllowInsecureConnection` | Whether the client accepts untrusted TLS certificates from the broker | `false` +`Authentication` | Configure the authentication provider (default: no authentication). Example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | `nil` +`StatsIntervalInSeconds` | The interval (in seconds) at which client stats are published | 60 + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object.
Here's an example: + +```go + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", +}) + +if err != nil { + log.Fatalf("Could not instantiate Pulsar producer: %v", err) +} + +defer producer.Close() + +msg := pulsar.ProducerMessage{ + Payload: []byte("Hello, Pulsar"), +} + +if err := producer.Send(context.Background(), msg); err != nil { + log.Fatalf("Producer could not send message: %v", err) +} + +``` + +> **Blocking operation** +> When you create a new Pulsar producer, the operation will block (waiting on a go channel) until either a producer is successfully created or an error is thrown. + + +### Producer operations + +Pulsar Go producers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Fetches the producer's [topic](reference-terminology.md#topic)| `string` +`Name()` | Fetches the producer's name | `string` +`Send(context.Context, ProducerMessage)` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be thrown if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | `error` +`SendAndGetMsgID(context.Context, ProducerMessage)`| Send a message, this call will be blocking until is successfully acknowledged by the Pulsar broker. | (MessageID, error) +`SendAsync(context.Context, ProducerMessage, func(ProducerMessage, error))` | Publishes a [message](#messages) to the producer's topic asynchronously. The third argument is a callback function that specifies what happens either when the message is acknowledged or an error is thrown. | +`SendAndGetMsgIDAsync(context.Context, ProducerMessage, func(MessageID, error))`| Send a message in asynchronous mode. 
The callback will report back the message being published and the eventual error in publishing | +`LastSequenceID()` | Get the last sequence id that was published by this producer. This represents either the automatically assigned or custom sequence id (set on the ProducerMessage) that was published and acknowledged by the broker. | int64 +`Flush()`| Flush all the messages buffered in the client and wait until all messages have been successfully persisted. | error +`Close()` | Closes the producer and releases all resources allocated to it. If `Close()` is called then no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. | `error` +`Schema()` | | Schema + +Here's a more involved example usage of a producer: + +```go + +import ( + "context" + "fmt" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client to instantiate a producer + producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", + }) + + if err != nil { log.Fatal(err) } + + ctx := context.Background() + + // Send 10 messages synchronously and 10 messages asynchronously + for i := 0; i < 10; i++ { + // Create a message + msg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("message-%d", i)), + } + + // Attempt to send the message + if err := producer.Send(ctx, msg); err != nil { + log.Fatal(err) + } + + // Create a different message to send asynchronously + asyncMsg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("async-message-%d", i)), + } + + // Attempt to send the message asynchronously and handle the response + producer.SendAsync(ctx, asyncMsg, func(msg pulsar.ProducerMessage, err error) { + if err != nil {
log.Fatal(err) } + + fmt.Printf("the %s successfully published", string(msg.Payload)) + }) + } +} + +``` + +### Producer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer will publish messages | +`Name` | A name for the producer. If you don't explicitly assign a name, Pulsar will automatically generate a globally unique name that you can access later using the `Name()` method. If you choose to explicitly assign a name, it will need to be unique across *all* Pulsar clusters, otherwise the creation operation will throw an error. | +`Properties`| Attach a set of application-defined properties to the producer. These properties will be visible in the topic stats | +`SendTimeout` | When publishing a message to a topic, the producer will wait for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). If a message is not acknowledged within the threshold set by this parameter, an error will be thrown. If you set `SendTimeout` to -1, the timeout will be set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30 seconds +`MaxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `Send` and `SendAsync` methods will fail *unless* `BlockIfQueueFull` is set to `true`. | +`MaxPendingMessagesAcrossPartitions` | Set the number of max pending messages across all the partitions.
This setting will be used to lower the max pending messages for each partition `MaxPendingMessages(int)`, if the total exceeds the configured value.| +`BlockIfQueueFull` | If set to `true`, the producer's `Send` and `SendAsync` methods will block when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `MaxPendingMessages` parameter); if set to `false` (the default), `Send` and `SendAsync` operations will fail and throw a `ProducerQueueIsFullError` when the queue is full. | `false` +`MessageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics)). This logic is applied only when no key is set on messages. The available options are: round robin (`pulsar.RoundRobinDistribution`, the default), publishing all messages to a single partition (`pulsar.UseSinglePartition`), or a custom partitioning scheme (`pulsar.CustomPartition`). | `pulsar.RoundRobinDistribution` +`HashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `pulsar.JavaStringHash` (the equivalent of `String.hashCode()` in Java), `pulsar.Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `pulsar.BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library) | `pulsar.JavaStringHash` +`CompressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), [`ZLIB`](https://zlib.net/), [`ZSTD`](https://facebook.github.io/zstd/) and [`SNAPPY`](https://google.github.io/snappy/). | No compression +`MessageRouter` | By default, Pulsar uses a round-robin routing scheme for [partitioned topics](cookbooks-partitioned.md). 
The `MessageRouter` parameter enables you to specify custom routing logic via a function that takes the Pulsar message and topic metadata as arguments and returns an integer (the index of the partition to publish to), i.e. a function signature of `func(Message, TopicMetadata) int`. | +`Batching` | Control whether automatic batching of messages is enabled for the producer. | false +`BatchingMaxPublishDelay` | Set the time period within which the messages sent will be batched (default: 1ms) if batch messages are enabled. If set to a non zero value, messages will be queued until this time interval has elapsed or until the `BatchingMaxMessages` threshold is reached | 1ms +`BatchingMaxMessages` | Set the maximum number of messages permitted in a batch. (default: 1000) If set to a value greater than 1, messages will be queued until this threshold is reached or batch interval has elapsed | 1000 + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. Here's a basic example that uses channels: + +```go + +msgChannel := make(chan pulsar.ConsumerMessage) + +consumerOpts := pulsar.ConsumerOptions{ + Topic: "my-topic", + SubscriptionName: "my-subscription-1", + Type: pulsar.Exclusive, + MessageChannel: msgChannel, +} + +consumer, err := client.Subscribe(consumerOpts) + +if err != nil { + log.Fatalf("Could not establish subscription: %v", err) +} + +defer consumer.Close() + +for cm := range msgChannel { + msg := cm.Message + + fmt.Printf("Message ID: %s", msg.ID()) + fmt.Printf("Message value: %s", string(msg.Payload())) + + consumer.Ack(msg) +} + +``` + +> **Blocking operation** +> When you create a new Pulsar consumer, the operation will block (on a go channel) until either a consumer is successfully created or an error is thrown.
+ + +### Consumer operations + +Pulsar Go consumers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the consumer's [topic](reference-terminology.md#topic) | `string` +`Subscription()` | Returns the consumer's subscription name | `string` +`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Throws an error if the unsubscribe operation is somehow unsuccessful. | `error` +`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)` +`Ack(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) | `error` +`AckID(MessageID)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID | `error` +`AckCumulative(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `AckCumulative` method will block until the ack has been sent to the broker. After that, the messages will *not* be redelivered to the consumer. Cumulative acking cannot be used with a [shared](concepts-messaging.md#shared) subscription type. | `error` +`AckCumulativeID(MessageID)` |Ack the reception of all the messages in the stream up to (and including) the provided message. This method will block until the acknowledgment has been sent to the broker. After that, the messages will not be re-delivered to this consumer. | error +`Nack(Message)` | Acknowledge the failure to process a single message. | `error` +`NackID(MessageID)` | Acknowledge the failure to process a single message. | `error` +`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | `error` +`RedeliverUnackedMessages()` | Redelivers *all* unacknowledged messages on the topic.
In [failover](concepts-messaging.md#failover) mode, this request is ignored if the consumer isn't active on the specified topic; in [shared](concepts-messaging.md#shared) mode, redelivered messages are distributed across all consumers connected to the topic. **Note**: this is a *non-blocking* operation that doesn't throw an error. | +`Seek(msgID MessageID)` | Reset the subscription associated with this consumer to a specific message id. The message id can either be a specific message or represent the first or last messages in the topic. | error + +#### Receive example + +Here's an example usage of a Go consumer that uses the `Receive()` method to process incoming messages: + +```go + +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client object to instantiate a consumer + consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "my-golang-topic", + SubscriptionName: "sub-1", + Type: pulsar.Exclusive, + }) + + if err != nil { log.Fatal(err) } + + defer consumer.Close() + + ctx := context.Background() + + // Listen indefinitely on the topic + for { + msg, err := consumer.Receive(ctx) + if err != nil { log.Fatal(err) } + + // Do something with the message + err = processMessage(msg) + + if err == nil { + // Message processed successfully + consumer.Ack(msg) + } else { + // Failed to process messages + consumer.Nack(msg) + } + } +} + +``` + +### Consumer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the consumer will establish a subscription and listen for messages | +`Topics` | Specify a list of topics this consumer will subscribe on. 
Either a topic, a list of topics or a topics pattern is required when subscribing | +`TopicsPattern` | Specify a regular expression to subscribe to multiple topics under the same namespace. Either a topic, a list of topics or a topics pattern is required when subscribing | +`SubscriptionName` | The subscription name for this consumer | +`Properties` | Attach a set of application-defined properties to the consumer. These properties will be visible in the topic stats| +`Name` | The name of the consumer | +`AckTimeout` | Set the timeout for unacked messages | 0 +`NackRedeliveryDelay` | The delay after which to redeliver the messages that failed to be processed. Default is 1min. (See `Consumer.Nack()`) | 1 minute +`Type` | Available options are `Exclusive`, `Shared`, and `Failover` | `Exclusive` +`SubscriptionInitPos` | The initial position at which the cursor is set when subscribing | Latest +`MessageChannel` | The Go channel used by the consumer. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `Receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000 +`MaxTotalReceiverQueueSizeAcrossPartitions` |Set the max total receiver queue size across partitions. This setting will be used to reduce the receiver queue size for individual partitions if the total exceeds this value | 50000 +`ReadCompacted` | If enabled, the consumer will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the consumer will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal.
| + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example: + +```go + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.LatestMessage, +}) + +``` + +> **Blocking operation** +> When you create a new Pulsar reader, the operation will block (on a go channel) until either a reader is successfully created or an error is thrown. + + +### Reader operations + +Pulsar Go readers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the reader's [topic](reference-terminology.md#topic) | `string` +`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available.
| `(Message, error)` +`HasNext()` | Check if there is any message available to read from the current position| (bool, error) +`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error` + +#### "Next" example + +Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages: + +```go + +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatalf("Could not create client: %v", err) } + + // Use the client to instantiate a reader + reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.EarliestMessage, + }) + + if err != nil { log.Fatalf("Could not create reader: %v", err) } + + defer reader.Close() + + ctx := context.Background() + + // Listen on the topic for incoming messages + for { + msg, err := reader.Next(ctx) + if err != nil { log.Fatalf("Error reading from topic: %v", err) } + + // Process the message + } +} + +``` + +In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. 
Here's an example: + +```go + +lastSavedId := // Read last saved message id from external store as byte[] + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: DeserializeMessageID(lastSavedId), +}) + +``` + +### Reader configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader will establish a subscription and listen for messages | +`Name` | The name of the reader | +`StartMessageID` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `pulsar.EarliestMessage` (the earliest available message on the topic), `pulsar.LatestMessage` (the latest available message on the topic), or a `MessageID` object for a position that isn't earliest or latest. | +`MessageChannel` | The Go channel used by the reader. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `Next`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 +`SubscriptionRolePrefix` | The subscription role prefix. | `reader` +`ReadCompacted` | If enabled, the reader will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the reader will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal.| + +## Messages + +The Pulsar Go client provides a `ProducerMessage` interface that you can use to construct messages to produce on Pulsar topics.
Here's an example message: + +```go + +msg := pulsar.ProducerMessage{ + Payload: []byte("Here is some message data"), + Key: "message-key", + Properties: map[string]string{ + "foo": "bar", + }, + EventTime: time.Now(), + ReplicationClusters: []string{"cluster1", "cluster3"}, +} + +if err := producer.Send(context.Background(), msg); err != nil { + log.Fatalf("Could not publish message due to: %v", err) +} + +``` + +The following parameters are available for `ProducerMessage` objects: + +Parameter | Description +:---------|:----------- +`Payload` | The actual data payload of the message +`Value` | `Value` and `Payload` are mutually exclusive; use `Value interface{}` for schema messages. +`Key` | The optional key associated with the message (particularly useful for things like topic compaction) +`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message +`EventTime` | The timestamp associated with the message +`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. +`SequenceID` | Set the sequence id to assign to the current message + +## TLS encryption and authentication + +In order to use [TLS encryption](security-tls-transport.md), you'll need to configure your client to do so: + + * Use `pulsar+ssl` URL type + * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker + * Configure `Authentication` option + +Here's an example: + +```go + +opts := pulsar.ClientOptions{ + URL: "pulsar+ssl://my-cluster.com:6651", + TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr", + Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"), +} + +``` + +## Schema + +This example shows how to create a producer and consumer with schema.
+ +```go + +var exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +jsonSchema := NewJsonSchema(exampleSchemaDef, nil) +// create producer +producer, err := client.CreateProducerWithSchema(ProducerOptions{ + Topic: "jsonTopic", +}, jsonSchema) +err = producer.Send(context.Background(), ProducerMessage{ + Value: &testJson{ + ID: 100, + Name: "pulsar", + }, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() +//create consumer +var s testJson +consumerJS := NewJsonSchema(exampleSchemaDef, nil) +consumer, err := client.SubscribeWithSchema(ConsumerOptions{ + Topic: "jsonTopic", + SubscriptionName: "sub-2", +}, consumerJS) +if err != nil { + log.Fatal(err) +} +msg, err := consumer.Receive(context.Background()) +if err != nil { + log.Fatal(err) +} +err = msg.GetValue(&s) +if err != nil { + log.Fatal(err) +} +fmt.Println(s.ID) // output: 100 +fmt.Println(s.Name) // output: pulsar +defer consumer.Close() + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-cpp.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-cpp.md new file mode 100644 index 0000000000000..455cf02116d50 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-cpp.md @@ -0,0 +1,708 @@ +--- +id: client-libraries-cpp +title: Pulsar C++ client +sidebar_label: "C++" +original_id: client-libraries-cpp +--- + +You can use Pulsar C++ client to create Pulsar producers and consumers in C++. + +All the methods in producer, consumer, and reader of a C++ client are thread-safe. + +## Supported platforms + +Pulsar C++ client is supported on **Linux** ,**MacOS** and **Windows** platforms. + +[Doxygen](http://www.doxygen.nl/)-generated API docs for the C++ client are available [here](/api/cpp). 
+ +## System requirements + +You need to install the following components before using the C++ client: + +* [CMake](https://cmake.org/) +* [Boost](http://www.boost.org/) +* [Protocol Buffers](https://developers.google.com/protocol-buffers/) >= 3 +* [libcurl](https://curl.se/libcurl/) +* [Google Test](https://github.com/google/googletest) + +## Linux + +### Compilation + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies. + +```shell + +$ apt-get install cmake libssl-dev libcurl4-openssl-dev liblog4cxx-dev \ + libprotobuf-dev protobuf-compiler libboost-all-dev google-mock libgtest-dev libjsoncpp-dev + +``` + +3. Compile and install [Google Test](https://github.com/google/googletest). + +```shell + +# libgtest-dev version is 1.18.0 or above +$ cd /usr/src/googletest +$ sudo cmake . +$ sudo make +$ sudo cp ./googlemock/libgmock.a ./googlemock/gtest/libgtest.a /usr/lib/ + +# less than 1.18.0 +$ cd /usr/src/gtest +$ sudo cmake . +$ sudo make +$ sudo cp libgtest.a /usr/lib + +$ cd /usr/src/gmock +$ sudo cmake . +$ sudo make +$ sudo cp libgmock.a /usr/lib + +``` + +4. Compile the Pulsar client library for C++ inside the Pulsar repository. + +```shell + +$ cd pulsar-client-cpp +$ cmake . +$ make + +``` + +After you install the components successfully, the files `libpulsar.so` and `libpulsar.a` are in the `lib` folder of the repository. The tools `perfProducer` and `perfConsumer` are in the `perf` directory. + +### Install Dependencies + +> Since 2.1.0 release, Pulsar ships pre-built RPM and Debian packages. You can download and install those packages directly. + +After you download and install RPM or DEB, the `libpulsar.so`, `libpulsarnossl.so`, `libpulsar.a`, and `libpulsarwithdeps.a` libraries are in your `/usr/lib` directory. + +By default, they are built in code path `${PULSAR_HOME}/pulsar-client-cpp`. You can build with the command below. + + `cmake . 
-DBUILD_TESTS=OFF -DLINK_STATIC=ON && make pulsarShared pulsarSharedNossl pulsarStatic pulsarStaticWithDeps -j 3`. + +These libraries rely on some other libraries. If you want to get detailed version of dependencies, see [RPM](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/pkg/rpm/Dockerfile) or [DEB](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/pkg/deb/Dockerfile) files. + +1. `libpulsar.so` is a shared library, containing statically linked `boost` and `openssl`. It also dynamically links all other necessary libraries. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsar.so -I/usr/local/ssl/include + +``` + +2. `libpulsarnossl.so` is a shared library, similar to `libpulsar.so` except that the libraries `openssl` and `crypto` are dynamically linked. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsarnossl.so -lssl -lcrypto -I/usr/local/ssl/include -L/usr/local/ssl/lib + +``` + +3. `libpulsar.a` is a static library. You need to load dependencies before using this library. You can use this Pulsar library with the command below. + +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsar.a -lssl -lcrypto -ldl -lpthread -I/usr/local/ssl/include -L/usr/local/ssl/lib -lboost_system -lboost_regex -lcurl -lprotobuf -lzstd -lz + +``` + +4. `libpulsarwithdeps.a` is a static library, based on `libpulsar.a`. It is archived in the dependencies of `libboost_regex`, `libboost_system`, `libcurl`, `libprotobuf`, `libzstd` and `libz`. You can use this Pulsar library with the command below. 
+ +```bash + + g++ --std=c++11 PulsarTest.cpp -o test /usr/lib/libpulsarwithdeps.a -lssl -lcrypto -ldl -lpthread -I/usr/local/ssl/include -L/usr/local/ssl/lib + +``` + +The `libpulsarwithdeps.a` library does not include the OpenSSL-related libraries `libssl` and `libcrypto`, because these two libraries are related to security. It is more reasonable and easier to use the versions provided by the local system to handle security issues and upgrade libraries. + +### Install RPM + +1. Download an RPM package from the links in the table. + +| Link | Crypto files | +|------|--------------| +| [client](@pulsar:dist_rpm:client@) | [asc](@pulsar:dist_rpm:client@.asc), [sha512](@pulsar:dist_rpm:client@.sha512) | +| [client-debuginfo](@pulsar:dist_rpm:client-debuginfo@) | [asc](@pulsar:dist_rpm:client-debuginfo@.asc), [sha512](@pulsar:dist_rpm:client-debuginfo@.sha512) | +| [client-devel](@pulsar:dist_rpm:client-devel@) | [asc](@pulsar:dist_rpm:client-devel@.asc), [sha512](@pulsar:dist_rpm:client-devel@.sha512) | + +2. Install the package using the following command. + +```bash + +$ rpm -ivh apache-pulsar-client*.rpm + +``` + +After you install the RPM package successfully, Pulsar libraries are in the `/usr/lib` directory. + +:::note + +If you get the error that `libpulsar.so: cannot open shared object file: No such file or directory` when starting Pulsar client, you may need to run `ldconfig` first. + +::: + +### Install Debian + +1. Download a Debian package from the links in the table. + +| Link | Crypto files | +|------|--------------| +| [client](@pulsar:deb:client@) | [asc](@pulsar:dist_deb:client@.asc), [sha512](@pulsar:dist_deb:client@.sha512) | +| [client-devel](@pulsar:deb:client-devel@) | [asc](@pulsar:dist_deb:client-devel@.asc), [sha512](@pulsar:dist_deb:client-devel@.sha512) | + +2. Install the package using the following command. + +```bash + +$ apt install ./apache-pulsar-client*.deb + +``` + +After you install the DEB package successfully, Pulsar libraries are in the `/usr/lib` directory.
+ +### Build + +> If you want to build RPM and Debian packages from the latest master, follow the instructions below. You should run all the instructions at the root directory of your cloned Pulsar repository. + +There are recipes that build RPM and Debian packages containing a +statically linked `libpulsar.so` / `libpulsarnossl.so` / `libpulsar.a` / `libpulsarwithdeps.a` with all required dependencies. + +To build the C++ library packages, you need to build the Java packages first. + +```shell + +mvn install -DskipTests + +``` + +#### RPM + +To build the RPM inside a Docker container, use the command below. The RPMs are in the `pulsar-client-cpp/pkg/rpm/RPMS/x86_64/` path. + +```shell + +pulsar-client-cpp/pkg/rpm/docker-build-rpm.sh + +``` + +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` and `libpulsarnossl.so` | +| pulsar-client-devel | Static library `libpulsar.a`, `libpulsarwithdeps.a` and C++ and C headers | +| pulsar-client-debuginfo | Debug symbols for `libpulsar.so` | + +#### Debian + +To build Debian packages, enter the following command. + +```shell + +pulsar-client-cpp/pkg/deb/docker-build-deb.sh + +``` + +Debian packages are created in the `pulsar-client-cpp/pkg/deb/BUILD/DEB/` path. + +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` and `libpulsarnossl.so` | +| pulsar-client-dev | Static library `libpulsar.a`, `libpulsarwithdeps.a` and C++ and C headers | + +## MacOS + +### Compilation + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies.
+ +```shell + +# OpenSSL installation +$ brew install openssl +$ export OPENSSL_INCLUDE_DIR=/usr/local/opt/openssl/include/ +$ export OPENSSL_ROOT_DIR=/usr/local/opt/openssl/ + +# Protocol Buffers installation +$ brew install protobuf boost boost-python log4cxx +# If you are using python3, you need to install boost-python3 + +# Google Test installation +$ git clone https://github.com/google/googletest.git +$ cd googletest +$ git checkout release-1.12.1 +$ cmake . +$ make install + +``` + +3. Compile the Pulsar client library in the repository that you cloned. + +```shell + +$ cd pulsar-client-cpp +$ cmake . +$ make + +``` + +### Install `libpulsar` + +Pulsar releases are available in the [Homebrew](https://brew.sh/) core repository. You can install the C++ client library with the following command. The package is installed with the library and headers. + +```shell + +brew install libpulsar + +``` + +## Windows (64-bit) + +### Compilation + +1. Clone the Pulsar repository. + +```shell + +$ git clone https://github.com/apache/pulsar + +``` + +2. Install all necessary dependencies. + +```shell + +cd ${PULSAR_HOME}/pulsar-client-cpp +vcpkg install --feature-flags=manifests --triplet x64-windows + +``` + +3. Build C++ libraries. + +```shell + +cmake -B ./build -A x64 -DBUILD_PYTHON_WRAPPER=OFF -DBUILD_TESTS=OFF -DVCPKG_TRIPLET=x64-windows -DCMAKE_BUILD_TYPE=Release -S . +cmake --build ./build --config Release + +``` + +> **NOTE** +> +> 1. For Windows 32-bit, you need to use `-A Win32` and `-DVCPKG_TRIPLET=x86-windows`. +> 2. For MSVC Debug mode, you need to replace `Release` with `Debug` for both `CMAKE_BUILD_TYPE` variable and `--config` option. + +4. Client libraries are available in the following places. + +``` + +${PULSAR_HOME}/pulsar-client-cpp/build/lib/Release/pulsar.lib +${PULSAR_HOME}/pulsar-client-cpp/build/lib/Release/pulsar.dll + +``` + +## Connection URLs + +To connect Pulsar using client libraries, you need to specify a Pulsar protocol URL. 
+ +Pulsar protocol URLs are assigned to specific clusters and use the `pulsar` URI scheme. The default port is `6650`. The following is an example for localhost. + +```http + +pulsar://localhost:6650 + +``` + +In a Pulsar cluster in production, the URL looks as follows. + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you use TLS authentication, you need to add `ssl`, and the default port is `6651`. The following is an example. + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a consumer + +To use Pulsar as a consumer, you need to create a consumer on the C++ client. There are two main ways of using the consumer: +- [Blocking style](#blocking-example): synchronously calling `receive(msg)`. +- [Non-blocking](#consumer-with-a-message-listener) (event based) style: using a message listener. + +### Blocking example + +The benefit of this approach is that it is the simplest code. Simply keep calling `receive(msg)`, which blocks until a message is received. + +This example starts a subscription at the earliest offset and consumes 100 messages. + +```c++ + +#include <pulsar/Client.h> + +using namespace pulsar; + +int main() { + Client client("pulsar://localhost:6650"); + + Consumer consumer; + ConsumerConfiguration config; + config.setSubscriptionInitialPosition(InitialPositionEarliest); + Result result = client.subscribe("persistent://public/default/my-topic", "consumer-1", config, consumer); + if (result != ResultOk) { + std::cout << "Failed to subscribe: " << result << std::endl; + return -1; + } + + Message msg; + int ctr = 0; + // consume 100 messages + while (ctr < 100) { + consumer.receive(msg); + std::cout << "Received: " << msg + << " with payload '" << msg.getDataAsString() << "'" << std::endl; + + consumer.acknowledge(msg); + ctr++; + } + + std::cout << "Finished consuming synchronously!"
<< std::endl; + + client.close(); + return 0; +} + +``` + +### Consumer with a message listener + +You can avoid running a loop with blocking calls with an event based style by using a message listener which is invoked for each message that is received. + +This example starts a subscription at the earliest offset and consumes 100 messages. + +```c++ + +#include +#include +#include + +using namespace pulsar; + +std::atomic messagesReceived; + +void handleAckComplete(Result res) { + std::cout << "Ack res: " << res << std::endl; +} + +void listener(Consumer consumer, const Message& msg) { + std::cout << "Got message " << msg << " with content '" << msg.getDataAsString() << "'" << std::endl; + messagesReceived++; + consumer.acknowledgeAsync(msg.getMessageId(), handleAckComplete); +} + +int main() { + Client client("pulsar://localhost:6650"); + + Consumer consumer; + ConsumerConfiguration config; + config.setMessageListener(listener); + config.setSubscriptionInitialPosition(InitialPositionEarliest); + Result result = client.subscribe("persistent://public/default/my-topic", "consumer-1", config, consumer); + if (result != ResultOk) { + std::cout << "Failed to subscribe: " << result << std::endl; + return -1; + } + + // wait for 100 messages to be consumed + while (messagesReceived < 100) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + std::cout << "Finished consuming asynchronously!" << std::endl; + + client.close(); + return 0; +} + +``` + +## Create a producer + +To use Pulsar as a producer, you need to create a producer on the C++ client. There are two main ways of using a producer: +- [Blocking style](#simple-blocking-example) : each call to `send` waits for an ack from the broker. +- [Non-blocking asynchronous style](#non-blocking-example) : `sendAsync` is called instead of `send` and a callback is supplied for when the ack is received from the broker. 
+ +### Simple blocking example + +This example sends 100 messages using the blocking style. While simple, it does not produce high throughput as it waits for each ack to come back before sending the next message. + +```c++ + +#include <pulsar/Client.h> +#include <thread> + +using namespace pulsar; + +int main() { + Client client("pulsar://localhost:6650"); + + Producer producer; + Result result = client.createProducer("persistent://public/default/my-topic", producer); + if (result != ResultOk) { + std::cout << "Error creating producer: " << result << std::endl; + return -1; + } + + // Send 100 messages synchronously + int ctr = 0; + while (ctr < 100) { + std::string content = "msg" + std::to_string(ctr); + Message msg = MessageBuilder().setContent(content).setProperty("x", "1").build(); + Result result = producer.send(msg); + if (result != ResultOk) { + std::cout << "The message " << content << " could not be sent, received code: " << result << std::endl; + } else { + std::cout << "The message " << content << " sent successfully" << std::endl; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ctr++; + } + + std::cout << "Finished producing synchronously!" << std::endl; + + client.close(); + return 0; +} + +``` + +### Non-blocking example + +This example sends 100 messages using the non-blocking style calling `sendAsync` instead of `send`. This allows the producer to have multiple messages inflight at a time which increases throughput. + +The producer configuration `blockIfQueueFull` is useful here to avoid `ResultProducerQueueIsFull` errors when the internal queue for outgoing send requests becomes full. Once the internal queue is full, `sendAsync` becomes blocking which can make your code simpler. + +Without this configuration, the result code `ResultProducerQueueIsFull` is passed to the callback. You must decide how to deal with that (retry, discard etc).
+ +```c++ + +#include +#include + +using namespace pulsar; + +std::atomic acksReceived; + +void callback(Result code, const MessageId& msgId, std::string msgContent) { + // message processing logic here + std::cout << "Received ack for msg: " << msgContent << " with code: " + << code << " -- MsgID: " << msgId << std::endl; + acksReceived++; +} + +int main() { + Client client("pulsar://localhost:6650"); + + ProducerConfiguration producerConf; + producerConf.setBlockIfQueueFull(true); + Producer producer; + Result result = client.createProducer("persistent://public/default/my-topic", + producerConf, producer); + if (result != ResultOk) { + std::cout << "Error creating producer: " << result << std::endl; + return -1; + } + + // Send 100 messages asynchronously + int ctr = 0; + while (ctr < 100) { + std::string content = "msg" + std::to_string(ctr); + Message msg = MessageBuilder().setContent(content).setProperty("x", "1").build(); + producer.sendAsync(msg, std::bind(callback, + std::placeholders::_1, std::placeholders::_2, content)); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ctr++; + } + + // wait for 100 messages to be acked + while (acksReceived < 100) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + std::cout << "Finished producing asynchronously!" << std::endl; + + client.close(); + return 0; +} + +``` + +### Partitioned topics and lazy producers + +When scaling out a Pulsar topic, you may configure a topic to have hundreds of partitions. Likewise, you may have also scaled out your producers so there are hundreds or even thousands of producers. This can put some strain on the Pulsar brokers as when you create a producer on a partitioned topic, internally it creates one internal producer per partition which involves communications to the brokers for each one. 
So for a topic with 1000 partitions and 1000 producers, it ends up creating 1,000,000 internal producers across the producer applications, each of which has to communicate with a broker to find out which broker it should connect to and then perform the connection handshake. + +You can reduce the load caused by this combination of a large number of partitions and many producers by doing the following: +- use SinglePartition partition routing mode (this ensures that all messages are only sent to a single, randomly selected partition) +- use non-keyed messages (when messages are keyed, routing is based on the hash of the key and so messages will end up being sent to multiple partitions) +- use lazy producers (this ensures that an internal producer is only created on demand when a message needs to be routed to a partition) + +With our example above, that reduces the number of internal producers spread out over the 1000 producer apps from 1,000,000 to just 1000. + +Note that there can be extra latency for the first message sent. If you set a low send timeout, this timeout could be reached if the initial connection handshake is slow to complete. + +```c++ + +ProducerConfiguration producerConf; +producerConf.setPartitionsRoutingMode(ProducerConfiguration::UseSinglePartition); +producerConf.setLazyStartPartitionedProducers(true); + +``` + +## Enable authentication in connection URLs +If you use TLS authentication when connecting to Pulsar, you need to add `ssl` in the connection URLs, and the default port is `6651`. The following is an example. 
+ +```cpp + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/cacert.pem"); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create( + "/path/to/client-cert.pem", "/path/to/client-key.pem")); + +Client client("pulsar+ssl://my-broker.com:6651", config); + +``` + +For complete examples, refer to [C++ client examples](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/examples). + +## Schema + +This section describes some examples about schema. For more information about +schema, see [Pulsar schema](schema-get-started.md). + +### Avro schema + +- The following example shows how to create a producer with an Avro schema. + + ```cpp + + static const std::string exampleSchema = + "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + "\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"}]}"; + Producer producer; + ProducerConfiguration producerConf; + producerConf.setSchema(SchemaInfo(AVRO, "Avro", exampleSchema)); + client.createProducer("topic-avro", producerConf, producer); + + ``` + +- The following example shows how to create a consumer with an Avro schema. + + ```cpp + + static const std::string exampleSchema = + "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + "\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"b\",\"type\":\"int\"}]}"; + ConsumerConfiguration consumerConf; + Consumer consumer; + consumerConf.setSchema(SchemaInfo(AVRO, "Avro", exampleSchema)); + client.subscribe("topic-avro", "sub-2", consumerConf, consumer); + + ``` + +### ProtobufNative schema + +The following example shows how to create a producer and a consumer with a ProtobufNative schema. + +1. Generate the `User` class using Protobuf3. + + :::note + + You need to use Protobuf3 or later versions.
+ + ::: + + + + ```protobuf + + syntax = "proto3"; + + message User { + string name = 1; + int32 age = 2; + } + + ``` + + +2. Include the `ProtobufNativeSchema.h` in your source code. Ensure the Protobuf dependency has been added to your project. + + + ```c++ + + #include <pulsar/ProtobufNativeSchema.h> + + ``` + + +3. Create a producer to send a `User` instance. + + + ```c++ + + ProducerConfiguration producerConf; + producerConf.setSchema(createProtobufNativeSchema(User::GetDescriptor())); + Producer producer; + client.createProducer("topic-protobuf", producerConf, producer); + User user; + user.set_name("my-name"); + user.set_age(10); + std::string content; + user.SerializeToString(&content); + producer.send(MessageBuilder().setContent(content).build()); + + ``` + + +4. Create a consumer to receive a `User` instance. + + + ```c++ + + ConsumerConfiguration consumerConf; + consumerConf.setSchema(createProtobufNativeSchema(User::GetDescriptor())); + consumerConf.setSubscriptionInitialPosition(InitialPositionEarliest); + Consumer consumer; + client.subscribe("topic-protobuf", "my-sub", consumerConf, consumer); + Message msg; + consumer.receive(msg); + User user2; + user2.ParseFromArray(msg.getData(), msg.getLength()); + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-dotnet.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-dotnet.md new file mode 100644 index 0000000000000..fbec5e473be69 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-dotnet.md @@ -0,0 +1,434 @@ +--- +id: client-libraries-dotnet +title: Pulsar C# client +sidebar_label: "C#" +original_id: client-libraries-dotnet +--- + +You can use the Pulsar C# client (DotPulsar) to create Pulsar producers and consumers in C#. All the methods in the producer, consumer, and reader of a C# client are thread-safe. The official documentation for DotPulsar is available [here](https://github.com/apache/pulsar-dotpulsar/wiki).
+ +## Installation + +You can install the Pulsar C# client library either through the dotnet CLI or through Visual Studio. This section describes how to install the Pulsar C# client library through the dotnet CLI. For information about how to install the Pulsar C# client library through Visual Studio, see [here](https://docs.microsoft.com/en-us/visualstudio/mac/nuget-walkthrough?view=vsmac-2019). + +### Prerequisites + +Install the [.NET Core SDK](https://dotnet.microsoft.com/download/), which provides the dotnet command-line tool. Starting in Visual Studio 2017, the dotnet CLI is automatically installed with any .NET Core related workloads. + +### Procedures + +To install the Pulsar C# client library, follow these steps: + +1. Create a project. + + 1. Create a folder for the project. + + 2. Open a terminal window and switch to the new folder. + + 3. Create the project using the following command. + + ``` + + dotnet new console + + ``` + + 4. Use `dotnet run` to test that the app has been created properly. + +2. Add the DotPulsar NuGet package. + + 1. Use the following command to install the `DotPulsar` package. + + ``` + + dotnet add package DotPulsar + + ``` + + 2. After the command completes, open the `.csproj` file to see the added reference. + + ```xml + + + + + + ``` + +## Client + +This section describes some configuration examples for the Pulsar C# client. + +### Create client + +This example shows how to create a Pulsar C# client connected to localhost. + +```c# + +var client = PulsarClient.Builder().Build(); + +``` + +To create a Pulsar C# client by using the builder, you can specify the following options. + +| Option | Description | Default | +| ---- | ---- | ---- | +| ServiceUrl | Set the service URL for the Pulsar cluster. | pulsar://localhost:6650 | +| RetryInterval | Set the time to wait before retrying an operation or a reconnection. | 3s | + +### Create producer + +This section describes how to create a producer.
+ +- Create a producer by using the builder. + + ```c# + + var producer = client.NewProducer() + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a producer without using the builder. + + ```c# + + var options = new ProducerOptions("persistent://public/default/mytopic"); + var producer = client.CreateProducer(options); + + ``` + +### Create consumer + +This section describes how to create a consumer. + +- Create a consumer by using the builder. + + ```c# + + var consumer = client.NewConsumer() + .SubscriptionName("MySubscription") + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a consumer without using the builder. + + ```c# + + var options = new ConsumerOptions("MySubscription", "persistent://public/default/mytopic"); + var consumer = client.CreateConsumer(options); + + ``` + +### Create reader + +This section describes how to create a reader. + +- Create a reader by using the builder. + + ```c# + + var reader = client.NewReader() + .StartMessageId(MessageId.Earliest) + .Topic("persistent://public/default/mytopic") + .Create(); + + ``` + +- Create a reader without using the builder. + + ```c# + + var options = new ReaderOptions(MessageId.Earliest, "persistent://public/default/mytopic"); + var reader = client.CreateReader(options); + + ``` + +### Configure encryption policies + +The Pulsar C# client supports four kinds of encryption policies: + +- `EnforceUnencrypted`: always use unencrypted connections. +- `EnforceEncrypted`: always use encrypted connections. +- `PreferUnencrypted`: use unencrypted connections, if possible. +- `PreferEncrypted`: use encrypted connections, if possible. + +This example shows how to set the `EnforceEncrypted` encryption policy.
+ +```c# + +var client = PulsarClient.Builder() + .ConnectionSecurity(EncryptionPolicy.EnforceEncrypted) + .Build(); + +``` + +### Configure authentication + +Currently, the Pulsar C# client supports TLS (Transport Layer Security) and JWT (JSON Web Token) authentication. + +If you have followed [Authentication using TLS](security-tls-authentication.md), you get a certificate and a key. To use them from the Pulsar C# client, follow these steps: + +1. Create an unencrypted and password-less pfx file. + + ```bash + + openssl pkcs12 -export -keypbe NONE -certpbe NONE -out admin.pfx -inkey admin.key.pem -in admin.cert.pem -passout pass: + + ``` + +2. Use the admin.pfx file to create an X509Certificate2 and pass it to the Pulsar C# client. + + ```c# + + var clientCertificate = new X509Certificate2("admin.pfx"); + var client = PulsarClient.Builder() + .AuthenticateUsingClientCertificate(clientCertificate) + .Build(); + + ``` + +## Producer + +A producer is a process that attaches to a topic and publishes messages to a Pulsar broker for processing. This section describes some configuration examples about the producer. + +### Send data + +This example shows how to send data. + +```c# + +var data = Encoding.UTF8.GetBytes("Hello World"); +await producer.Send(data); + +``` + +### Send messages with customized metadata + +- Send messages with customized metadata by using the builder. + + ```c# + + var data = Encoding.UTF8.GetBytes("Hello World"); + var messageId = await producer.NewMessage() + .Property("SomeKey", "SomeValue") + .Send(data); + + ``` + +- Send messages with customized metadata without using the builder. + + ```c# + + var data = Encoding.UTF8.GetBytes("Hello World"); + var metadata = new MessageMetadata(); + metadata["SomeKey"] = "SomeValue"; + var messageId = await producer.Send(metadata, data); + + ``` + +## Consumer + +A consumer is a process that attaches to a topic through a subscription and then receives messages.
This section describes some configuration examples about the consumer. + +### Receive messages + +This example shows how a consumer receives messages from a topic. + +```c# + +await foreach (var message in consumer.Messages()) +{ + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); +} + +``` + +### Acknowledge messages + +Messages can be acknowledged individually or cumulatively. For details about message acknowledgement, see [acknowledgement](concepts-messaging.md#acknowledgement). + +- Acknowledge messages individually. + + ```c# + + await foreach (var message in consumer.Messages()) + { + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); + await consumer.Acknowledge(message); + } + + ``` + +- Acknowledge messages cumulatively. + + ```c# + + await consumer.AcknowledgeCumulative(message); + + ``` + +### Unsubscribe from topics + +This example shows how a consumer unsubscribes from a topic. + +```c# + +await consumer.Unsubscribe(); + +``` + +#### Note + +> A consumer cannot be used and is disposed once the consumer unsubscribes from a topic. + +## Reader + +A reader is actually just a consumer without a cursor. This means that Pulsar does not keep track of your progress and there is no need to acknowledge messages. + +This example shows how a reader receives messages. + +```c# + +await foreach (var message in reader.Messages()) +{ + Console.WriteLine("Received: " + Encoding.UTF8.GetString(message.Data.ToArray())); +} + +``` + +## Monitoring + +This section describes how to monitor the producer, consumer, and reader state. + +### Monitor producer + +The following table lists states available for the producer. + +| State | Description | +| ---- | ----| +| Closed | The producer or the Pulsar client has been disposed. | +| Connected | All is well. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | + +This example shows how to monitor the producer state.
+ +```c# + +private static async ValueTask Monitor(IProducer producer, CancellationToken cancellationToken) +{ + var state = ProducerState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = await producer.StateChangedFrom(state, cancellationToken); + + var stateMessage = state switch + { + ProducerState.Connected => $"The producer is connected", + ProducerState.Disconnected => $"The producer is disconnected", + ProducerState.Closed => $"The producer has closed", + ProducerState.Faulted => $"The producer has faulted", + _ => $"The producer has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (producer.IsFinalState(state)) + return; + } +} + +``` + +### Monitor consumer state + +The following table lists states available for the consumer. + +| State | Description | +| ---- | ----| +| Active | All is well. | +| Inactive | All is well. The subscription type is `Failover` and you are not the active consumer. | +| Closed | The consumer or the Pulsar client has been disposed. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | +| ReachedEndOfTopic | No more messages are delivered. | + +This example shows how to monitor the consumer state. 
+ +```c# + +private static async ValueTask Monitor(IConsumer consumer, CancellationToken cancellationToken) +{ + var state = ConsumerState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = await consumer.StateChangedFrom(state, cancellationToken); + + var stateMessage = state switch + { + ConsumerState.Active => "The consumer is active", + ConsumerState.Inactive => "The consumer is inactive", + ConsumerState.Disconnected => "The consumer is disconnected", + ConsumerState.Closed => "The consumer has closed", + ConsumerState.ReachedEndOfTopic => "The consumer has reached end of topic", + ConsumerState.Faulted => "The consumer has faulted", + _ => $"The consumer has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (consumer.IsFinalState(state)) + return; + } +} + +``` + +### Monitor reader state + +The following table lists states available for the reader. + +| State | Description | +| ---- | ----| +| Closed | The reader or the Pulsar client has been disposed. | +| Connected | All is well. | +| Disconnected | The connection is lost and attempts are being made to reconnect. | +| Faulted | An unrecoverable error has occurred. | +| ReachedEndOfTopic | No more messages are delivered. | + +This example shows how to monitor the reader state.
+ +```c# + +private static async ValueTask Monitor(IReader reader, CancellationToken cancellationToken) +{ + var state = ReaderState.Disconnected; + + while (!cancellationToken.IsCancellationRequested) + { + state = await reader.StateChangedFrom(state, cancellationToken); + + var stateMessage = state switch + { + ReaderState.Connected => "The reader is connected", + ReaderState.Disconnected => "The reader is disconnected", + ReaderState.Closed => "The reader has closed", + ReaderState.ReachedEndOfTopic => "The reader has reached end of topic", + ReaderState.Faulted => "The reader has faulted", + _ => $"The reader has an unknown state '{state}'" + }; + + Console.WriteLine(stateMessage); + + if (reader.IsFinalState(state)) + return; + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-go.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-go.md new file mode 100644 index 0000000000000..d35738fce86f0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-go.md @@ -0,0 +1,885 @@ +--- +id: client-libraries-go +title: Pulsar Go client +sidebar_label: "Go" +original_id: client-libraries-go +--- + +> Tips: Currently, the CGo client will be deprecated, if you want to know more about the CGo client, please refer to [CGo client docs](client-libraries-cgo.md) + +You can use Pulsar [Go client](https://github.com/apache/pulsar-client-go) to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang). + +> **API docs available as well** +> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/pulsar-client-go/pulsar). + + +## Installation + +### Install go package + +You can install the `pulsar` library locally using `go get`. 
+ +```bash + +$ go get -u "github.com/apache/pulsar-client-go/pulsar" + +``` + +Once installed locally, you can import it into your project: + +```go + +import "github.com/apache/pulsar-client-go/pulsar" + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +If you have multiple brokers, you can set the URL as below. + +``` + +pulsar://localhost:6650,localhost:6651,localhost:6652 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look something like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). Here's an example: + +```go + +import ( + "log" + "time" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + OperationTimeout: 30 * time.Second, + ConnectionTimeout: 30 * time.Second, + }) + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } + + defer client.Close() +} + +``` + +If you have multiple brokers, you can initiate a client object as below. 
+ +```go + +import ( + "log" + "time" + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650,localhost:6651,localhost:6652", + OperationTimeout: 30 * time.Second, + ConnectionTimeout: 30 * time.Second, + }) + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } + + defer client.Close() +} + +``` + +The following configurable parameters are available for Pulsar clients: + + Name | Description | Default +| :-------- | :---------- |:---------- | +| URL | Configure the service URL for the Pulsar service.

    If you have multiple brokers, you can set multiple Pulsar cluster addresses for a client.

    This parameter is **required**. |None | +| ConnectionTimeout | Timeout for the establishment of a TCP connection | 30s | +| OperationTimeout| Set the operation timeout. Producer-create, subscribe and unsubscribe operations will be retried until this interval, after which the operation will be marked as failed| 30s| +| Authentication | Configure the authentication provider. Example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | no authentication | +| TLSTrustCertsFilePath | Set the path to the trusted TLS certificate file | | +| TLSAllowInsecureConnection | Configure whether the Pulsar client accepts untrusted TLS certificates from the broker | false | +| TLSValidateHostname | Configure whether the Pulsar client verifies the validity of the host name from the broker | false | +| ListenerName | Configure the net model for VPC users to connect to the Pulsar broker | | +| MaxConnectionsPerBroker | Max number of connections to a single broker that are kept in the pool | 1 | +| CustomMetricsLabels | Add custom labels to all the metrics reported by this client instance | | +| Logger | Configure the logger used by the client | logrus.StandardLogger | + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object. 
Here's an example: + +```go + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", +}) + +if err != nil { + log.Fatal(err) +} + +_, err = producer.Send(context.Background(), &pulsar.ProducerMessage{ + Payload: []byte("hello"), +}) + +defer producer.Close() + +if err != nil { + fmt.Println("Failed to publish message", err) +} +fmt.Println("Published message") + +``` + +### Producer operations + +Pulsar Go producers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Fetches the producer's [topic](reference-terminology.md#topic)| `string` +`Name()` | Fetches the producer's name | `string` +`Send(context.Context, *ProducerMessage)` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be thrown if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | (MessageID, error) +`SendAsync(context.Context, *ProducerMessage, func(MessageID, *ProducerMessage, error))`| Sends a message asynchronously. The callback is invoked with the message ID, the message, and any error once the publish completes. | +`LastSequenceID()` | Get the last sequence id that was published by this producer. This represents either the automatically assigned or custom sequence id (set on the ProducerMessage) that was published and acknowledged by the broker. | int64 +`Flush()`| Flush all the messages buffered in the client and wait until all messages have been successfully persisted. | error +`Close()` | Closes the producer and releases all resources allocated to it. If `Close()` is called then no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. 
| + +### Producer Example + +#### How to use message router in producer + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: serviceURL, +}) + +if err != nil { + log.Fatal(err) +} +defer client.Close() + +// Only subscribe on the specific partition +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "my-partitioned-topic-partition-2", + SubscriptionName: "my-sub", +}) + +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-partitioned-topic", + MessageRouter: func(msg *ProducerMessage, tm TopicMetadata) int { + fmt.Println("Routing message ", msg, " -- Partitions: ", tm.NumPartitions()) + return 2 + }, +}) + +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +``` + +#### How to use schema interface in producer + +```go + +type testJSON struct { + ID int `json:"id"` + Name string `json:"name"` +} + +``` + +```go + +var ( + exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +) + +``` + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +properties := make(map[string]string) +properties["pulsar"] = "hello" +jsonSchemaWithProperties := NewJSONSchema(exampleSchemaDef, properties) +producer, err := client.CreateProducer(ProducerOptions{ + Topic: "jsonTopic", + Schema: jsonSchemaWithProperties, +}) +assert.Nil(t, err) + +_, err = producer.Send(context.Background(), &ProducerMessage{ + Value: &testJSON{ + ID: 100, + Name: "pulsar", + }, +}) +if err != nil { + log.Fatal(err) +} +producer.Close() + +``` + +#### How to use delay relative in producer + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topicName := 
newTopicName() +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topicName, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: topicName, + SubscriptionName: "subName", + Type: Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +ID, err := producer.Send(context.Background(), &pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("test")), + DeliverAfter: 3 * time.Second, +}) +if err != nil { + log.Fatal(err) +} +fmt.Println(ID) + +ctx, canc := context.WithTimeout(context.Background(), 1*time.Second) +msg, err := consumer.Receive(ctx) +if err != nil { + log.Fatal(err) +} +fmt.Println(msg.Payload()) +canc() + +ctx, canc = context.WithTimeout(context.Background(), 5*time.Second) +msg, err = consumer.Receive(ctx) +if err != nil { + log.Fatal(err) +} +fmt.Println(msg.Payload()) +canc() + +``` + +### Producer configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specifies the topic this producer will publish to. This argument is required when constructing the producer. | | +| Name | Name specify a name for the producer. If not assigned, the system will generate a globally unique name which can be accessed with Producer.ProducerName(). | | +| Properties | Properties attach a set of application defined properties to the producer. These properties will be visible in the topic stats | | +| SendTimeout | SendTimeout set the timeout for a message that is not acknowledged by the server | 30s | +| DisableBlockIfQueueFull | DisableBlockIfQueueFull control whether Send and SendAsync block if producer's message queue is full | false | +| MaxPendingMessages| MaxPendingMessages set the max size of the queue holding the messages pending to receive an acknowledgment from the broker. 
| | +| HashingScheme | HashingScheme change the `HashingScheme` used to choose the partition on which to publish a particular message. | JavaStringHash | +| CompressionType | CompressionType set the compression type for the producer. | not compressed | +| CompressionLevel | Define the desired compression level. Options: Default, Faster and Better | Default | +| MessageRouter | MessageRouter set a custom message routing policy by passing an implementation of MessageRouter | | +| DisableBatching | DisableBatching control whether automatic batching of messages is enabled for the producer. | false | +| BatchingMaxPublishDelay | BatchingMaxPublishDelay set the time period within which the messages sent will be batched | 1ms | +| BatchingMaxMessages | BatchingMaxMessages set the maximum number of messages permitted in a batch. | 1000 | +| BatchingMaxSize | BatchingMaxSize sets the maximum number of bytes permitted in a batch. | 128KB | +| Schema | Schema set a custom schema type by passing an implementation of `Schema` | bytes[] | +| Interceptors | A chain of interceptors. These interceptors are called at some points defined in the `ProducerInterceptor` interface. | None | +| MaxReconnectToBroker | MaxReconnectToBroker set the maximum retry number of reconnectToBroker | ultimate | +| BatcherBuilderType | BatcherBuilderType sets the batch builder type. This is used to create a batch container when batching is enabled. Options: DefaultBatchBuilder and KeyBasedBatchBuilder | DefaultBatchBuilder | + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. 
Here's a basic example that receives messages with `Receive`: + +```go + +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "topic-1", + SubscriptionName: "my-sub", + Type: pulsar.Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +for i := 0; i < 10; i++ { + msg, err := consumer.Receive(context.Background()) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Received message msgId: %#v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + + consumer.Ack(msg) +} + +if err := consumer.Unsubscribe(); err != nil { + log.Fatal(err) +} + +``` + +### Consumer operations + +Pulsar Go consumers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Subscription()` | Returns the consumer's subscription name | `string` +`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Throws an error if the unsubscribe operation is somehow unsuccessful. | `error` +`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)` +`Chan()` | Chan returns a channel from which to consume messages. | `<-chan ConsumerMessage` +`Ack(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) | +`AckID(MessageID)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID | +`ReconsumeLater(msg Message, delay time.Duration)` | ReconsumeLater marks a message for redelivery after a custom delay | +`Nack(Message)` | Acknowledge the failure to process a single message. | +`NackID(MessageID)` | Acknowledge the failure to process a single message by message ID. | +`Seek(msgID MessageID)` | Reset the subscription associated with this consumer to a specific message id. The message id can either be a specific message or represent the first or last messages in the topic. 
| `error` +`SeekByTime(time time.Time)` | Reset the subscription associated with this consumer to a specific message publish time. | `error` +`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | +`Name()` | Name returns the name of consumer | `string` + +### Receive example + +#### How to use regex consumer + +```go + +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) + +defer client.Close() + +p, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topicInRegex, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer p.Close() + +topicsPattern := fmt.Sprintf("persistent://%s/foo.*", namespace) +opts := pulsar.ConsumerOptions{ + TopicsPattern: topicsPattern, + SubscriptionName: "regex-sub", +} +consumer, err := client.Subscribe(opts) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +``` + +#### How to use multi topics Consumer + +```go + +func newTopicName() string { + return fmt.Sprintf("my-topic-%v", time.Now().Nanosecond()) +} + + +topic1 := "topic-1" +topic2 := "topic-2" + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +topics := []string{topic1, topic2} +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topics: topics, + SubscriptionName: "multi-topic-sub", +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +``` + +#### How to use consumer listener + +```go + +import ( + "fmt" + "log" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{URL: "pulsar://localhost:6650"}) + if err != nil { + log.Fatal(err) + } + + defer client.Close() + + channel := make(chan pulsar.ConsumerMessage, 100) + + options := pulsar.ConsumerOptions{ + Topic: "topic-1", + SubscriptionName: "my-subscription", + Type: pulsar.Shared, + } + + options.MessageChannel = channel + + consumer, err := 
client.Subscribe(options) + if err != nil { + log.Fatal(err) + } + + defer consumer.Close() + + // Receive messages from channel. The channel returns a struct which contains message and the consumer from where + // the message was received. It's not necessary here since we have 1 single consumer, but the channel could be + // shared across multiple consumers as well + for cm := range channel { + msg := cm.Message + fmt.Printf("Received message msgId: %v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + + consumer.Ack(msg) + } +} + +``` + +#### How to use consumer receive timeout + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topic := "test-topic-with-no-messages" +ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) +defer cancel() + +// create consumer +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: topic, + SubscriptionName: "my-sub1", + Type: Shared, +}) +if err != nil { + log.Fatal(err) +} +defer consumer.Close() + +msg, err := consumer.Receive(ctx) +fmt.Println(msg.Payload()) +if err != nil { + log.Fatal(err) +} + +``` + +#### How to use schema in consumer + +```go + +type testJSON struct { + ID int `json:"id"` + Name string `json:"name"` +} + +``` + +```go + +var ( + exampleSchemaDef = "{\"type\":\"record\",\"name\":\"Example\",\"namespace\":\"test\"," + + "\"fields\":[{\"name\":\"ID\",\"type\":\"int\"},{\"name\":\"Name\",\"type\":\"string\"}]}" +) + +``` + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +defer client.Close() + +var s testJSON + +consumerJS := NewJSONSchema(exampleSchemaDef, nil) +consumer, err := client.Subscribe(ConsumerOptions{ + Topic: "jsonTopic", + SubscriptionName: "sub-1", + Schema: consumerJS, + SubscriptionInitialPosition: SubscriptionPositionEarliest, +}) +assert.Nil(t, err) +msg, err 
:= consumer.Receive(context.Background()) +assert.Nil(t, err) +err = msg.GetSchemaValue(&s) +if err != nil { + log.Fatal(err) +} + +defer consumer.Close() + +``` + +### Consumer configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specify the topic this consumer will subscribe to. This argument is required when constructing the reader. | | +| Topics | Specify a list of topics this consumer will subscribe on. Either a topic, a list of topics or a topics pattern are required when subscribing| | +| TopicsPattern | Specify a regular expression to subscribe to multiple topics under the same namespace. Either a topic, a list of topics or a topics pattern are required when subscribing | | +| AutoDiscoveryPeriod | Specify the interval in which to poll for new partitions or new topics if using a TopicsPattern. | | +| SubscriptionName | Specify the subscription name for this consumer. This argument is required when subscribing | | +| Name | Set the consumer name | | +| Properties | Properties attach a set of application defined properties to the producer This properties will be visible in the topic stats | | +| Type | Select the subscription type to be used when subscribing to the topic. | Exclusive | +| SubscriptionInitialPosition | InitialPosition at which the cursor will be set when subscribe | Latest | +| DLQ | Configuration for Dead Letter Queue consumer policy. | no DLQ | +| MessageChannel | Sets a `MessageChannel` for the consumer. When a message is received, it will be pushed to the channel for consumption | | +| ReceiverQueueSize | Sets the size of the consumer receive queue. 
| 1000| +| NackRedeliveryDelay | The delay after which to redeliver the messages that failed to be processed | 1min | +| ReadCompacted | If enabled, the consumer will read messages from the compacted topic rather than reading the full message backlog of the topic | false | +| ReplicateSubscriptionState | Mark the subscription as replicated to keep it in sync across clusters | false | +| KeySharedPolicy | Configuration for Key Shared consumer policy. | | +| RetryEnable | Auto retry send messages to default filled DLQPolicy topics | false | +| Interceptors | A chain of interceptors. These interceptors are called at some points defined in the `ConsumerInterceptor` interface. | | +| MaxReconnectToBroker | MaxReconnectToBroker set the maximum retry number of reconnectToBroker. | ultimate | +| Schema | Schema set a custom schema type by passing an implementation of `Schema` | bytes[] | + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example: + +```go + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "topic-1", + StartMessageID: pulsar.EarliestMessageID(), +}) +if err != nil { + log.Fatal(err) +} +defer reader.Close() + +``` + +### Reader operations + +Pulsar Go readers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the reader's [topic](reference-terminology.md#topic) | `string` +`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available. 
| `(Message, error)` +`HasNext()` | Check if there is any message available to read from the current position| (bool, error) +`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error` +`Seek(MessageID)` | Reset the subscription associated with this reader to a specific message ID | `error` +`SeekByTime(time time.Time)` | Reset the subscription associated with this reader to a specific message publish time | `error` + +### Reader example + +#### How to use reader to read 'next' message + +Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages: + +```go + +import ( + "context" + "fmt" + "log" + + "github.com/apache/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{URL: "pulsar://localhost:6650"}) + if err != nil { + log.Fatal(err) + } + + defer client.Close() + + reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "topic-1", + StartMessageID: pulsar.EarliestMessageID(), + }) + if err != nil { + log.Fatal(err) + } + defer reader.Close() + + for reader.HasNext() { + msg, err := reader.Next(context.Background()) + if err != nil { + log.Fatal(err) + } + + fmt.Printf("Received message msgId: %#v -- content: '%s'\n", + msg.ID(), string(msg.Payload())) + } +} + +``` + +In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. 
Here's an example: + +```go + +lastSavedId := // Read last saved message id from external store as byte[] + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.DeserializeMessageID(lastSavedId), +}) + +``` + +#### How to use reader to read specific message + +```go + +client, err := NewClient(pulsar.ClientOptions{ + URL: lookupURL, +}) + +if err != nil { + log.Fatal(err) +} +defer client.Close() + +topic := "topic-1" +ctx := context.Background() + +// create producer +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: topic, + DisableBatching: true, +}) +if err != nil { + log.Fatal(err) +} +defer producer.Close() + +// send 10 messages +msgIDs := [10]MessageID{} +for i := 0; i < 10; i++ { + msgID, err := producer.Send(ctx, &pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("hello-%d", i)), + }) + assert.NoError(t, err) + assert.NotNil(t, msgID) + msgIDs[i] = msgID +} + +// create reader on 5th message (not included) +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: topic, + StartMessageID: msgIDs[4], +}) + +if err != nil { + log.Fatal(err) +} +defer reader.Close() + +// receive the remaining 5 messages +for i := 5; i < 10; i++ { + msg, err := reader.Next(context.Background()) + if err != nil { + log.Fatal(err) +} + +// create reader on 5th message (included) +readerInclusive, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: topic, + StartMessageID: msgIDs[4], + StartMessageIDInclusive: true, +}) + +if err != nil { + log.Fatal(err) +} +defer readerInclusive.Close() + +``` + +### Reader configuration + + Name | Description | Default +| :-------- | :---------- |:---------- | +| Topic | Topic specify the topic this consumer will subscribe to. This argument is required when constructing the reader. | | +| Name | Name set the reader name. | | +| Properties | Attach a set of application defined properties to the reader. 
These properties will be visible in the topic stats | | +| StartMessageID | StartMessageID initial reader positioning is done by specifying a message id. | | +| StartMessageIDInclusive | If true, the reader will start at the `StartMessageID`, included. Default is `false` and the reader will start from the "next" message | false | +| MessageChannel | MessageChannel sets a `MessageChannel` for the consumer. When a message is received, it will be pushed to the channel for consumption| | +| ReceiverQueueSize | ReceiverQueueSize sets the size of the consumer receive queue. | 1000 | +| SubscriptionRolePrefix| SubscriptionRolePrefix set the subscription role prefix. | “reader” | +| ReadCompacted | If enabled, the reader will read messages from the compacted topic rather than reading the full message backlog of the topic. ReadCompacted can only be enabled when reading from a persistent topic. | false| + +## Messages + +The Pulsar Go client provides a `ProducerMessage` struct that you can use to construct messages to produce on Pulsar topics. Here's an example message: + +```go + +msg := pulsar.ProducerMessage{ + Payload: []byte("Here is some message data"), + Key: "message-key", + Properties: map[string]string{ + "foo": "bar", + }, + EventTime: time.Now(), + ReplicationClusters: []string{"cluster1", "cluster3"}, +} + +if _, err := producer.send(msg); err != nil { + log.Fatalf("Could not publish message due to: %v", err) +} + +``` + +The following parameters are available for `ProducerMessage` objects: + +Parameter | Description +:---------|:----------- +`Payload` | The actual data payload of the message +`Value` | `Value` and `Payload` are mutually exclusive; `Value interface{}` is used for schema messages. +`Key` | The optional key associated with the message (particularly useful for things like topic compaction) +`OrderingKey` | OrderingKey sets the ordering key of the message. 
+`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message +`EventTime` | The timestamp associated with the message +`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. +`SequenceID` | Set the sequence id to assign to the current message +`DeliverAfter` | Request to deliver the message only after the specified relative delay +`DeliverAt` | Deliver the message only at or after the specified absolute timestamp + +## TLS encryption and authentication + +In order to use [TLS encryption](security-tls-transport.md), you'll need to configure your client to do so: + + * Use `pulsar+ssl` URL type + * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker + * Configure `Authentication` option + +Here's an example: + +```go + +opts := pulsar.ClientOptions{ + URL: "pulsar+ssl://my-cluster.com:6651", + TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr", + Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"), +} + +``` + +## OAuth2 authentication + +To use [OAuth2 authentication](security-oauth2.md), you'll need to configure your client to perform the following operations. +This example shows how to configure OAuth2 authentication. 
+ +```go + +oauth := pulsar.NewAuthenticationOAuth2(map[string]string{ + "type": "client_credentials", + "issuerUrl": "https://dev-kt-aa9ne.us.auth0.com", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "privateKey": "/path/to/privateKey", + "clientId": "0Xx...Yyxeny", + }) +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://my-cluster:6650", + Authentication: oauth, +}) + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-java.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-java.md new file mode 100644 index 0000000000000..067a3a10de1ed --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-java.md @@ -0,0 +1,1038 @@ +--- +id: client-libraries-java +title: Pulsar Java client +sidebar_label: "Java" +original_id: client-libraries-java +--- + +You can use a Pulsar Java client to create the Java [producer](#producer), [consumer](#consumer), and [readers](#reader) of messages and to perform [administrative tasks](admin-api-overview.md). The current Java client version is **@pulsar:version@**. + +All the methods in [producer](#producer), [consumer](#consumer), and [reader](#reader) of a Java client are thread-safe. + +Javadoc for the Pulsar client is divided into two domains by package as follows. + +Package | Description | Maven Artifact +:-------|:------------|:-------------- +[`org.apache.pulsar.client.api`](/api/client) | The producer and consumer API | [org.apache.pulsar:pulsar-client:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C@pulsar:version@%7Cjar) +[`org.apache.pulsar.client.admin`](/api/admin) | The Java [admin API](admin-api-overview.md) | [org.apache.pulsar:pulsar-client-admin:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-admin%7C@pulsar:version@%7Cjar) +`org.apache.pulsar.client.all` |Include both `pulsar-client` and `pulsar-client-admin`
    Both `pulsar-client` and `pulsar-client-admin` are shaded packages and they shade dependencies independently. Consequently, the applications using both `pulsar-client` and `pulsar-client-admin` have redundant shaded classes. It would be troublesome if you introduce new dependencies but forget to update shading rules.
    In this case, you can use `pulsar-client-all`, which shades dependencies only one time and reduces the size of dependencies. |[org.apache.pulsar:pulsar-client-all:@pulsar:version@](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-all%7C@pulsar:version@%7Cjar) + +This document focuses only on the client API for producing and consuming messages on Pulsar topics. For how to use the Java admin client, see [Pulsar admin interface](admin-api-overview.md). + +## Installation + +The latest version of the Pulsar Java client library is available via [Maven Central](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C@pulsar:version@%7Cjar). To use the latest version, add the `pulsar-client` library to your build configuration. + +### Maven + +If you use Maven, add the following information to the `pom.xml` file. + +```xml + + +@pulsar:version@ + + + + org.apache.pulsar + pulsar-client + ${pulsar.version} + + +``` + +### Gradle + +If you use Gradle, add the following information to the `build.gradle` file. + +```groovy + +def pulsarVersion = '@pulsar:version@' + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-client', version: pulsarVersion +} + +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +You can assign Pulsar protocol URLs to specific clusters and use the `pulsar` scheme. The default port is `6650`. The following is an example of `localhost`. + +```http + +pulsar://localhost:6650 + +``` + +If you have multiple brokers, the URL is as follows. + +```http + +pulsar://localhost:6550,localhost:6651,localhost:6652 + +``` + +A URL for a production Pulsar cluster is as follows. + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you use [TLS](security-tls-authentication.md) authentication, the URL is as follows. 
+ +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Client + +You can instantiate a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object using just a URL for the target Pulsar [cluster](reference-terminology.md#cluster) like this: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +``` + +If you have multiple brokers, you can initiate a PulsarClient like this: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650,localhost:6651,localhost:6652") + .build(); + +``` + +> ### Default broker URLs for standalone clusters +> If you run a cluster in [standalone mode](getting-started-standalone.md), the broker is available at the `pulsar://localhost:6650` URL by default. + +If you create a client, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +| Name | Type |
    Description
    | Default +|---|---|---|--- +`serviceUrl` | String | Service URL provider for Pulsar service | None +`authPluginClassName` | String | Name of the authentication plugin | None + `authParams` | String | Parameters for the authentication plugin

    **Example**
    key1:val1,key2:val2|None +`operationTimeoutMs`|long|Operation timeout |30000 +`statsIntervalSeconds`|long|Interval between each stats information

    Stats is activated with positive `statsInterval`

    Set `statsIntervalSeconds` to 1 second at least. |60 +`numIoThreads`| int| The number of threads used for handling connections to brokers | 1 +`numListenerThreads`|int|The number of threads used for handling message listeners. The listener thread pool is shared across all the consumers and readers using the "listener" model to get messages. For a given consumer, the listener is always invoked from the same thread to ensure ordering. If you want multiple threads to process a single topic, you need to create a [`shared`](https://pulsar.apache.org/docs/en/next/concepts-messaging/#shared) subscription and multiple consumers for this subscription. This does not ensure ordering.| 1 +`useTcpNoDelay`| boolean| Whether to use TCP no-delay flag on the connection to disable Nagle algorithm |true +`enableTls` |boolean | Whether to use TLS encryption on the connection. Note that this parameter is **deprecated**. If you want to enable TLS, use `pulsar+ssl://` in `serviceUrl` instead. | false + `tlsTrustCertsFilePath` |string |Path to the trusted TLS certificate file|None +`tlsAllowInsecureConnection`|boolean|Whether the Pulsar client accepts untrusted TLS certificate from broker | false +`tlsHostnameVerificationEnable` |boolean | Whether to enable TLS hostname verification|false +`concurrentLookupRequest`|int|The number of concurrent lookup requests allowed to send on each broker connection to prevent overload on broker|5000 +`maxLookupRequest`|int|The maximum number of lookup requests allowed on each broker connection to prevent overload on broker | 50000 +`maxNumberOfRejectedRequestPerConnection`|int|The maximum number of rejected requests of a broker in a certain time frame (30 seconds) after the current connection is closed and the client creates a new connection to connect to a different broker|50 +`keepAliveIntervalSeconds`|int|Seconds of keeping alive interval for each client broker connection|30 +`connectionTimeoutMs`|int|Duration of waiting for a connection to a 
broker to be established

    If the duration passes without a response from a broker, the connection attempt is dropped|10000 +`requestTimeoutMs`|int|Maximum duration for completing a request |60000 +`defaultBackoffIntervalNanos`|int| Default duration for a backoff interval | TimeUnit.MILLISECONDS.toNanos(100); +`maxBackoffIntervalNanos`|long|Maximum duration for a backoff interval|TimeUnit.SECONDS.toNanos(30) +`socks5ProxyAddress`|SocketAddress|SOCKS5 proxy address | None +`socks5ProxyUsername`|string|SOCKS5 proxy username | None +`socks5ProxyPassword`|string|SOCKS5 proxy password | None + +Check out the Javadoc for the {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} class for a full list of configurable parameters. + +> In addition to client-level configuration, you can also apply [producer](#configure-producer) and [consumer](#configure-consumer) specific configuration as described in sections below. + +## Producer + +In Pulsar, producers write messages to topics. Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object (as in the section [above](#client-configuration)), you can create a {@inject: javadoc:Producer:/client/org/apache/pulsar/client/api/Producer} for a specific Pulsar [topic](reference-terminology.md#topic). + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .create(); + +// You can then send messages to the broker and topic you specified: +producer.send("My message".getBytes()); + +``` + +By default, producers produce messages that consist of byte arrays. You can produce different types by specifying a message [schema](#schema). + +```java + +Producer stringProducer = client.newProducer(Schema.STRING) + .topic("my-topic") + .create(); +stringProducer.send("My message"); + +``` + +> Make sure that you close your producers, consumers, and clients when you do not need them. 
+ +> ```java +> +> producer.close(); +> consumer.close(); +> client.close(); +> +> +> ``` + +> +> Close operations can also be asynchronous: + +> ```java +> +> producer.closeAsync() +> .thenRun(() -> System.out.println("Producer closed")) +> .exceptionally((ex) -> { +> System.err.println("Failed to close producer: " + ex); +> return null; +> }); +> +> +> ``` + + +### Configure producer + +If you instantiate a `Producer` object by specifying only a topic name as the example above, the default configuration of producer is used. + +If you create a producer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +Name| Type |
    Description
    | Default +|---|---|---|--- +`topicName`| string| Topic name| null| +`producerName`| string|Producer name| null +`sendTimeoutMs`| long|Message send timeout in ms.
    If a message is not acknowledged by a server before the `sendTimeout` expires, an error occurs.|30000 +`blockIfQueueFull`|boolean|If it is set to `true`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer block, rather than failing and throwing errors.
    If it is set to `false`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer fail and `ProducerQueueIsFullError` exceptions occur.

    The `MaxPendingMessages` parameter determines the size of the outgoing message queue.|false +`maxPendingMessages`| int|The maximum size of a queue holding pending messages.

    For example, a message waiting to receive an acknowledgment from a [broker](reference-terminology.md#broker).

    By default, when the queue is full, all calls to the `Send` and `SendAsync` methods fail **unless** you set `BlockIfQueueFull` to `true`.|1000 +`maxPendingMessagesAcrossPartitions`|int|The maximum number of pending messages across partitions.

    Use the setting to lower the max pending messages for each partition ({@link #setMaxPendingMessages(int)}) if the total number exceeds the configured value.|50000 +`messageRoutingMode`| MessageRoutingMode|Message routing logic for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics).
    This logic applies only when no key is set on messages.
    Available options are as follows:
  • `pulsar.RoundRobinDistribution`: round robin
  • `pulsar.UseSinglePartition`: publish all messages to a single partition
  • `pulsar.CustomPartition`: a custom partitioning scheme
  • |
  • `pulsar.RoundRobinDistribution`
  • +`hashingScheme`| HashingScheme|Hashing function determining the partition where you publish a particular message (**partitioned topics only**).
    Available options are as follows:
  • `pulsar.JavastringHash`: the equivalent of `string.hashCode()` in Java
  • `pulsar.Murmur3_32Hash`: applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function
  • `pulsar.BoostHash`: applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library
  • |`HashingScheme.JavastringHash` +`cryptoFailureAction`| ProducerCryptoFailureAction|Producer should take action when encryption fails.
  • **FAIL**: if encryption fails, unencrypted messages fail to send.
  • **SEND**: if encryption fails, unencrypted messages are sent.
  • |`ProducerCryptoFailureAction.FAIL` +`batchingMaxPublishDelayMicros`| long|Batching time period of sending messages.|TimeUnit.MILLISECONDS.toMicros(1) +`batchingMaxMessages` |int|The maximum number of messages permitted in a batch.|1000 +`batchingEnabled`| boolean|Enable batching of messages. |true +`compressionType`|CompressionType|Message data compression type used by a producer.
    Available options:
  • [`LZ4`](https://github.com/lz4/lz4)
  • [`ZLIB`](https://zlib.net/)
  • [`ZSTD`](https://facebook.github.io/zstd/)
  • [`SNAPPY`](https://google.github.io/snappy/)
  • | No compression + +You can configure parameters if you do not want to use the default configuration. + +For a full list, see the Javadoc for the {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder} class. The following is an example. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .batchingMaxPublishDelay(10, TimeUnit.MILLISECONDS) + .sendTimeout(10, TimeUnit.SECONDS) + .blockIfQueueFull(true) + .create(); + +``` + +### Message routing + +When using partitioned topics, you can specify the routing mode whenever you publish messages using a producer. For more information on specifying a routing mode using the Java client, see the [Partitioned Topics cookbook](cookbooks-partitioned.md). + +### Async send + +You can publish messages [asynchronously](concepts-messaging.md#send-modes) using the Java client. With async send, the producer puts the message in a blocking queue and returns it immediately. Then the client library sends the message to the broker in the background. If the queue is full (max size configurable), the producer is blocked or fails immediately when calling the API, depending on arguments passed to the producer. + +The following is an example. + +```java + +producer.sendAsync("my-async-message".getBytes()).thenAccept(msgId -> { + System.out.println("Message with ID " + msgId + " successfully sent"); +}); + +``` + +As you can see from the example above, async send operations return a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId} wrapped in a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). 
+ +### Configure messages + +In addition to a value, you can set additional items on a given message: + +```java + +producer.newMessage() + .key("my-message-key") + .value("my-async-message".getBytes()) + .property("my-key", "my-value") + .property("my-other-key", "my-other-value") + .send(); + +``` + +You can terminate the builder chain with `sendAsync()` and get a future return. + +## Consumer + +In Pulsar, consumers subscribe to topics and handle messages that producers publish to those topics. You can instantiate a new [consumer](reference-terminology.md#consumer) by first instantiating a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object and passing it a URL for a Pulsar broker (as [above](#client-configuration)). + +Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object, you can create a {@inject: javadoc:Consumer:/client/org/apache/pulsar/client/api/Consumer} by specifying a [topic](reference-terminology.md#topic) and a [subscription](concepts-messaging.md#subscription-modes). + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscribe(); + +``` + +The `subscribe` method will auto subscribe the consumer to the specified topic and subscription. One way to make the consumer listen on the topic is to set up a `while` loop. In this example loop, the consumer listens for messages, prints the contents of any received message, and then [acknowledges](reference-terminology.md#acknowledgment-ack) that the message has been processed. If the processing logic fails, you can use [negative acknowledgement](reference-terminology.md#acknowledgment-ack) to redeliver the message later. 
+ +```java + +while (true) { + // Wait for a message + Message msg = consumer.receive(); + + try { + // Do something with the message + System.out.println("Message received: " + new String(msg.getData())); + + // Acknowledge the message so that it can be deleted by the message broker + consumer.acknowledge(msg); + } catch (Exception e) { + // Message failed to process, redeliver later + consumer.negativeAcknowledge(msg); + } +} + +``` + +If you don't want to block your main thread and rather listen constantly for new messages, consider using a `MessageListener`. + +```java + +MessageListener myMessageListener = (consumer, msg) -> { + try { + System.out.println("Message received: " + new String(msg.getData())); + consumer.acknowledge(msg); + } catch (Exception e) { + consumer.negativeAcknowledge(msg); + } +} + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .messageListener(myMessageListener) + .subscribe(); + +``` + +### Configure consumer + +If you instantiate a `Consumer` object by specifying only a topic and subscription name as in the example above, the consumer uses the default configuration. + +When you create a consumer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + + Name|Type |
    Description
    | Default +|---|---|---|--- +`topicNames`| Set<String>| Topic name| Sets.newTreeSet() + `topicsPattern`|Pattern| Topic pattern |None +`subscriptionName`|String| Subscription name| None +`subscriptionType`|SubscriptionType| Subscription type
    Four subscription types are available:
  • Exclusive
  • Failover
  • Shared
  • Key_Shared
  • |SubscriptionType.Exclusive +`receiverQueueSize` |int | Size of a consumer's receiver queue.

    For example, the number of messages accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.| 1000 +`acknowledgementsGroupTimeMicros`|long|Group a consumer acknowledgment for a specified time.

    By default, a consumer uses 100ms grouping time to send out acknowledgments to a broker.

    Setting a group time of 0 sends out acknowledgments immediately.

    A longer ack group time is more efficient at the expense of a slight increase in message re-deliveries after a failure.|TimeUnit.MILLISECONDS.toMicros(100) +`negativeAckRedeliveryDelayMicros`|long|Delay to wait before redelivering messages that failed to be processed.

    When an application uses {@link Consumer#negativeAcknowledge(Message)}, failed messages are redelivered after a fixed timeout. |TimeUnit.MINUTES.toMicros(1) +`maxTotalReceiverQueueSizeAcrossPartitions`|int |The max total receiver queue size across partitions.

    This setting reduces the receiver queue size for individual partitions if the total receiver queue size exceeds this value.|50000 +`consumerName`|String|Consumer name|null +`ackTimeoutMillis`|long|Timeout of unacked messages|0 +`tickDurationMillis`|long|Granularity of the ack-timeout redelivery.

    Using a higher `tickDurationMillis` reduces the memory overhead to track messages when setting ack-timeout to a bigger value (for example, 1 hour).|1000 +`priorityLevel`|int|Priority level for a consumer to which a broker gives more priority while dispatching messages in the shared subscription mode.

    The broker follows descending priorities. For example, 0=max-priority, 1, 2,...

    In shared subscription mode, the broker **first dispatches messages to the max priority level consumers if they have permits**. Otherwise, the broker considers next priority level consumers.

    **Example 1**
    If a subscription has consumerA with `priorityLevel` 0 and consumerB with `priorityLevel` 1, then the broker **only dispatches messages to consumerA until it runs out of permits** and then starts dispatching messages to consumerB.

    **Example 2**
    Consumer, Priority Level, Permits
    C1, 0, 2
    C2, 0, 1
    C3, 0, 1
    C4, 1, 2
    C5, 1, 1

    Order in which a broker dispatches messages to consumers is: C1, C2, C3, C1, C4, C5, C4.|0 +`cryptoFailureAction`|ConsumerCryptoFailureAction|The action the consumer takes when it receives a message that cannot be decrypted.
  • **FAIL**: this is the default option to fail messages until crypto succeeds.
  • **DISCARD**:silently acknowledge and not deliver message to an application.
  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

  • The decompression of message fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in the batch.

    A delivered encrypted message contains {@link EncryptionContext}, which holds the encryption and compression information that the application can use to decrypt the consumed message payload.|
  • ConsumerCryptoFailureAction.FAIL
  • +`properties`|SortedMap|A name or value property of this consumer.

    `properties` is application defined metadata attached to a consumer.

    When getting a topic stats, associate this metadata with the consumer stats for easier identification.|new TreeMap() +`readCompacted`|boolean|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up until the point in the topic message backlog that has been compacted. Beyond that point, messages are sent as normal.

    You can enable `readCompacted` only on subscriptions to persistent topics that have a single active consumer (such as failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +`subscriptionInitialPosition`|SubscriptionInitialPosition|Initial position at which to set the cursor when subscribing to a topic for the first time.|SubscriptionInitialPosition.Latest +`patternAutoDiscoveryPeriod`|int|Topic auto discovery period when using a pattern for topic's consumer.

    The default and minimum value is 1 minute.|1 +`regexSubscriptionMode`|RegexSubscriptionMode|When subscribing to a topic using a regular expression, you can pick a certain type of topics.

  • **PersistentOnly**: only subscribe to persistent topics.
  • **NonPersistentOnly**: only subscribe to non-persistent topics.
  • **AllTopics**: subscribe to both persistent and non-persistent topics.
  • |RegexSubscriptionMode.PersistentOnly +`deadLetterPolicy`|DeadLetterPolicy|Dead letter policy for consumers.

    By default, some messages may be redelivered many times, possibly without ever stopping.

    With the dead letter mechanism, messages have a maximum redelivery count. **When exceeding the maximum number of redeliveries, messages are sent to the Dead Letter Topic and acknowledged automatically**.

    You can enable the dead letter mechanism by setting `deadLetterPolicy`.

    **Example**

    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10).build())
    .subscribe();


    Default dead letter topic name is `{TopicName}-{Subscription}-DLQ`.

    To set a custom dead letter topic name:
    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10)
    .deadLetterTopic("your-topic-name").build())
    .subscribe();


    When specifying the dead letter policy while not specifying `ackTimeoutMillis`, you can set the ack timeout to 30000 milliseconds.|None +`autoUpdatePartitions`|boolean|If `autoUpdatePartitions` is enabled, a consumer automatically subscribes to newly added partitions.

    **Note**: this is only for partitioned consumers.|true +`replicateSubscriptionState`|boolean|If `replicateSubscriptionState` is enabled, a subscription state is replicated to geo-replicated clusters.|false + +You can configure parameters if you do not want to use the default configuration. For a full list, see the Javadoc for the {@inject: javadoc:ConsumerBuilder:/client/org/apache/pulsar/client/api/ConsumerBuilder} class. + +The following is an example. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .ackTimeout(10, TimeUnit.SECONDS) + .subscriptionType(SubscriptionType.Exclusive) + .subscribe(); + +``` + +### Async receive + +The `receive` method receives messages synchronously (the consumer process is blocked until a message is available). You can also use [async receive](concepts-messaging.md#receive-modes), which returns a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) object immediately once a new message is available. + +The following is an example. + +```java + +CompletableFuture asyncMessage = consumer.receiveAsync(); + +``` + +Async receive operations return a {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} wrapped inside of a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). + +### Batch receive + +Use `batchReceive` to receive multiple messages for each call. + +The following is an example. + +```java + +Messages messages = consumer.batchReceive(); +for (Object message : messages) { + // do something +} +consumer.acknowledge(messages) + +``` + +:::note + +Batch receive policy limits the number and bytes of messages in a single batch. You can specify a timeout to wait for enough messages. +The batch receive is completed if any of the following condition is met: enough number of messages, bytes of messages, wait timeout. 
+ +```java + +Consumer consumer = client.newConsumer() +.topic("my-topic") +.subscriptionName("my-subscription") +.batchReceivePolicy(BatchReceivePolicy.builder() +.maxNumMessages(100) +.maxNumBytes(1024 * 1024) +.timeout(200, TimeUnit.MILLISECONDS) +.build()) +.subscribe(); + +``` + +The default batch receive policy is: + +```java + +BatchReceivePolicy.builder() +.maxNumMessage(-1) +.maxNumBytes(10 * 1024 * 1024) +.timeout(100, TimeUnit.MILLISECONDS) +.build(); + +``` + +::: + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously using [multi-topic subscriptions](concepts-messaging.md#multi-topic-subscriptions). To use multi-topic subscriptions you can supply either a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. + +The followings are some examples. + +```java + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +ConsumerBuilder consumerBuilder = pulsarClient.newConsumer() + .subscriptionName(subscription); + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("public/default/.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(allTopicsInNamespace) + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("public/default/foo.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(someTopicsInNamespace) + .subscribe(); + +``` + +In the above example, the consumer subscribes to the `persistent` topics that can match the topic name pattern. 
If you want the consumer subscribes to all `persistent` and `non-persistent` topics that can match the topic name pattern, set `subscriptionTopicsMode` to `RegexSubscriptionMode.AllTopics`. + +```java + +Pattern pattern = Pattern.compile("public/default/.*"); +pulsarClient.newConsumer() + .subscriptionName("my-sub") + .topicsPattern(pattern) + .subscriptionTopicsMode(RegexSubscriptionMode.AllTopics) + .subscribe(); + +``` + +:::note + +By default, the `subscriptionTopicsMode` of the consumer is `PersistentOnly`. Available options of `subscriptionTopicsMode` are `PersistentOnly`, `NonPersistentOnly`, and `AllTopics`. + +::: + +You can also subscribe to an explicit list of topics (across namespaces if you wish): + +```java + +List topics = Arrays.asList( + "topic-1", + "topic-2", + "topic-3" +); + +Consumer multiTopicConsumer = consumerBuilder + .topics(topics) + .subscribe(); + +// Alternatively: +Consumer multiTopicConsumer = consumerBuilder + .topic( + "topic-1", + "topic-2", + "topic-3" + ) + .subscribe(); + +``` + +You can also subscribe to multiple topics asynchronously using the `subscribeAsync` method rather than the synchronous `subscribe` method. The following is an example. + +```java + +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default.*"); +consumerBuilder + .topics(topics) + .subscribeAsync() + .thenAccept(this::receiveMessageFromConsumer); + +private void receiveMessageFromConsumer(Object consumer) { + ((Consumer)consumer).receiveAsync().thenAccept(message -> { + // Do something with the received message + receiveMessageFromConsumer(consumer); + }); +} + +``` + +### Subscription modes + +Pulsar has various [subscription modes](concepts-messaging#subscription-modes) to match different scenarios. A topic can have multiple subscriptions with different subscription modes. However, a subscription can only have one subscription mode at a time. 
+ +A subscription is identical with the subscription name which can specify only one subscription mode at a time. You cannot change the subscription mode unless all existing consumers of this subscription are offline. + +Different subscription modes have different message distribution modes. This section describes the differences of subscription modes and how to use them. + +In order to better describe their differences, assuming you have a topic named "my-topic", and the producer has published 10 messages. + +```java + +Producer producer = client.newProducer(Schema.STRING) + .topic("my-topic") + .enableBatching(false) + .create(); +// 3 messages with "key-1", 3 messages with "key-2", 2 messages with "key-3" and 2 messages with "key-4" +producer.newMessage().key("key-1").value("message-1-1").send(); +producer.newMessage().key("key-1").value("message-1-2").send(); +producer.newMessage().key("key-1").value("message-1-3").send(); +producer.newMessage().key("key-2").value("message-2-1").send(); +producer.newMessage().key("key-2").value("message-2-2").send(); +producer.newMessage().key("key-2").value("message-2-3").send(); +producer.newMessage().key("key-3").value("message-3-1").send(); +producer.newMessage().key("key-3").value("message-3-2").send(); +producer.newMessage().key("key-4").value("message-4-1").send(); +producer.newMessage().key("key-4").value("message-4-2").send(); + +``` + +#### Exclusive + +Create a new consumer and subscribe with the `Exclusive` subscription mode. + +```java + +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Exclusive) + .subscribe() + +``` + +Only the first consumer is allowed to the subscription, other consumers receive an error. The first consumer receives all 10 messages, and the consuming order is the same as the producing order. 
+ +:::note + +If topic is a partitioned topic, the first consumer subscribes to all partitioned topics, other consumers are not assigned with partitions and receive an error. + +::: + +#### Failover + +Create new consumers and subscribe with the`Failover` subscription mode. + +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +//conumser1 is the active consumer, consumer2 is the standby consumer. +//consumer1 receives 5 messages and then crashes, consumer2 takes over as an active consumer. + +``` + +Multiple consumers can attach to the same subscription, yet only the first consumer is active, and others are standby. When the active consumer is disconnected, messages will be dispatched to one of standby consumers, and the standby consumer then becomes active consumer. + +If the first active consumer is disconnected after receiving 5 messages, the standby consumer becomes active consumer. Consumer1 will receive: + +``` + +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-2", "message-2-1") +("key-2", "message-2-2") + +``` + +consumer2 will receive: + +``` + +("key-2", "message-2-3") +("key-3", "message-3-1") +("key-3", "message-3-2") +("key-4", "message-4-1") +("key-4", "message-4-2") + +``` + +:::note + +If a topic is a partitioned topic, each partition has only one active consumer, messages of one partition are distributed to only one consumer, and messages of multiple partitions are distributed to multiple consumers. 
+ +::: + +#### Shared + +Create new consumers and subscribe with `Shared` subscription mode: + +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers. + +``` + +In shared subscription mode, multiple consumers can attach to the same subscription and messages are delivered in a round robin distribution across consumers. + +If a broker dispatches only one message at a time, consumer1 receives the following information. + +``` + +("key-1", "message-1-1") +("key-1", "message-1-3") +("key-2", "message-2-2") +("key-3", "message-3-1") +("key-4", "message-4-1") + +``` + +consumer2 receives the following information. + +``` + +("key-1", "message-1-2") +("key-2", "message-2-1") +("key-2", "message-2-3") +("key-3", "message-3-2") +("key-4", "message-4-2") + +``` + +`Shared` subscription is different from `Exclusive` and `Failover` subscription modes. `Shared` subscription has better flexibility, but cannot provide order guarantee. + +#### Key_shared + +This is a new subscription mode since 2.4.0 release, create new consumers and subscribe with `Key_Shared` subscription mode. + +```java + +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers. + +``` + +Just like in the `Shared` subscription, all consumers in the `Key_Shared` subscription mode can attach to the same subscription. 
But the `Key_Shared` subscription mode is different from the `Shared` subscription. In the `Key_Shared` subscription mode, messages with the same key are delivered to only one consumer in order. The possible distribution of messages between different consumers (by default we do not know in advance which keys will be assigned to a consumer, but a key will only be assigned to a consumer at the same time). + +consumer1 receives the following information. + +``` + +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-3", "message-3-1") +("key-3", "message-3-2") + +``` + +consumer2 receives the following information. + +``` + +("key-2", "message-2-1") +("key-2", "message-2-2") +("key-2", "message-2-3") +("key-4", "message-4-1") +("key-4", "message-4-2") + +``` + +If batching is enabled at the producer side, messages with different keys are added to a batch by default. The broker will dispatch the batch to the consumer, so the default batch mechanism may break the Key_Shared subscription guaranteed message distribution semantics. The producer needs to use the `KeyBasedBatcher`. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .batcherBuilder(BatcherBuilder.KEY_BASED) + .create(); + +``` + +Or the producer can disable batching. + +```java + +Producer producer = client.newProducer() + .topic("my-topic") + .enableBatching(false) + .create(); + +``` + +:::note + +If the message key is not specified, messages without key are dispatched to one consumer in order by default. + +::: + +## Reader + +With the [reader interface](concepts-clients.md#reader-interface), Pulsar clients can "manually position" themselves within a topic and reading all messages from a specified message onward. The Pulsar API for Java enables you to create {@inject: javadoc:Reader:/client/org/apache/pulsar/client/api/Reader} objects by specifying a topic and a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId}. 
+ +The following is an example. + +```java + +byte[] msgIdBytes = // Some message ID byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +while (true) { + Message message = reader.readNext(); + // Process message +} + +``` + +In the example above, a `Reader` object is instantiated for a specific topic and message (by ID); the reader iterates over each message in the topic after the message identified by `msgIdBytes` (how that value is obtained depends on the application). + +The code sample above shows pointing the `Reader` object to a specific message (by ID), but you can also use `MessageId.earliest` to point to the earliest available message on the topic or `MessageId.latest` to point to the most recent available message. + +### Configure reader +When you create a reader, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +| Name | Type|
    Description
    | Default +|---|---|---|--- +`topicName`|String|Topic name. |None +`receiverQueueSize`|int|Size of a consumer's receiver queue.

    For example, the number of messages that can be accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.|1000 +`readerListener`|ReaderListener<T>|A listener that is called for message received.|None +`readerName`|String|Reader name.|null +`subscriptionName`|String| Subscription name|When there is a single topic, the default subscription name is `"reader-" + 10-digit UUID`.
    When there are multiple topics, the default subscription name is `"multiTopicsReader-" + 10-digit UUID`. +`subscriptionRolePrefix`|String|Prefix of subscription role. |null +`cryptoKeyReader`|CryptoKeyReader|Interface that abstracts the access to a key store.|null +`cryptoFailureAction`|ConsumerCryptoFailureAction|Consumer should take action when it receives a message that can not be decrypted.
  • **FAIL**: this is the default option to fail messages until crypto succeeds.
  • **DISCARD**: silently acknowledge the message and do not deliver it to an application.
  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

  • The message decompression fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in the batch.

    The delivered encrypted message contains {@link EncryptionContext}, which holds the encryption and compression information that the application can use to decrypt the consumed message payload.|
  • ConsumerCryptoFailureAction.FAIL
  • +`readCompacted`|boolean|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (for example, failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +`resetIncludeHead`|boolean|If set to true, the first message to be returned is the one specified by `messageId`.

    If set to false, the first message to be returned is the one next to the message specified by `messageId`.|false + +### Sticky key range reader + +In sticky key range reader, broker will only dispatch messages which hash of the message key contains by the specified key hash range. Multiple key hash ranges can be specified on a reader. + +The following is an example to create a sticky key range reader. + +```java + +pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.earliest) + .keyHashRange(Range.of(0, 10000), Range.of(20001, 30000)) + .create(); + +``` + +Total hash range size is 65536, so the max end of the range should be less than or equal to 65535. + +## Schema + +In Pulsar, all message data consists of byte arrays "under the hood." [Message schemas](schema-get-started.md) enable you to use other types of data when constructing and handling messages (from simple types like strings to more complex, application-specific types). If you construct, say, a [producer](#producer) without specifying a schema, then the producer can only produce messages of type `byte[]`. The following is an example. + +```java + +Producer producer = client.newProducer() + .topic(topic) + .create(); + +``` + +The producer above is equivalent to a `Producer` (in fact, you should *always* explicitly specify the type). If you'd like to use a producer for a different type of data, you'll need to specify a **schema** that informs Pulsar which data type will be transmitted over the [topic](reference-terminology.md#topic). 
+ +### AvroBaseStructSchema example + +Let's say that you have a `SensorReading` class that you'd like to transmit over a Pulsar topic: + +```java + +public class SensorReading { + public float temperature; + + public SensorReading(float temperature) { + this.temperature = temperature; + } + + // A no-arg constructor is required + public SensorReading() { + } + + public float getTemperature() { + return temperature; + } + + public void setTemperature(float temperature) { + this.temperature = temperature; + } +} + +``` + +You could then create a `Producer` (or `Consumer`) like this: + +```java + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-readings") + .create(); + +``` + +The following schema formats are currently available for Java: + +* No schema or the byte array schema (which can be applied using `Schema.BYTES`): + + ```java + + Producer bytesProducer = client.newProducer(Schema.BYTES) + .topic("some-raw-bytes-topic") + .create(); + + ``` + + Or, equivalently: + + ```java + + Producer bytesProducer = client.newProducer() + .topic("some-raw-bytes-topic") + .create(); + + ``` + +* `String` for normal UTF-8-encoded string data. Apply the schema using `Schema.STRING`: + + ```java + + Producer stringProducer = client.newProducer(Schema.STRING) + .topic("some-string-topic") + .create(); + + ``` + +* Create JSON schemas for POJOs using `Schema.JSON`. The following is an example. + + ```java + + Producer pojoProducer = client.newProducer(Schema.JSON(MyPojo.class)) + .topic("some-pojo-topic") + .create(); + + ``` + +* Generate Protobuf schemas using `Schema.PROTOBUF`. The following example shows how to create the Protobuf schema and use it to instantiate a new producer: + + ```java + + Producer protobufProducer = client.newProducer(Schema.PROTOBUF(MyProtobuf.class)) + .topic("some-protobuf-topic") + .create(); + + ``` + +* Define Avro schemas with `Schema.AVRO`. 
The following code snippet demonstrates how to create and use Avro schema. + + ```java + + Producer avroProducer = client.newProducer(Schema.AVRO(MyAvro.class)) + .topic("some-avro-topic") + .create(); + + ``` + +### ProtobufNativeSchema example + +For example of ProtobufNativeSchema, see [`SchemaDefinition` in `Complex type`](schema-understand.md#complex-type). + +## Authentication + +Pulsar currently supports three authentication schemes: [TLS](security-tls-authentication.md), [Athenz](security-athenz.md), and [Oauth2](security-oauth2.md). You can use the Pulsar Java client with all of them. + +### TLS Authentication + +To use [TLS](security-tls-authentication.md), `enableTls` method is deprecated and you need to use "pulsar+ssl://" in serviceUrl to enable, point your Pulsar client to a TLS cert path, and provide paths to cert and key files. + +The following is an example. + +```java + +Map authParams = new HashMap(); +authParams.put("tlsCertFile", "/path/to/client-cert.pem"); +authParams.put("tlsKeyFile", "/path/to/client-key.pem"); + +Authentication tlsAuth = AuthenticationFactory + .create(AuthenticationTls.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(tlsAuth) + .build(); + +``` + +### Athenz + +To use [Athenz](security-athenz.md) as an authentication provider, you need to [use TLS](#tls-authentication) and provide values for four parameters in a hash: + +* `tenantDomain` +* `tenantService` +* `providerDomain` +* `privateKey` + +You can also set an optional `keyId`. The following is an example. 
+ +```java + +Map authParams = new HashMap(); +authParams.put("tenantDomain", "shopping"); // Tenant domain name +authParams.put("tenantService", "some_app"); // Tenant service name +authParams.put("providerDomain", "pulsar"); // Provider domain name +authParams.put("privateKey", "file:///path/to/private.pem"); // Tenant private key path +authParams.put("keyId", "v1"); // Key id for the tenant private key (optional, default: "0") + +Authentication athenzAuth = AuthenticationFactory + .create(AuthenticationAthenz.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(athenzAuth) + .build(); + +``` + +> #### Supported pattern formats +> The `privateKey` parameter supports the following three pattern formats: +> * `file:///path/to/file` +> * `file:/path/to/file` +> * `data:application/x-pem-file;base64,` + +### Oauth2 + +The following example shows how to use [Oauth2](security-oauth2.md) as an authentication provider for the Pulsar Java client. + +You can use the factory method to configure authentication for Pulsar Java client. + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactoryOAuth2.clientCredentials(this.issuerUrl, this.credentialsUrl, this.audience)) + .build(); + +``` + +In addition, you can also use the encoded parameters to configure authentication for Pulsar Java client. 
+ +```java + +Authentication auth = AuthenticationFactory + .create(AuthenticationOAuth2.class.getName(), "{"type":"client_credentials","privateKey":"...","issuerUrl":"...","audience":"..."}"); +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication(auth) + .build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-node.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-node.md new file mode 100644 index 0000000000000..1ff37b2629466 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-node.md @@ -0,0 +1,643 @@ +--- +id: client-libraries-node +title: The Pulsar Node.js client +sidebar_label: "Node.js" +original_id: client-libraries-node +--- + +The Pulsar Node.js client can be used to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Node.js. + +All the methods in [producers](#producers), [consumers](#consumers), and [readers](#readers) of a Node.js client are thread-safe. + +For 1.3.0 or later versions, [type definitions](https://github.com/apache/pulsar-client-node/blob/master/index.d.ts) used in TypeScript are available. + +## Installation + +You can install the [`pulsar-client`](https://www.npmjs.com/package/pulsar-client) library via [npm](https://www.npmjs.com/). + +### Requirements +Pulsar Node.js client library is based on the C++ client library. +Follow [these instructions](client-libraries-cpp.md#compilation) and install the Pulsar C++ client library. + +### Compatibility + +Compatibility between each version of the Node.js client and the C++ client is as follows: + +| Node.js client | C++ client | +| :------------- | :------------- | +| 1.0.0 | 2.3.0 or later | +| 1.1.0 | 2.4.0 or later | +| 1.2.0 | 2.5.0 or later | + +If an incompatible version of the C++ client is installed, you may fail to build or run this library. 
+ +### Installation using npm + +Install the `pulsar-client` library via [npm](https://www.npmjs.com/): + +```shell + +$ npm install pulsar-client + +``` + +:::note + +Also, this library works only in Node.js 10.x or later because it uses the [`node-addon-api`](https://github.com/nodejs/node-addon-api) module to wrap the C++ library. + +::: + +## Connection URLs +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here is an example for `localhost`: + +```http + +pulsar://localhost:6650 + +``` + +A URL for a production Pulsar cluster may look something like this: + +```http + +pulsar://pulsar.us-west.example.com:6650 + +``` + +If you are using [TLS encryption](security-tls-transport.md) or [TLS Authentication](security-tls-authentication.md), the URL looks like this: + +```http + +pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +## Create a client + +In order to interact with Pulsar, you first need a client object. You can create a client instance using a `new` operator and the `Client` method, passing in a client options object (more on configuration [below](#client-configuration)). + +Here is an example: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + await client.close(); +})(); + +``` + +### Client configuration + +The following configurable parameters are available for Pulsar clients: + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `serviceUrl` | The connection URL for the Pulsar cluster. See [above](#connection-urls) for more info. | | +| `authentication` | Configure the authentication provider. (default: no authentication). See [TLS Authentication](security-tls-authentication.md) for more info. 
| | +| `operationTimeoutSeconds` | The timeout for Node.js client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries occur until this threshold is reached, at which point the operation fails. | 30 | +| `ioThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker). | 1 | +| `messageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)). | 1 | +| `concurrentLookupRequest` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 50000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 50000 | +| `tlsTrustCertsFilePath` | The file path for the trusted TLS certificate. | | +| `tlsValidateHostname` | The boolean value of setup whether to enable TLS hostname verification. | `false` | +| `tlsAllowInsecureConnection` | The boolean value of setup whether the Pulsar client accepts untrusted TLS certificate from broker. | `false` | +| `statsIntervalInSeconds` | Interval between each stat info. Stats is activated with positive statsInterval. The value should be set to 1 second at least | 600 | +| `log` | A function that is used for logging. | `console.log` | + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Node.js producers using a producer configuration object. + +Here is an example: + +```JavaScript + +const producer = await client.createProducer({ + topic: 'my-topic', +}); + +await producer.send({ + data: Buffer.from("Hello, Pulsar"), +}); + +await producer.close(); + +``` + +> #### Promise operation +> When you create a new Pulsar producer, the operation returns `Promise` object and get producer instance or an error through executor function. 
+> In this example, using await operator instead of executor function. + +### Producer operations + +Pulsar Node.js producers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `send(Object)` | Publishes a [message](#messages) to the producer's topic. When the message is successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object whose result is the message ID runs executor function. | `Promise` | +| `flush()` | Sends message from send queue to Pulsar broker. When the message is successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object runs executor function. | `Promise` | +| `close()` | Closes the producer and releases all resources allocated to it. Once `close()` is called, no more messages are accepted from the publisher. This method returns a Promise object. It runs the executor function when all pending publish requests are persisted by Pulsar. If an error is thrown, no pending writes are retried. | `Promise` | +| `getProducerName()` | Getter method of the producer name. | `string` | +| `getTopic()` | Getter method of the name of the topic. | `string` | + +### Producer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer publishes messages. | | +| `producerName` | A name for the producer. If you do not explicitly assign a name, Pulsar automatically generates a globally unique name. If you choose to explicitly assign a name, it needs to be unique across *all* Pulsar clusters, otherwise the creation operation throws an error. | | +| `sendTimeoutMs` | When publishing a message to a topic, the producer waits for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). If a message is not acknowledged within the threshold set by this parameter, an error is thrown. 
If you set `sendTimeoutMs` to -1, the timeout is set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30000 | +| `initialSequenceId` | The initial sequence ID of the message. When producer send message, add sequence ID to message. The ID is increased each time to send. | | +| `maxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `send` method fails *unless* `blockIfQueueFull` is set to `true`. | 1000 | +| `maxPendingMessagesAcrossPartitions` | The maximum size of the sum of partition's pending queue. | 50000 | +| `blockIfQueueFull` | If set to `true`, the producer's `send` method waits when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `maxPendingMessages` parameter); if set to `false` (the default), `send` operations fails and throw a error when the queue is full. | `false` | +| `messageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-messaging.md#partitioned-topics)). This logic is applied only when no key is set on messages. The available options are: round robin (`RoundRobinDistribution`), or publishing all messages to a single partition (`UseSinglePartition`, the default). | `UseSinglePartition` | +| `hashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `JavaStringHash` (the equivalent of `String.hashCode()` in Java), `Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library). 
| `BoostHash` | +| `compressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), and [`Zlib`](https://zlib.net/), [ZSTD](https://github.com/facebook/zstd/), [SNAPPY](https://github.com/google/snappy/). | Compression None | +| `batchingEnabled` | If set to `true`, the producer send message as batch. | `true` | +| `batchingMaxPublishDelayMs` | The maximum time of delay sending message in batching. | 10 | +| `batchingMaxMessages` | The maximum size of sending message in each time of batching. | 1000 | +| `properties` | The metadata of producer. | | + +### Producer example + +This example creates a Node.js producer for the `my-topic` topic and sends 10 messages to that topic: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a producer + const producer = await client.createProducer({ + topic: 'my-topic', + }); + + // Send messages + for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); + } + await producer.flush(); + + await producer.close(); + await client.close(); +})(); + +``` + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Node.js consumers using a consumer configuration object. 
+ +Here is an example: + +```JavaScript + +const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', +}); + +const msg = await consumer.receive(); +console.log(msg.getData().toString()); +consumer.acknowledge(msg); + +await consumer.close(); + +``` + +> #### Promise operation +> When you create a new Pulsar consumer, the operation returns `Promise` object and get consumer instance or an error through executor function. +> In this example, using await operator instead of executor function. + +### Consumer operations + +Pulsar Node.js consumers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `receive()` | Receives a single message from the topic. When the message is available, the Promise object run executor function and get message object. | `Promise` | +| `receive(Number)` | Receives a single message from the topic with specific timeout in milliseconds. | `Promise` | +| `acknowledge(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message object. | `void` | +| `acknowledgeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID object. | `void` | +| `acknowledgeCumulative(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `acknowledgeCumulative` method returns void, and send the ack to the broker asynchronously. After that, the messages are *not* redelivered to the consumer. Cumulative acking can not be used with a [shared](concepts-messaging.md#shared) subscription type. | `void` | +| `acknowledgeCumulativeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message ID. 
| `void` | +| `negativeAcknowledge(Message)`| [Negatively acknowledges](reference-terminology.md#negative-acknowledgment-nack) a message to the Pulsar broker by message object. | `void` | +| `negativeAcknowledgeId(MessageId)` | [Negatively acknowledges](reference-terminology.md#negative-acknowledgment-nack) a message to the Pulsar broker by message ID object. | `void` | +| `close()` | Closes the consumer, disabling its ability to receive messages from the broker. | `Promise` | +| `unsubscribe()` | Unsubscribes the subscription. | `Promise` | + +### Consumer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar topic on which the consumer establishes a subscription and listen for messages. | | +| `topics` | The array of topics. | | +| `topicsPattern` | The regular expression for topics. | | +| `subscription` | The subscription name for this consumer. | | +| `subscriptionType` | Available options are `Exclusive`, `Shared`, `Key_Shared`, and `Failover`. | `Exclusive` | +| `subscriptionInitialPosition` | Initial position at which to set cursor when subscribing to a topic at first time. | `SubscriptionInitialPosition.Latest` | +| `ackTimeoutMs` | Acknowledge timeout in milliseconds. | 0 | +| `nAckRedeliverTimeoutMs` | Delay to wait before redelivering messages that failed to be processed. | 60000 | +| `receiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000 | +| `receiverQueueSizeAcrossPartitions` | Set the max total receiver queue size across partitions. This setting is used to reduce the receiver queue size for individual partitions if the total exceeds this value. | 50000 | +| `consumerName` | The name of consumer. 
Currently (v2.4.1), [failover](concepts-messaging.md#failover) mode uses the consumer name for ordering. | | +| `properties` | The metadata of consumer. | | +| `listener`| A listener that is called for a message received. | | +| `readCompacted`| If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up to the point in the topic message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (like failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`. | false | + +### Consumer example + +This example creates a Node.js consumer with the `my-subscription` subscription on the `my-topic` topic, receives messages, prints the content that arrive, and acknowledges each message to the Pulsar broker for 10 times: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a consumer + const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', + subscriptionType: 'Exclusive', + }); + + // Receive messages + for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); + } + + await consumer.close(); + await client.close(); +})(); + +``` + +Instead a consumer can be created with `listener` to process messages. + +```JavaScript + +// Create a consumer +const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', + subscriptionType: 'Exclusive', + listener: (msg, msgConsumer) => { + console.log(msg.getData().toString()); + msgConsumer.acknowledge(msg); + }, +}); + +``` + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recently unacked message). You can [configure](#reader-configuration) Node.js readers using a reader configuration object. 
+ +Here is an example: + +```JavaScript + +const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), +}); + +const msg = await reader.readNext(); +console.log(msg.getData().toString()); + +await reader.close(); + +``` + +### Reader operations + +Pulsar Node.js readers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `readNext()` | Receives the next message on the topic (analogous to the `receive` method for [consumers](#consumer-operations)). When the message is available, the Promise object run executor function and get message object. | `Promise` | +| `readNext(Number)` | Receives a single message from the topic with specific timeout in milliseconds. | `Promise` | +| `hasNext()` | Return whether the broker has next message in target topic. | `Boolean` | +| `close()` | Closes the reader, disabling its ability to receive messages from the broker. | `Promise` | + +### Reader configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader establishes a subscription and listen for messages. | | +| `startMessageId` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `Pulsar.MessageId.earliest` (the earliest available message on the topic), `Pulsar.MessageId.latest` (the latest available message on the topic), or a message ID object for a position that is not earliest or latest. | | +| `receiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `readNext`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 | +| `readerName` | The name of the reader. 
| | +| `subscriptionRolePrefix` | The subscription role prefix. | | +| `readCompacted` | If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up until the point in the topic message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (like failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`. | `false` | + + +### Reader example + +This example creates a Node.js reader with the `my-topic` topic, reads messages, and prints the content that arrive for 10 times: + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, + }); + + // Create a reader + const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), + }); + + // read messages + for (let i = 0; i < 10; i += 1) { + const msg = await reader.readNext(); + console.log(msg.getData().toString()); + } + + await reader.close(); + await client.close(); +})(); + +``` + +## Messages + +In Pulsar Node.js client, you have to construct producer message object for producer. + +Here is an example message: + +```JavaScript + +const msg = { + data: Buffer.from('Hello, Pulsar'), + partitionKey: 'key1', + properties: { + 'foo': 'bar', + }, + eventTimestamp: Date.now(), + replicationClusters: [ + 'cluster1', + 'cluster2', + ], +} + +await producer.send(msg); + +``` + +The following keys are available for producer message objects: + +| Parameter | Description | +| :-------- | :---------- | +| `data` | The actual data payload of the message. | +| `properties` | A Object for any application-specific metadata attached to the message. | +| `eventTimestamp` | The timestamp associated with the message. | +| `sequenceId` | The sequence ID of the message. | +| `partitionKey` | The optional key associated with the message (particularly useful for things like topic compaction). | +| `replicationClusters` | The clusters to which this message is replicated. 
Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. | +| `deliverAt` | The absolute timestamp at or after which the message is delivered. | +| `deliverAfter` | The relative delay after which the message is delivered. | + +### Message object operations + +In the Pulsar Node.js client, you can receive (or read) a message object as a consumer (or reader). + +The message object has the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `getTopicName()` | Getter method of topic name. | `String` | +| `getProperties()` | Getter method of properties. | `Array` | +| `getData()` | Getter method of message data. | `Buffer` | +| `getMessageId()` | Getter method of [message id object](#message-id-object-operations). | `Object` | +| `getPublishTimestamp()` | Getter method of publish timestamp. | `Number` | +| `getEventTimestamp()` | Getter method of event timestamp. | `Number` | +| `getRedeliveryCount()` | Getter method of redelivery count. | `Number` | +| `getPartitionKey()` | Getter method of partition key. | `String` | + +### Message ID object operations + +In the Pulsar Node.js client, you can get a message ID object from a message object. + +The message ID object has the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `serialize()` | Serialize the message id into a Buffer for storing. | `Buffer` | +| `toString()` | Get message id as String. | `String` | + +The message ID object also has static methods. You can access them as `Pulsar.MessageId.someStaticMethod`. + +The following static methods are available for the message ID object: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `earliest()` | MessageId representing the earliest, or oldest available message stored in the topic.
| `Object` | +| `latest()` | MessageId representing the latest, or last published message in the topic. | `Object` | +| `deserialize(Buffer)` | Deserialize a message id object from a Buffer. | `Object` | + +## End-to-end encryption + +[End-to-end encryption](https://pulsar.apache.org/docs/en/next/cookbooks-encryption/#docsNav) allows applications to encrypt messages at producers and decrypt at consumers. + +### Configuration + +If you want to use the end-to-end encryption feature in the Node.js client, you need to configure `publicKeyPath` and `privateKeyPath` for both producer and consumer. + +``` + +publicKeyPath: "./public.pem" +privateKeyPath: "./private.pem" + +``` + +### Tutorial + +This section provides step-by-step instructions on how to use the end-to-end encryption feature in the Node.js client. + +**Prerequisite** + +- Pulsar C++ client 2.7.1 or later + +**Step** + +1. Create both public and private key pairs. + + **Input** + + ```shell + + openssl genrsa -out private.pem 2048 + openssl rsa -in private.pem -pubout -out public.pem + + ``` + +2. Create a producer to send encrypted messages. + + **Input** + + ```nodejs + + const Pulsar = require('pulsar-client'); + + (async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, + }); + + // Create a producer + const producer = await client.createProducer({ + topic: 'persistent://public/default/my-topic', + sendTimeoutMs: 30000, + batchingEnabled: true, + publicKeyPath: "./public.pem", + privateKeyPath: "./private.pem", + encryptionKey: "encryption-key" + }); + + console.log(producer.ProducerConfig) + // Send messages + for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); + } + await producer.flush(); + + await producer.close(); + await client.close(); + })(); + + ``` + +3. Create a consumer to receive encrypted messages. 
+ + **Input** + + ```nodejs + + const Pulsar = require('pulsar-client'); + + (async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://172.25.0.3:6650', + operationTimeoutSeconds: 30 + }); + + // Create a consumer + const consumer = await client.subscribe({ + topic: 'persistent://public/default/my-topic', + subscription: 'sub1', + subscriptionType: 'Shared', + ackTimeoutMs: 10000, + publicKeyPath: "./public.pem", + privateKeyPath: "./private.pem" + }); + + console.log(consumer) + // Receive messages + for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); + } + + await consumer.close(); + await client.close(); + })(); + + ``` + +4. Run the consumer to receive encrypted messages. + + **Input** + + ```shell + + node consumer.js + + ``` + +5. In a new terminal tab, run the producer to produce encrypted messages. + + **Input** + + ```shell + + node producer.js + + ``` + + Now you can see the producer sends messages and the consumer receives messages successfully. + + **Output** + + This is from the producer side. + + ``` + + Sent message: my-message-0 + Sent message: my-message-1 + Sent message: my-message-2 + Sent message: my-message-3 + Sent message: my-message-4 + Sent message: my-message-5 + Sent message: my-message-6 + Sent message: my-message-7 + Sent message: my-message-8 + Sent message: my-message-9 + + ``` + + This is from the consumer side. 
+ + ``` + + my-message-0 + my-message-1 + my-message-2 + my-message-3 + my-message-4 + my-message-5 + my-message-6 + my-message-7 + my-message-8 + my-message-9 + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-python.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-python.md new file mode 100644 index 0000000000000..90cc840daa0a8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-python.md @@ -0,0 +1,481 @@ +--- +id: client-libraries-python +title: Pulsar Python client +sidebar_label: "Python" +original_id: client-libraries-python +--- + +Pulsar Python client library is a wrapper over the existing [C++ client library](client-libraries-cpp.md) and exposes all of the [same features](/api/cpp). You can find the code in the [Python directory](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/python) of the C++ client code. + +All the methods in producer, consumer, and reader of a Python client are thread-safe. + +[pdoc](https://github.com/BurntSushi/pdoc)-generated API docs for the Python client are available [here](/api/python). + +## Install + +You can install the [`pulsar-client`](https://pypi.python.org/pypi/pulsar-client) library either via [PyPi](https://pypi.python.org/pypi), using [pip](#installation-using-pip), or by building the library from [source](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp). + +### Install using pip + +To install the `pulsar-client` library as a pre-built package using the [pip](https://pip.pypa.io/en/stable/) package manager: + +```shell + +$ pip install pulsar-client==@pulsar:version_number@ + +``` + +### Optional dependencies +If you install the client libraries on Linux to support services like Pulsar functions or Avro serialization, you can install optional components alongside the `pulsar-client` library. 
+ +```shell + +# avro serialization +$ pip install pulsar-client=='@pulsar:version_number@[avro]' + +# functions runtime +$ pip install pulsar-client=='@pulsar:version_number@[functions]' + +# all optional components +$ pip install pulsar-client=='@pulsar:version_number@[all]' + +``` + +Installation via PyPi is available for the following Python versions: + +Platform | Supported Python versions +:--------|:------------------------- +MacOS
    10.13 (High Sierra), 10.14 (Mojave)
    | 2.7, 3.7 +Linux | 2.7, 3.4, 3.5, 3.6, 3.7, 3.8 + +### Install from source + +To install the `pulsar-client` library by building from source, follow [instructions](client-libraries-cpp.md#compilation) and compile the Pulsar C++ client library. That builds the Python binding for the library. + +To install the built Python bindings: + +```shell + +$ git clone https://github.com/apache/pulsar +$ cd pulsar/pulsar-client-cpp/python +$ sudo python setup.py install + +``` + +## API Reference + +The complete Python API reference is available at [api/python](/api/python). + +## Examples + +You can find a variety of Python code examples for the [pulsar-client](/pulsar-client-cpp/python) library. + +### Producer example + +The following example creates a Python producer for the `my-topic` topic and sends 10 messages on that topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') + +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('Hello-%d' % i).encode('utf-8')) + +client.close() + +``` + +### Consumer example + +The following example creates a consumer with the `my-subscription` subscription name on the `my-topic` topic, receives incoming messages, prints the content and ID of messages that arrive, and acknowledges each message to the Pulsar broker. + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') + +consumer = client.subscribe('my-topic', 'my-subscription') + +while True: + msg = consumer.receive() + try: + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) + +client.close() + +``` + +This example shows how to configure negative acknowledgement. 
+ +```python + +from pulsar import Client, schema +client = Client('pulsar://localhost:6650') +consumer = client.subscribe('negative_acks','test',schema=schema.StringSchema()) +producer = client.create_producer('negative_acks',schema=schema.StringSchema()) +for i in range(10): + print('send msg "hello-%d"' % i) + producer.send_async('hello-%d' % i, callback=None) +producer.flush() +for i in range(10): + msg = consumer.receive() + consumer.negative_acknowledge(msg) + print('receive and nack msg "%s"' % msg.data()) +for i in range(10): + msg = consumer.receive() + consumer.acknowledge(msg) + print('receive and ack msg "%s"' % msg.data()) +try: + # No more messages expected + msg = consumer.receive(100) +except: + print("no more msg") + pass + +``` + +### Reader interface example + +You can use the Pulsar Python API to use the Pulsar [reader interface](concepts-clients.md#reader-interface). Here's an example: + +```python + +# MessageId taken from a previously fetched message +msg_id = msg.message_id() + +reader = client.create_reader('my-topic', msg_id) + +while True: + msg = reader.read_next() + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # No acknowledgment + +``` + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously. To use multi-topic subscriptions, you can supply a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. 
+ +The following is an example: + +```python + +import re +consumer = client.subscribe(re.compile('persistent://public/default/topic-*'), 'my-subscription') +while True: + msg = consumer.receive() + try: + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) +client.close() + +``` + +## Schema + +### Declare and validate schema + +You can declare a schema by passing a class that inherits +from `pulsar.schema.Record` and defines the fields as +class variables. For example: + +```python + +from pulsar.schema import * + +class Example(Record): + a = String() + b = Integer() + c = Boolean() + +``` + +With this simple schema definition, you can create producers, consumers and readers instances that refer to that. + +```python + +producer = client.create_producer( + topic='my-topic', + schema=AvroSchema(Example) ) + +producer.send(Example(a='Hello', b=1)) + +``` + +After creating the producer, the Pulsar broker validates that the existing topic schema is indeed of "Avro" type and that the format is compatible with the schema definition of the `Example` class. + +If there is a mismatch, an exception occurs in the producer creation. + +Once a producer is created with a certain schema definition, +it will only accept objects that are instances of the declared +schema class. 
+ +Similarly, for a consumer/reader, the consumer will return an +object, instance of the schema record class, rather than the raw +bytes: + +```python + +consumer = client.subscribe( + topic='my-topic', + subscription_name='my-subscription', + schema=AvroSchema(Example) ) + +while True: + msg = consumer.receive() + ex = msg.value() + try: + print("Received message a={} b={} c={}".format(ex.a, ex.b, ex.c)) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) + +``` + +### Supported schema types + +You can use different builtin schema types in Pulsar. All the definitions are in the `pulsar.schema` package. + +| Schema | Notes | +| ------ | ----- | +| `BytesSchema` | Get the raw payload as a `bytes` object. No serialization/deserialization are performed. This is the default schema mode | +| `StringSchema` | Encode/decode payload as a UTF-8 string. Uses `str` objects | +| `JsonSchema` | Require record definition. Serializes the record into standard JSON payload | +| `AvroSchema` | Require record definition. Serializes in AVRO format | + +### Schema definition reference + +The schema definition is done through a class that inherits from `pulsar.schema.Record`. + +This class has a number of fields which can be of either +`pulsar.schema.Field` type or another nested `Record`. All the +fields are specified in the `pulsar.schema` package. The fields +are matching the AVRO fields types. + +| Field Type | Python Type | Notes | +| ---------- | ----------- | ----- | +| `Boolean` | `bool` | | +| `Integer` | `int` | | +| `Long` | `int` | | +| `Float` | `float` | | +| `Double` | `float` | | +| `Bytes` | `bytes` | | +| `String` | `str` | | +| `Array` | `list` | Need to specify record type for items. | +| `Map` | `dict` | Key is always `String`. Need to specify value type. | + +Additionally, any Python `Enum` type can be used as a valid field type. 
+ +#### Fields parameters + +When adding a field, you can use these parameters in the constructor. + +| Argument | Default | Notes | +| ---------- | --------| ----- | +| `default` | `None` | Set a default value for the field. Eg: `a = Integer(default=5)` | +| `required` | `False` | Mark the field as "required". It is set in the schema accordingly. | + +#### Schema definition examples + +##### Simple definition + +```python + +class Example(Record): + a = String() + b = Integer() + c = Array(String()) + i = Map(String()) + +``` + +##### Using enums + +```python + +from enum import Enum + +class Color(Enum): + red = 1 + green = 2 + blue = 3 + +class Example(Record): + name = String() + color = Color + +``` + +##### Complex types + +```python + +class MySubRecord(Record): + x = Integer() + y = Long() + z = String() + +class Example(Record): + a = String() + sub = MySubRecord() + +``` + +##### Set namespace for Avro schema + +Set the namespace for Avro Record schema using the special field `_avro_namespace`. + +```python + +class NamespaceDemo(Record): + _avro_namespace = 'xxx.xxx.xxx' + x = String() + y = Integer() + +``` + +The schema definition is like this. + +``` + +{ + 'name': 'NamespaceDemo', 'namespace': 'xxx.xxx.xxx', 'type': 'record', 'fields': [ + {'name': 'x', 'type': ['null', 'string']}, + {'name': 'y', 'type': ['null', 'int']} + ] +} + +``` + +## End-to-end encryption + +[End-to-end encryption](https://pulsar.apache.org/docs/en/next/cookbooks-encryption/#docsNav) allows applications to encrypt messages at producers and decrypt messages at consumers. + +### Configuration + +To use the end-to-end encryption feature in the Python client, you need to configure `publicKeyPath` and `privateKeyPath` for both producer and consumer. + +``` + +publicKeyPath: "./public.pem" +privateKeyPath: "./private.pem" + +``` + +### Tutorial + +This section provides step-by-step instructions on how to use the end-to-end encryption feature in the Python client. 
+ +**Prerequisite** + +- Pulsar Python client 2.7.1 or later + +**Step** + +1. Create both public and private key pairs. + + **Input** + + ```shell + + openssl genrsa -out private.pem 2048 + openssl rsa -in private.pem -pubout -out public.pem + + ``` + +2. Create a producer to send encrypted messages. + + **Input** + + ```python + + import pulsar + + publicKeyPath = "./public.pem" + privateKeyPath = "./private.pem" + crypto_key_reader = pulsar.CryptoKeyReader(publicKeyPath, privateKeyPath) + client = pulsar.Client('pulsar://localhost:6650') + producer = client.create_producer(topic='encryption', encryption_key='encryption', crypto_key_reader=crypto_key_reader) + producer.send('encryption message'.encode('utf8')) + print('sent message') + producer.close() + client.close() + + ``` + +3. Create a consumer to receive encrypted messages. + + **Input** + + ```python + + import pulsar + + publicKeyPath = "./public.pem" + privateKeyPath = "./private.pem" + crypto_key_reader = pulsar.CryptoKeyReader(publicKeyPath, privateKeyPath) + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe(topic='encryption', subscription_name='encryption-sub', crypto_key_reader=crypto_key_reader) + msg = consumer.receive() + print("Received msg '{}' id = '{}'".format(msg.data(), msg.message_id())) + consumer.close() + client.close() + + ``` + +4. Run the consumer to receive encrypted messages. + + **Input** + + ```shell + + python consumer.py + + ``` + +5. In a new terminal tab, run the producer to produce encrypted messages. + + **Input** + + ```shell + + python producer.py + + ``` + + Now you can see the producer sends messages and the consumer receives messages successfully. + + **Output** + + This is from the producer side. + + ``` + + sent message + + ``` + + This is from the consumer side. 
+ + ``` + + Received msg 'encryption message' id = '(0,0,-1,-1)' + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries-websocket.md b/site2/website/versioned_docs/version-2.9.x/client-libraries-websocket.md new file mode 100644 index 0000000000000..60970c7ea4df2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries-websocket.md @@ -0,0 +1,664 @@ +--- +id: client-libraries-websocket +title: Pulsar WebSocket API +sidebar_label: "WebSocket" +original_id: client-libraries-websocket +--- + +Pulsar [WebSocket](https://developer.mozilla.org/en-US/docs/Web/API/WebSockets_API) API provides a simple way to interact with Pulsar using languages that do not have an official [client library](getting-started-clients.md). Through WebSocket, you can publish and consume messages and use features available on the [Client Features Matrix](https://github.com/apache/pulsar/wiki/Client-Features-Matrix) page. + + +> You can use Pulsar WebSocket API with any WebSocket client library. See examples for Python and Node.js [below](#client-examples). + +## Running the WebSocket service + +The standalone variant of Pulsar that we recommend using for [local development](getting-started-standalone.md) already has the WebSocket service enabled. + +In non-standalone mode, there are two ways to deploy the WebSocket service: + +* [embedded](#embedded-with-a-pulsar-broker) with a Pulsar broker +* as a [separate component](#as-a-separate-component) + +### Embedded with a Pulsar broker + +In this mode, the WebSocket service will run within the same HTTP service that's already running in the broker. To enable this mode, set the [`webSocketServiceEnabled`](reference-configuration.md#broker-webSocketServiceEnabled) parameter in the [`conf/broker.conf`](reference-configuration.md#broker) configuration file in your installation. 
+ +```properties + +webSocketServiceEnabled=true + +``` + +### As a separate component + +In this mode, the WebSocket service will be run from a Pulsar [broker](reference-terminology.md#broker) as a separate service. Configuration for this mode is handled in the [`conf/websocket.conf`](reference-configuration.md#websocket) configuration file. You'll need to set *at least* the following parameters: + +* [`configurationStoreServers`](reference-configuration.md#websocket-configurationStoreServers) +* [`webServicePort`](reference-configuration.md#websocket-webServicePort) +* [`clusterName`](reference-configuration.md#websocket-clusterName) + +Here's an example: + +```properties + +configurationStoreServers=zk1:2181,zk2:2181,zk3:2181 +webServicePort=8080 +clusterName=my-cluster + +``` + +### Security settings + +To enable TLS encryption on WebSocket service: + +```properties + +tlsEnabled=true +tlsAllowInsecureConnection=false +tlsCertificateFilePath=/path/to/client-websocket.cert.pem +tlsKeyFilePath=/path/to/client-websocket.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +### Starting the broker + +When the configuration is set, you can start the service using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) tool: + +```shell + +$ bin/pulsar-daemon start websocket + +``` + +## API Reference + +Pulsar's WebSocket API offers three endpoints for [producing](#producer-endpoint) messages, [consuming](#consumer-endpoint) messages and [reading](#reader-endpoint) messages. + +All exchanges via the WebSocket API use JSON. + +### Authentication + +#### Browser javascript WebSocket client + +Use the query param `token` to transport the authentication token.
+ +```http + +ws://broker-service-url:8080/path?token=token + +``` + +### Producer endpoint + +The producer endpoint requires you to specify a tenant, namespace, and topic in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/producer/persistent/:tenant/:namespace/:topic + +``` + +##### Query param + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`sendTimeoutMillis` | long | no | Send timeout (default: 30 secs) +`batchingEnabled` | boolean | no | Enable batching of messages (default: false) +`batchingMaxMessages` | int | no | Maximum number of messages permitted in a batch (default: 1000) +`maxPendingMessages` | int | no | Set the max size of the internal-queue holding the messages (default: 1000) +`batchingMaxPublishDelay` | long | no | Time period within which the messages will be batched (default: 10ms) +`messageRoutingMode` | string | no | Message [routing mode](https://pulsar.apache.org/api/client/index.html?org/apache/pulsar/client/api/ProducerConfiguration.MessageRoutingMode.html) for the partitioned producer: `SinglePartition`, `RoundRobinPartition` +`compressionType` | string | no | Compression [type](https://pulsar.apache.org/api/client/index.html?org/apache/pulsar/client/api/CompressionType.html): `LZ4`, `ZLIB` +`producerName` | string | no | Specify the name for the producer. Pulsar will enforce only one producer with same name can be publishing on a topic +`initialSequenceId` | long | no | Set the baseline for the sequence ids for messages published by the producer. 
+`hashingScheme` | string | no | [Hashing function](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.HashingScheme.html) to use when publishing on a partitioned topic: `JavaStringHash`, `Murmur3_32Hash` +`token` | string | no | Authentication token, this is used for the browser javascript client + + +#### Publishing a message + +```json + +{ + "payload": "SGVsbG8gV29ybGQ=", + "properties": {"key1": "value1", "key2": "value2"}, + "context": "1" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`payload` | string | yes | Base-64 encoded payload +`properties` | key-value pairs | no | Application-defined properties +`context` | string | no | Application-defined request identifier +`key` | string | no | For partitioned topics, decides which partition to use +`replicationClusters` | array | no | Restrict replication to this list of [clusters](reference-terminology.md#cluster), specified by name + + +##### Example success response + +```json + +{ + "result": "ok", + "messageId": "CAAQAw==", + "context": "1" + } + +``` + +##### Example failure response + +```json + + { + "result": "send-error:3", + "errorMsg": "Failed to de-serialize from JSON", + "context": "1" + } + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`result` | string | yes | `ok` if successful or an error message if unsuccessful +`messageId` | string | yes | Message ID assigned to the published message +`context` | string | no | Application-defined request identifier + + +### Consumer endpoint + +The consumer endpoint requires you to specify a tenant, namespace, and topic, as well as a subscription, in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/consumer/persistent/:tenant/:namespace/:topic/:subscription + +``` + +##### Query param + +Key | Type | Required? 
| Explanation +:---|:-----|:----------|:----------- +`ackTimeoutMillis` | long | no | Set the timeout for unacked messages (default: 0) +`subscriptionType` | string | no | [Subscription type](https://pulsar.apache.org/api/client/index.html?org/apache/pulsar/client/api/SubscriptionType.html): `Exclusive`, `Failover`, `Shared`, `Key_Shared` +`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000) +`consumerName` | string | no | Consumer name +`priorityLevel` | int | no | Define a [priority](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setPriorityLevel-int-) for the consumer +`maxRedeliverCount` | int | no | Define a [maxRedeliverCount](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#deadLetterPolicy-org.apache.pulsar.client.api.DeadLetterPolicy-) for the consumer (default: 0). Activates [Dead Letter Topic](https://github.com/apache/pulsar/wiki/PIP-22%3A-Pulsar-Dead-Letter-Topic) feature. +`deadLetterTopic` | string | no | Define a [deadLetterTopic](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#deadLetterPolicy-org.apache.pulsar.client.api.DeadLetterPolicy-) for the consumer (default: {topic}-{subscription}-DLQ). Activates [Dead Letter Topic](https://github.com/apache/pulsar/wiki/PIP-22%3A-Pulsar-Dead-Letter-Topic) feature. +`pullMode` | boolean | no | Enable pull mode (default: false). See "Flow Control" below. +`negativeAckRedeliveryDelay` | int | no | When a message is negatively acknowledged, the delay time before the message is redelivered (in milliseconds). The default value is 60000. +`token` | string | no | Authentication token, this is used for the browser javascript client + +NB: these parameter (except `pullMode`) apply to the internal consumer of the WebSocket service. 
+So messages will be subject to the redelivery settings as soon as they get into the receive queue, +even if the client doesn't consume on the WebSocket. + +##### Receiving messages + +Server will push messages on the WebSocket session: + +```json + +{ +"messageId": "CAMQADAA", + "payload": "hvXcJvHW7kOSrUn17P2q71RA5SdiXwZBqw==", + "properties": {}, + "publishTime": "2021-10-29T16:01:38.967-07:00", + "redeliveryCount": 0, + "encryptionContext": { + "keys": { + "client-rsa.pem": { + "keyValue": "jEuwS+PeUzmCo7IfLNxqoj4h7txbLjCQjkwpaw5AWJfZ2xoIdMkOuWDkOsqgFmWwxiecakS6GOZHs94x3sxzKHQx9Oe1jpwBg2e7L4fd26pp+WmAiLm/ArZJo6JotTeFSvKO3u/yQtGTZojDDQxiqFOQ1ZbMdtMZA8DpSMuq+Zx7PqLo43UdW1+krjQfE5WD+y+qE3LJQfwyVDnXxoRtqWLpVsAROlN2LxaMbaftv5HckoejJoB4xpf/dPOUqhnRstwQHf6klKT5iNhjsY4usACt78uILT0pEPd14h8wEBidBz/vAlC/zVMEqiDVzgNS7dqEYS4iHbf7cnWVCn3Hxw==", + "metadata": {} + } + }, + "param": "Tfu1PxVm6S9D3+Hk", + "compressionType": "NONE", + "uncompressedMessageSize": 0, + "batchSize": { + "empty": false, + "present": true + } + } +} + +``` + +Below are the parameters in the WebSocket consumer response. + +- General parameters + + Key | Type | Required? 
| Explanation + :---|:-----|:----------|:----------- + `messageId` | string | yes | Message ID + `payload` | string | yes | Base-64 encoded payload + `publishTime` | string | yes | Publish timestamp + `redeliveryCount` | number | yes | Number of times this message was already delivered + `properties` | key-value pairs | no | Application-defined properties + `key` | string | no | Original routing key set by producer + `encryptionContext` | EncryptionContext | no | Encryption context that consumers can use to decrypt received messages + `param` | string | no | Initialization vector for cipher (Base64 encoding) + `batchSize` | string | no | Number of entries in a message (if it is a batch message) + `uncompressedMessageSize` | string | no | Message size before compression + `compressionType` | string | no | Algorithm used to compress the message payload + +- `encryptionContext` related parameter + + Key | Type | Required? | Explanation + :---|:-----|:----------|:----------- + `keys` |key-EncryptionKey pairs | yes | Key in `key-EncryptionKey` pairs is an encryption key name. Value in `key-EncryptionKey` pairs is an encryption key object. + +- `encryptionKey` related parameters + + Key | Type | Required? | Explanation + :---|:-----|:----------|:----------- + `keyValue` | string | yes | Encryption key (Base64 encoding) + `metadata` | key-value pairs | no | Application-defined metadata + +#### Acknowledging the message + +Consumer needs to acknowledge the successful processing of the message to +have the Pulsar broker delete it. + +```json + +{ + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Negatively acknowledging messages + +```json + +{ + "type": "negativeAcknowledge", + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? 
| Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Flow control + +##### Push Mode + +By default (`pullMode=false`), the consumer endpoint will use the `receiverQueueSize` parameter both to size its +internal receive queue and to limit the number of unacknowledged messages that are passed to the WebSocket client. +In this mode, if you don't send acknowledgements, the Pulsar WebSocket service will stop sending messages after reaching +`receiverQueueSize` unacked messages sent to the WebSocket client. + +##### Pull Mode + +If you set `pullMode` to `true`, the WebSocket client will need to send `permit` commands to permit the +Pulsar WebSocket service to send more messages. + +```json + +{ + "type": "permit", + "permitMessages": 100 +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. Must be `permit` +`permitMessages`| int | yes | Number of messages to permit + +NB: in this mode it's possible to acknowledge messages in a different connection. + +#### Check if reach end of topic + +Consumer can check if it has reached end of topic by sending `isEndOfTopic` request. + +**Request** + +```json + +{ + "type": "isEndOfTopic" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. Must be `isEndOfTopic` + +**Response** + +```json + +{ + "endOfTopic": "true/false" + } + +``` + +### Reader endpoint + +The reader endpoint requires you to specify a tenant, namespace, and topic in the URL: + +```http + +ws://broker-service-url:8080/ws/v2/reader/persistent/:tenant/:namespace/:topic + +``` + +##### Query param + +Key | Type | Required? 
| Explanation +:---|:-----|:----------|:----------- +`readerName` | string | no | Reader name +`receiverQueueSize` | int | no | Size of the consumer receive queue (default: 1000) +`messageId` | int or enum | no | Message ID to start from, `earliest` or `latest` (default: `latest`) +`token` | string | no | Authentication token, this is used for the browser javascript client + +##### Receiving messages + +Server will push messages on the WebSocket session: + +```json + +{ + "messageId": "CAAQAw==", + "payload": "SGVsbG8gV29ybGQ=", + "properties": {"key1": "value1", "key2": "value2"}, + "publishTime": "2016-08-30 16:45:57.785", + "redeliveryCount": 4 +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId` | string | yes | Message ID +`payload` | string | yes | Base-64 encoded payload +`publishTime` | string | yes | Publish timestamp +`redeliveryCount` | number | yes | Number of times this message was already delivered +`properties` | key-value pairs | no | Application-defined properties +`key` | string | no | Original routing key set by producer + +#### Acknowledging the message + +**In WebSocket**, Reader needs to acknowledge the successful processing of the message to +have the Pulsar WebSocket service update the number of pending messages. +If you don't send acknowledgements, Pulsar WebSocket service will stop sending messages after reaching the pendingMessages limit. + +```json + +{ + "messageId": "CAAQAw==" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`messageId`| string | yes | Message ID of the processed message + +#### Check if reach end of topic + +Consumer can check if it has reached end of topic by sending `isEndOfTopic` request. + +**Request** + +```json + +{ + "type": "isEndOfTopic" +} + +``` + +Key | Type | Required? | Explanation +:---|:-----|:----------|:----------- +`type`| string | yes | Type of command. 
Must be `isEndOfTopic` + +**Response** + +```json + +{ + "endOfTopic": "true/false" + } + +``` + +### Error codes + +In case of error the server will close the WebSocket session using the +following error codes: + +Error Code | Error Message +:----------|:------------- +1 | Failed to create producer +2 | Failed to subscribe +3 | Failed to deserialize from JSON +4 | Failed to serialize to JSON +5 | Failed to authenticate client +6 | Client is not authorized +7 | Invalid payload encoding +8 | Unknown error + +> The application is responsible for re-establishing a new WebSocket session after a backoff period. + +## Client examples + +Below you'll find code examples for the Pulsar WebSocket API in [Python](#python) and [Node.js](#nodejs). + +### Python + +This example uses the [`websocket-client`](https://pypi.python.org/pypi/websocket-client) package. You can install it using [pip](https://pypi.python.org/pypi/pip): + +```shell + +$ pip install websocket-client + +``` + +You can also download it from [PyPI](https://pypi.python.org/pypi/websocket-client). + +#### Python producer + +Here's an example Python producer that sends a simple message to a Pulsar [topic](reference-terminology.md#topic): + +```python + +import websocket, base64, json + +# If you set enable_TLS to True, you have to set tlsEnabled to true in conf/websocket.conf. 
+enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/producer/persistent/public/default/my-topic' + +ws = websocket.create_connection(TOPIC) + +# encode message +s = "Hello World" +firstEncoded = s.encode("UTF-8") +binaryEncoded = base64.b64encode(firstEncoded) +payloadString = binaryEncoded.decode('UTF-8') + +# Send one message as JSON +ws.send(json.dumps({ + 'payload' : payloadString, + 'properties': { + 'key1' : 'value1', + 'key2' : 'value2' + }, + 'context' : 5 +})) + +response = json.loads(ws.recv()) +if response['result'] == 'ok': + print( 'Message published successfully') +else: + print('Failed to publish message:', response) +ws.close() + +``` + +#### Python consumer + +Here's an example Python consumer that listens on a Pulsar topic and prints the message ID whenever a message arrives: + +```python + +import websocket, base64, json + +# If you set enable_TLS to True, you have to set tlsEnabled to true in conf/websocket.conf. +enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/consumer/persistent/public/default/my-topic/my-sub' + +ws = websocket.create_connection(TOPIC) + +while True: + msg = json.loads(ws.recv()) + if not msg: break + + print( "Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload']))) + + # Acknowledge successful processing + ws.send(json.dumps({'messageId' : msg['messageId']})) + +ws.close() + +``` + +#### Python reader + +Here's an example Python reader that listens on a Pulsar topic and prints the message ID whenever a message arrives: + +```python + +import websocket, base64, json + +# If you set enable_TLS to True, you have to set tlsEnabled to true in conf/websocket.conf. 
+enable_TLS = False +scheme = 'ws' +if enable_TLS: + scheme = 'wss' + +TOPIC = scheme + '://localhost:8080/ws/v2/reader/persistent/public/default/my-topic' +ws = websocket.create_connection(TOPIC) + +while True: + msg = json.loads(ws.recv()) + if not msg: break + + print ( "Received: {} - payload: {}".format(msg, base64.b64decode(msg['payload']))) + + # Acknowledge successful processing + ws.send(json.dumps({'messageId' : msg['messageId']})) + +ws.close() + +``` + +### Node.js + +This example uses the [`ws`](https://websockets.github.io/ws/) package. You can install it using [npm](https://www.npmjs.com/): + +```shell + +$ npm install ws + +``` + +#### Node.js producer + +Here's an example Node.js producer that sends a simple message to a Pulsar topic: + +```javascript + +const WebSocket = require('ws'); + +// If you set enableTLS to true, you have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 'wss' : 'ws'}://localhost:8080/ws/v2/producer/persistent/public/default/my-topic`; +const ws = new WebSocket(topic); + +var message = { + "payload" : new Buffer("Hello World").toString('base64'), + "properties": { + "key1" : "value1", + "key2" : "value2" + }, + "context" : "1" +}; + +ws.on('open', function() { + // Send one message + ws.send(JSON.stringify(message)); +}); + +ws.on('message', function(message) { + console.log('received ack: %s', message); +}); + +``` + +#### Node.js consumer + +Here's an example Node.js consumer that listens on the same topic used by the producer above: + +```javascript + +const WebSocket = require('ws'); + +// If you set enableTLS to true, you have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 
'wss' : 'ws'}://localhost:8080/ws/v2/consumer/persistent/public/default/my-topic/my-sub`; +const ws = new WebSocket(topic); + +ws.on('message', function(message) { + var receiveMsg = JSON.parse(message); + console.log('Received: %s - payload: %s', message, new Buffer(receiveMsg.payload, 'base64').toString()); + var ackMsg = {"messageId" : receiveMsg.messageId}; + ws.send(JSON.stringify(ackMsg)); +}); + +``` + +#### Node.js reader + +```javascript + +const WebSocket = require('ws'); + +// If you set enableTLS to true, you have to set tlsEnabled to true in conf/websocket.conf. +const enableTLS = false; +const topic = `${enableTLS ? 'wss' : 'ws'}://localhost:8080/ws/v2/reader/persistent/public/default/my-topic`; +const ws = new WebSocket(topic); + +ws.on('message', function(message) { + var receiveMsg = JSON.parse(message); + console.log('Received: %s - payload: %s', message, new Buffer(receiveMsg.payload, 'base64').toString()); + var ackMsg = {"messageId" : receiveMsg.messageId}; + ws.send(JSON.stringify(ackMsg)); +}); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/client-libraries.md b/site2/website/versioned_docs/version-2.9.x/client-libraries.md new file mode 100644 index 0000000000000..607c9317e4b7f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/client-libraries.md @@ -0,0 +1,36 @@ +--- +id: client-libraries +title: Pulsar client libraries +sidebar_label: "Overview" +original_id: client-libraries +--- + +Pulsar supports the following client libraries: + +- [Java client](client-libraries-java.md) +- [Go client](client-libraries-go.md) +- [Python client](client-libraries-python.md) +- [C++ client](client-libraries-cpp.md) +- [Node.js client](client-libraries-node.md) +- [WebSocket client](client-libraries-websocket.md) +- [C# client](client-libraries-dotnet.md) + +## Feature matrix +Pulsar client feature matrix for different languages is listed on [Client Features Matrix](https://github.com/apache/pulsar/wiki/Client-Features-Matrix) 
page. + +## Third-party clients + +Besides the officially released clients, multiple projects that develop Pulsar clients are available in different languages. + +> If you have developed a new Pulsar client, feel free to submit a pull request and add your client to the list below. + +| Language | Project | Maintainer | License | Description | +|----------|---------|------------|---------|-------------| +| Go | [pulsar-client-go](https://github.com/Comcast/pulsar-client-go) | [Comcast](https://github.com/Comcast) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | A native golang client | +| Go | [go-pulsar](https://github.com/t2y/go-pulsar) | [t2y](https://github.com/t2y) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | | +| Haskell | [supernova](https://github.com/cr-org/supernova) | [Chatroulette](https://github.com/cr-org) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Native Pulsar client for Haskell | +| Scala | [neutron](https://github.com/cr-org/neutron) | [Chatroulette](https://github.com/cr-org) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Purely functional Apache Pulsar client for Scala built on top of Fs2 | +| Scala | [pulsar4s](https://github.com/sksamuel/pulsar4s) | [sksamuel](https://github.com/sksamuel) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Idiomatic, typesafe, and reactive Scala client for Apache Pulsar | +| Rust | [pulsar-rs](https://github.com/wyyerd/pulsar-rs) | [Wyyerd Group](https://github.com/wyyerd) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Future-based Rust bindings for Apache Pulsar | +| .NET | 
[pulsar-client-dotnet](https://github.com/fsharplang-ru/pulsar-client-dotnet) | [Lanayx](https://github.com/Lanayx) | [![GitHub](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT) | Native .NET client for C#/F#/VB | +| Node.js | [pulsar-flex](https://github.com/ayeo-flex-org/pulsar-flex) | [Daniel Sinai](https://github.com/danielsinai), [Ron Farkash](https://github.com/ronfarkash), [Gal Rosenberg](https://github.com/galrose)| [![GitHub](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT) | Native Node.js client | diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-architecture-overview.md b/site2/website/versioned_docs/version-2.9.x/concepts-architecture-overview.md new file mode 100644 index 0000000000000..4baa8c30a0d00 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-architecture-overview.md @@ -0,0 +1,172 @@ +--- +id: concepts-architecture-overview +title: Architecture Overview +sidebar_label: "Architecture" +original_id: concepts-architecture-overview +--- + +At the highest level, a Pulsar instance is composed of one or more Pulsar clusters. Clusters within an instance can [replicate](concepts-replication.md) data amongst themselves. + +In a Pulsar cluster: + +* One or more brokers handle and load balance incoming messages from producers, dispatch messages to consumers, communicate with the Pulsar configuration store to handle various coordination tasks, store messages in BookKeeper instances (aka bookies), rely on a cluster-specific ZooKeeper cluster for certain tasks, and more. +* A BookKeeper cluster consisting of one or more bookies handles [persistent storage](#persistent-storage) of messages. +* A ZooKeeper cluster specific to that cluster handles coordination tasks between Pulsar clusters. 
+ +The diagram below provides an illustration of a Pulsar cluster: + +![Pulsar architecture diagram](/assets/pulsar-system-architecture.png) + +At the broader instance level, an instance-wide ZooKeeper cluster called the configuration store handles coordination tasks involving multiple clusters, for example [geo-replication](concepts-replication.md). + +## Brokers + +The Pulsar message broker is a stateless component that's primarily responsible for running two other components: + +* An HTTP server that exposes a {@inject: rest:REST:/} API for both administrative tasks and [topic lookup](concepts-clients.md#client-setup-phase) for producers and consumers. The producers connect to the brokers to publish messages and the consumers connect to the brokers to consume the messages. +* A dispatcher, which is an asynchronous TCP server over a custom [binary protocol](developing-binary-protocol.md) used for all data transfers + +Messages are typically dispatched out of a [managed ledger](#managed-ledgers) cache for the sake of performance, *unless* the backlog exceeds the cache size. If the backlog grows too large for the cache, the broker will start reading entries from BookKeeper. + +Finally, to support geo-replication on global topics, the broker manages replicators that tail the entries published in the local region and republish them to the remote region using the Pulsar [Java client library](client-libraries-java.md). + +> For a guide to managing Pulsar brokers, see the [brokers](admin-api-brokers.md) guide. + +## Clusters + +A Pulsar instance consists of one or more Pulsar *clusters*. Clusters, in turn, consist of: + +* One or more Pulsar [brokers](#brokers) +* A ZooKeeper quorum used for cluster-level configuration and coordination +* An ensemble of bookies used for [persistent storage](#persistent-storage) of messages + +Clusters can replicate amongst themselves using [geo-replication](concepts-replication.md). 
+ +> For a guide to managing Pulsar clusters, see the [clusters](admin-api-clusters.md) guide. + +## Metadata store + +The Pulsar metadata store maintains all the metadata of a Pulsar cluster, such as topic metadata, schema, broker load data, and so on. Pulsar uses [Apache ZooKeeper](https://zookeeper.apache.org/) for metadata storage, cluster configuration, and coordination. The Pulsar metadata store can be deployed on a separate ZooKeeper cluster or deployed on an existing ZooKeeper cluster. You can use one ZooKeeper cluster for both Pulsar metadata store and BookKeeper metadata store. If you want to deploy Pulsar brokers connected to an existing BookKeeper cluster, you need to deploy separate ZooKeeper clusters for Pulsar metadata store and BookKeeper metadata store respectively. + +In a Pulsar instance: + +* A configuration store quorum stores configuration for tenants, namespaces, and other entities that need to be globally consistent. +* Each cluster has its own local ZooKeeper ensemble that stores cluster-specific configuration and coordination such as which brokers are responsible for which topics as well as ownership metadata, broker load reports, BookKeeper ledger metadata, and more. + +## Configuration store + +The configuration store maintains all the configurations of a Pulsar instance, such as clusters, tenants, namespaces, partitioned topic related configurations, and so on. A Pulsar instance can have a single local cluster, multiple local clusters, or multiple cross-region clusters. Consequently, the configuration store can share the configurations across multiple clusters under a Pulsar instance. The configuration store can be deployed on a separate ZooKeeper cluster or deployed on an existing ZooKeeper cluster. + +## Persistent storage + +Pulsar provides guaranteed message delivery for applications. If a message successfully reaches a Pulsar broker, it will be delivered to its intended target. 
+ +This guarantee requires that non-acknowledged messages are stored in a durable manner until they can be delivered to and acknowledged by consumers. This mode of messaging is commonly called *persistent messaging*. In Pulsar, N copies of all messages are stored and synced on disk, for example 4 copies across two servers with mirrored [RAID](https://en.wikipedia.org/wiki/RAID) volumes on each server. + +### Apache BookKeeper + +Pulsar uses a system called [Apache BookKeeper](http://bookkeeper.apache.org/) for persistent message storage. BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that provides a number of crucial advantages for Pulsar: + +* It enables Pulsar to utilize many independent logs, called [ledgers](#ledgers). Multiple ledgers can be created for topics over time. +* It offers very efficient storage for sequential data that handles entry replication. +* It guarantees read consistency of ledgers in the presence of various system failures. +* It offers even distribution of I/O across bookies. +* It's horizontally scalable in both capacity and throughput. Capacity can be immediately increased by adding more bookies to a cluster. +* Bookies are designed to handle thousands of ledgers with concurrent reads and writes. By using multiple disk devices---one for journal and another for general storage--bookies are able to isolate the effects of read operations from the latency of ongoing write operations. + +In addition to message data, *cursors* are also persistently stored in BookKeeper. Cursors are [subscription](reference-terminology.md#subscription) positions for [consumers](reference-terminology.md#consumer). BookKeeper enables Pulsar to store consumer position in a scalable fashion. + +At the moment, Pulsar supports persistent message storage. This accounts for the `persistent` in all topic names. 
Here's an example: + +```http + +persistent://my-tenant/my-namespace/my-topic + +``` + +> Pulsar also supports ephemeral ([non-persistent](concepts-messaging.md#non-persistent-topics)) message storage. + + +You can see an illustration of how brokers and bookies interact in the diagram below: + +![Brokers and bookies](/assets/broker-bookie.png) + + +### Ledgers + +A ledger is an append-only data structure with a single writer that is assigned to multiple BookKeeper storage nodes, or bookies. Ledger entries are replicated to multiple bookies. Ledgers themselves have very simple semantics: + +* A Pulsar broker can create a ledger, append entries to the ledger, and close the ledger. +* After the ledger has been closed---either explicitly or because the writer process crashed---it can then be opened only in read-only mode. +* Finally, when entries in the ledger are no longer needed, the whole ledger can be deleted from the system (across all bookies). + +#### Ledger read consistency + +The main strength of BookKeeper is that it guarantees read consistency in ledgers in the presence of failures. Since the ledger can only be written to by a single process, that process is free to append entries very efficiently, without needing to obtain consensus. After a failure, the ledger will go through a recovery process that will finalize the state of the ledger and establish which entry was last committed to the log. After that point, all readers of the ledger are guaranteed to see the exact same content. + +#### Managed ledgers + +Given that BookKeeper ledgers provide a single log abstraction, a library was developed on top of the ledger called the *managed ledger* that represents the storage layer for a single topic. A managed ledger represents the abstraction of a stream of messages with a single writer that keeps appending at the end of the stream and multiple cursors that are consuming the stream, each with its own associated position. 
+ +Internally, a single managed ledger uses multiple BookKeeper ledgers to store the data. There are two reasons to have multiple ledgers: + +1. After a failure, a ledger is no longer writable and a new one needs to be created. +2. A ledger can be deleted when all cursors have consumed the messages it contains. This allows for periodic rollover of ledgers. + +### Journal storage + +In BookKeeper, *journal* files contain BookKeeper transaction logs. Before making an update to a [ledger](#ledgers), a bookie needs to ensure that a transaction describing the update is written to persistent (non-volatile) storage. A new journal file is created once the bookie starts or the older journal file reaches the journal file size threshold (configured using the [`journalMaxSizeMB`](reference-configuration.md#bookkeeper-journalMaxSizeMB) parameter). + +## Pulsar proxy + +One way for Pulsar clients to interact with a Pulsar [cluster](#clusters) is by connecting to Pulsar message [brokers](#brokers) directly. In some cases, however, this kind of direct connection is either infeasible or undesirable because the client doesn't have direct access to broker addresses. If you're running Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform, for example, then direct client connections to brokers are likely not possible. + +The **Pulsar proxy** provides a solution to this problem by acting as a single gateway for all of the brokers in a cluster. If you run the Pulsar proxy (which, again, is optional), all client connections with the Pulsar cluster will flow through the proxy rather than communicating with brokers. + +> For the sake of performance and fault tolerance, you can run as many instances of the Pulsar proxy as you'd like. + +Architecturally, the Pulsar proxy gets all the information it requires from ZooKeeper. 
When starting the proxy on a machine, you only need to provide ZooKeeper connection strings for the cluster-specific and instance-wide configuration store clusters. Here's an example: + +```bash + +$ bin/pulsar proxy \ + --zookeeper-servers zk-0,zk-1,zk-2 \ + --configuration-store-servers zk-0,zk-1,zk-2 + +``` + +> #### Pulsar proxy docs +> For documentation on using the Pulsar proxy, see the [Pulsar proxy admin documentation](administration-proxy.md). + + +Some important things to know about the Pulsar proxy: + +* Connecting clients don't need to provide *any* specific configuration to use the Pulsar proxy. You won't need to update the client configuration for existing applications beyond updating the IP used for the service URL (for example if you're running a load balancer over the Pulsar proxy). +* [TLS encryption](security-tls-transport.md) and [authentication](security-tls-authentication.md) is supported by the Pulsar proxy + +## Service discovery + +[Clients](getting-started-clients.md) connecting to Pulsar brokers need to be able to communicate with an entire Pulsar instance using a single URL. + +You can use your own service discovery system if you'd like. If you use your own system, there is just one requirement: when a client performs an HTTP request to an endpoint, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to *some* active broker in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means. + +The diagram below illustrates Pulsar service discovery: + +![alt-text](/assets/pulsar-service-discovery.png) + +In this diagram, the Pulsar cluster is addressable via a single DNS name: `pulsar-cluster.acme.com`. A [Python client](client-libraries-python.md), for example, could access this Pulsar cluster like this: + +```python + +from pulsar import Client + +client = Client('pulsar://pulsar-cluster.acme.com:6650') + +``` + +:::note + +In Pulsar, each topic is handled by only one broker. 
Initial requests from a client to read, update or delete a topic are sent to a broker that may not be the topic owner. If the broker cannot handle the request for this topic, it redirects the request to the appropriate broker. + +::: + diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-authentication.md b/site2/website/versioned_docs/version-2.9.x/concepts-authentication.md new file mode 100644 index 0000000000000..f6307890c904a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-authentication.md @@ -0,0 +1,9 @@ +--- +id: concepts-authentication +title: Authentication and Authorization +sidebar_label: "Authentication and Authorization" +original_id: concepts-authentication +--- + +Pulsar supports a pluggable [authentication](security-overview.md) mechanism which can be configured at the proxy and/or the broker. Pulsar also supports a pluggable [authorization](security-authorization.md) mechanism. These mechanisms work together to identify the client and its access rights on topics, namespaces and tenants. + diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-clients.md b/site2/website/versioned_docs/version-2.9.x/concepts-clients.md new file mode 100644 index 0000000000000..4040624f7d636 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-clients.md @@ -0,0 +1,92 @@ +--- +id: concepts-clients +title: Pulsar Clients +sidebar_label: "Clients" +original_id: concepts-clients +--- + +Pulsar exposes a client API with language bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md), [C++](client-libraries-cpp.md) and [C#](client-libraries-dotnet.md). The client API optimizes and encapsulates Pulsar's client-broker communication protocol and exposes a simple and intuitive API for use by applications. 
+ +Under the hood, the current official Pulsar client libraries support transparent reconnection and/or connection failover to brokers, queuing of messages until acknowledged by the broker, and heuristics such as connection retries with backoff. + +> **Custom client libraries** +> If you'd like to create your own client library, we recommend consulting the documentation on Pulsar's custom [binary protocol](developing-binary-protocol.md). + + +## Client setup phase + +Before an application creates a producer/consumer, the Pulsar client library needs to initiate a setup phase including two steps: + +1. The client attempts to determine the owner of the topic by sending an HTTP lookup request to the broker. The request could reach one of the active brokers which, by looking at the (cached) ZooKeeper metadata, knows who is serving the topic or, in case nobody is serving it, tries to assign it to the least loaded broker. +1. Once the client library has the broker address, it creates a TCP connection (or reuses an existing connection from the pool) and authenticates it. Within this connection, client and broker exchange binary commands from a custom protocol. At this point, the client sends a command to create a producer/consumer to the broker, which will comply after having validated the authorization policy. + +Whenever the TCP connection breaks, the client immediately re-initiates this setup phase and keeps trying with exponential backoff to re-establish the producer or consumer until the operation succeeds. + +## Reader interface + +In Pulsar, the "standard" [consumer interface](concepts-messaging.md#consumers) involves using consumers to listen on [topics](reference-terminology.md#topic), process incoming messages, and finally acknowledge those messages when they are processed. 
Whenever a new subscription is created, it is initially positioned at the end of the topic (by default), and consumers associated with that subscription begin reading with the first message created afterwards. Whenever a consumer connects to a topic using a pre-existing subscription, it begins reading from the earliest message un-acked within that subscription. In summary, with the consumer interface, subscription cursors are automatically managed by Pulsar in response to [message acknowledgements](concepts-messaging.md#acknowledgement). + +The **reader interface** for Pulsar enables applications to manually manage cursors. When you use a reader to connect to a topic---rather than a consumer---you need to specify *which* message the reader begins reading from when it connects to a topic. When connecting to a topic, the reader interface enables you to begin with: + +* The **earliest** available message in the topic +* The **latest** available message in the topic +* Some other message between the earliest and the latest. If you select this option, you'll need to explicitly provide a message ID. Your application will be responsible for "knowing" this message ID in advance, perhaps fetching it from a persistent data store or cache. + +The reader interface is helpful for use cases like using Pulsar to provide effectively-once processing semantics for a stream processing system. For this use case, it's essential that the stream processing system be able to "rewind" topics to a specific message and begin reading there. The reader interface provides Pulsar clients with the low-level abstraction necessary to "manually position" themselves within a topic. + +Internally, the reader interface is implemented as a consumer using an exclusive, non-durable subscription to the topic with a randomly-allocated name. 
+ +[ **IMPORTANT** ] + +Unlike subscriptions/consumers, readers are non-durable in nature and do not prevent data in a topic from being deleted; thus, it is ***strongly*** advised that [data retention](cookbooks-retention-expiry.md) be configured. If data retention for a topic is not configured for an adequate amount of time, messages that the reader has not yet read might be deleted. This causes the readers to essentially skip messages. Configuring the data retention for a topic guarantees the reader a certain duration to read a message. + +Please also note that a reader can have a "backlog", but the metric is only used for users to know how far behind the reader is. The metric is not considered for any backlog quota calculations. + +![The Pulsar consumer and reader interfaces](/assets/pulsar-reader-consumer-interfaces.png) + +Here's a Java example that begins reading from the earliest available message on a topic: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageId; +import org.apache.pulsar.client.api.Reader; + +// Create a reader on a topic and for a specific message (and onward) +Reader reader = pulsarClient.newReader() + .topic("reader-api-test") + .startMessageId(MessageId.earliest) + .create(); + +while (true) { + Message message = reader.readNext(); + + // Process the message +} + +``` + +To create a reader that reads from the latest available message: + +```java + +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.latest) + .create(); + +``` + +To create a reader that reads from some message between the earliest and the latest: + +```java + +byte[] msgIdBytes = // Some byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-messaging.md 
b/site2/website/versioned_docs/version-2.9.x/concepts-messaging.md new file mode 100644 index 0000000000000..85c52d175b38d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-messaging.md @@ -0,0 +1,714 @@ +--- +id: concepts-messaging +title: Messaging +sidebar_label: "Messaging" +original_id: concepts-messaging +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar is built on the [publish-subscribe](https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern) pattern (often abbreviated to pub-sub). In this pattern, [producers](#producers) publish messages to [topics](#topics); [consumers](#consumers) [subscribe](#subscription-modes) to those topics, process incoming messages, and send [acknowledgements](#acknowledgement) to the broker when processing is finished. + +When a subscription is created, Pulsar [retains](concepts-architecture-overview.md#persistent-storage) all messages, even if the consumer is disconnected. The retained messages are discarded only when a consumer acknowledges that all these messages are processed successfully. + +If the consumption of a message fails and you want this message to be consumed again, then you can enable the automatic redelivery of this message by sending a [negative acknowledgement](#negative-acknowledgement) to the broker or enabling the [acknowledgement timeout](#acknowledgement-timeout) for unacknowledged messages. + +## Messages + +Messages are the basic "unit" of Pulsar. The following table lists the components of messages. + +Component | Description +:---------|:------- +Value / data payload | The data carried by the message. All Pulsar messages contain raw bytes, although message data can also conform to data [schemas](schema-get-started.md). +Key | Messages are optionally tagged with keys, which is useful for things like [topic compaction](concepts-topic-compaction.md). 
+Properties | An optional key/value map of user-defined properties. +Producer name | The name of the producer who produces the message. If you do not specify a producer name, the default name is used. +Sequence ID | Each Pulsar message belongs to an ordered sequence on its topic. The sequence ID of the message is its order in that sequence. +Publish time | The timestamp of when the message is published. The timestamp is automatically applied by the producer. +Event time | An optional timestamp attached to a message by applications. For example, applications attach a timestamp on when the message is processed. If nothing is set to event time, the value is `0`. +TypedMessageBuilder | It is used to construct a message. You can set message properties such as the message key, message value with `TypedMessageBuilder`.
    When you set `TypedMessageBuilder`, set the key as a string. If you set the key as other types, for example, an AVRO object, the key is sent as bytes, and it is difficult to get the AVRO object back on the consumer. + +The default size of a message is 5 MB. You can configure the max size of a message with the following configurations. + +- In the `broker.conf` file. + + ```bash + + # The max size of a message (in bytes). + maxMessageSize=5242880 + + ``` + +- In the `bookkeeper.conf` file. + + ```bash + + # The max size of the netty frame (in bytes). Any messages received larger than this value are rejected. The default value is 5 MB. + nettyMaxFrameSizeBytes=5253120 + + ``` + +> For more information on Pulsar messages, see Pulsar [binary protocol](developing-binary-protocol.md). + +## Producers + +A producer is a process that attaches to a topic and publishes messages to a Pulsar [broker](reference-terminology.md#broker). The Pulsar broker processes the messages. + +### Send modes + +Producers send messages to brokers synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:-----------|-----------| +| Sync send | The producer waits for an acknowledgement from the broker after sending every message. If the acknowledgment is not received, the producer treats the sending operation as a failure. | +| Async send | The producer puts a message in a blocking queue and returns immediately. The client library sends the message to the broker in the background. If the queue is full (you can [configure](reference-configuration.md#broker) the maximum size), the producer is blocked or fails immediately when calling the API, depending on arguments passed to the producer. | + +### Access mode + +You can have different types of access modes on topics for producers. + +|Access mode | Description +|---|--- +`Shared`|Multiple producers can publish on a topic.

    This is the **default** setting. +`Exclusive`|Only one producer can publish on a topic.

    If there is already a producer connected, other producers trying to publish on this topic get errors immediately.

    The “old” producer is evicted and a “new” producer is selected to be the next exclusive producer if the “old” producer experiences a network partition with the broker. +`WaitForExclusive`|If there is already a producer connected, the producer creation is pending (rather than timing out) until the producer gets the `Exclusive` access.

    The producer that succeeds in becoming the exclusive one is treated as the leader. Consequently, if you want to implement the leader election scheme for your application, you can use this access mode. + +:::note + +Once an application creates a producer with `Exclusive` or `WaitForExclusive` access mode successfully, the instance of this application is guaranteed to be the **only writer** to the topic. Any other producers trying to produce messages on this topic will either get errors immediately or have to wait until they get the `Exclusive` access. +For more information, see [PIP 68: Exclusive Producer](https://github.com/apache/pulsar/wiki/PIP-68:-Exclusive-Producer). + +::: + +You can set producer access mode through Java Client API. For more information, see `ProducerAccessMode` in [ProducerBuilder.java](https://github.com/apache/pulsar/blob/fc5768ca3bbf92815d142fe30e6bfad70a1b4fc6/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/ProducerBuilder.java) file. + + +### Compression + +You can compress messages published by producers during transportation. Pulsar currently supports the following types of compression: + +* [LZ4](https://github.com/lz4/lz4) +* [ZLIB](https://zlib.net/) +* [ZSTD](https://facebook.github.io/zstd/) +* [SNAPPY](https://google.github.io/snappy/) + +### Batching + +When batching is enabled, the producer accumulates and sends a batch of messages in a single request. The batch size is defined by the maximum number of messages and the maximum publish latency. Therefore, the backlog size represents the total number of batches instead of the total number of messages. + +In Pulsar, batches are tracked and stored as single units rather than as individual messages. Consumer unbundles a batch into individual messages. However, scheduled messages (configured through the `deliverAt` or the `deliverAfter` parameter) are always sent as individual messages even batching is enabled. 
+ +In general, a batch is acknowledged when all of its messages are acknowledged by a consumer. It means that when **not all** batch messages are acknowledged, then unexpected failures, negative acknowledgements, or acknowledgement timeouts can result in a redelivery of all messages in this batch. + +To avoid redelivering acknowledged messages in a batch to the consumer, Pulsar introduces batch index acknowledgement since Pulsar 2.6.0. When batch index acknowledgement is enabled, the consumer filters out the batch index that has been acknowledged and sends the batch index acknowledgement request to the broker. The broker maintains the batch index acknowledgement status and tracks the acknowledgement status of each batch index to avoid dispatching acknowledged messages to the consumer. The batch is deleted when all indices of the messages in it are acknowledged. + +By default, batch index acknowledgement is disabled (`acknowledgmentAtBatchIndexLevelEnabled=false`). You can enable batch index acknowledgement by setting the `acknowledgmentAtBatchIndexLevelEnabled` parameter to `true` at the broker side. Enabling batch index acknowledgement results in more memory overheads. + +### Chunking +Before you enable chunking, read the following instructions. +- Batching and chunking cannot be enabled simultaneously. To enable chunking, you must disable batching in advance. +- Chunking is only supported for persisted topics. +- Chunking is only supported for the exclusive and failover subscription modes. + +When chunking is enabled (`chunkingEnabled=true`), if the message size is greater than the allowed maximum publish-payload size, the producer splits the original message into chunked messages and publishes them with chunked metadata to the broker separately and in order. At the broker side, the chunked messages are stored in the managed-ledger in the same way as that of ordinary messages. 
The only difference is that the consumer needs to buffer the chunked messages and combine them into the real message when all chunked messages have been collected. The chunked messages in the managed-ledger can be interwoven with ordinary messages. If the producer fails to publish all the chunks of a message, the consumer can expire the incomplete chunks if it fails to receive all chunks within the expiry time. By default, the expiry time is set to one minute. + +The consumer consumes the chunked messages and buffers them until the consumer receives all the chunks of a message. And then the consumer stitches chunked messages together and places them into the receiver-queue. Clients consume messages from the receiver-queue. Once the consumer consumes the entire large message and acknowledges it, the consumer internally sends acknowledgement of all the chunk messages associated with that large message. You can set the `maxPendingChunkedMessage` parameter on the consumer. When the threshold is reached, the consumer drops the unchunked messages by silently acknowledging them or asking the broker to redeliver them later by marking them unacknowledged. + +The broker does not require any changes to support chunking for non-shared subscription. The broker only uses `chunkedMessageRate` to record chunked message rate on the topic. + +#### Handle chunked messages with one producer and one ordered consumer + +As shown in the following figure, a topic has one producer that publishes large message payloads in chunked messages along with regular non-chunked messages. The producer publishes message M1 in three chunks M1-C1, M1-C2 and M1-C3. The broker stores all the three chunked messages in the managed-ledger and dispatches them to the ordered (exclusive/failover) consumer in the same order. The consumer buffers all the chunked messages in memory until it receives all the chunked messages, combines them into one message and then hands over the original message M1 to the client. 
+ +![](/assets/chunking-01.png) + +#### Handle chunked messages with multiple producers and one ordered consumer + +When multiple publishers publish chunked messages into a single topic, the broker stores all the chunked messages coming from different publishers in the same managed-ledger. As shown below, Producer 1 publishes message M1 in three chunks M1-C1, M1-C2 and M1-C3. Producer 2 publishes message M2 in three chunks M2-C1, M2-C2 and M2-C3. All chunked messages of the specific message are still in order but might not be consecutive in the managed-ledger. This brings some memory pressure to the consumer because the consumer keeps a separate buffer for each large message to aggregate all chunks of the large message and combine them into one message. + +![](/assets/chunking-02.png) + +## Consumers + +A consumer is a process that attaches to a topic via a subscription and then receives messages. + +A consumer sends a [flow permit request](developing-binary-protocol.md#flow-control) to a broker to get messages. There is a queue at the consumer side to receive messages pushed from the broker. You can configure the queue size with the [`receiverQueueSize`](client-libraries-java.md#configure-consumer) parameter. The default size is `1000`. Each time `consumer.receive()` is called, a message is dequeued from the buffer. + +### Receive modes + +Messages are received from [brokers](reference-terminology.md#broker) either synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:--------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sync receive | A sync receive is blocked until a message is available. 
| +| Async receive | An async receive returns immediately with a future value—for example, a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) in Java—that completes once a new message is available. | + +### Listeners + +Client libraries provide listener implementation for consumers. For example, the [Java client](client-libraries-java.md) provides a {@inject: javadoc:MesssageListener:/client/org/apache/pulsar/client/api/MessageListener} interface. In this interface, the `received` method is called whenever a new message is received. + +### Acknowledgement + +The consumer sends an acknowledgement request to the broker after it consumes a message successfully. Then, this consumed message will be permanently stored, and be deleted only after all the subscriptions have acknowledged it. If you want to store the messages that have been acknowledged by a consumer, you need to configure the [message retention policy](concepts-messaging.md#message-retention-and-expiry). + +For batch messages, you can enable batch index acknowledgement to avoid dispatching acknowledged messages to the consumer. For details about batch index acknowledgement, see [batching](#batching). + +Messages can be acknowledged in one of the following two ways: + +- Being acknowledged individually. With individual acknowledgement, the consumer acknowledges each message and sends an acknowledgement request to the broker. +- Being acknowledged cumulatively. With cumulative acknowledgement, the consumer **only** acknowledges the last message it received. All messages in the stream up to (and including) the provided message are not redelivered to that consumer. + +If you want to acknowledge messages individually, you can use the following API. + +```java + +consumer.acknowledge(msg); + +``` + +If you want to acknowledge messages cumulatively, you can use the following API. 
+ +```java + +consumer.acknowledgeCumulative(msg); + +``` + +:::note + +Cumulative acknowledgement cannot be used in the [shared subscription mode](#subscription-modes), because the shared subscription mode involves multiple consumers who have access to the same subscription. In the shared subscription mode, messages are acknowledged individually. + +::: + +### Negative acknowledgement + +When a consumer fails to consume a message and intends to consume it again, this consumer should send a negative acknowledgement to the broker. Then, the broker will redeliver this message to the consumer. + +Messages are negatively acknowledged individually or cumulatively, depending on the consumption subscription mode. + +In the exclusive and failover subscription modes, consumers only negatively acknowledge the last message they receive. + +In the shared and Key_Shared subscription modes, consumers can negatively acknowledge messages individually. + +Be aware that negative acknowledgments on ordered subscription types, such as Exclusive, Failover and Key_Shared, might cause failed messages being sent to consumers out of the original order. + +If you want to acknowledge messages negatively, you can use the following API. + +```java + +//With calling this api, messages are negatively acknowledged +consumer.negativeAcknowledge(msg); + +``` + +:::note + +If batching is enabled, all messages in one batch are redelivered to the consumer. + +::: + +### Acknowledgement timeout + +If a message is not consumed successfully, and you want the broker to redeliver this message automatically, then you can enable automatic redelivery mechanism for unacknowledged messages. With automatic redelivery enabled, the client tracks the unacknowledged messages within the entire `acktimeout` time range, and sends a `redeliver unacknowledged messages` request to the broker automatically when the acknowledgement timeout is specified. 
+ +:::note + +- If batching is enabled, all messages in one batch are redelivered to the consumer. +- The negative acknowledgement is preferable over the acknowledgement timeout, since negative acknowledgement controls the redelivery of individual messages more precisely and avoids invalid redeliveries when the message processing time exceeds the acknowledgement timeout. + +::: + +### Dead letter topic + +Dead letter topic enables you to consume new messages when some messages cannot be consumed successfully by a consumer. In this mechanism, messages that are failed to be consumed are stored in a separate topic, which is called dead letter topic. You can decide how to handle messages in the dead letter topic. + +The following example shows how to enable dead letter topic in a Java client using the default dead letter topic: + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .build()) + .subscribe(); + +``` + +The default dead letter topic uses this format: + +``` + +--DLQ + +``` + +If you want to specify the name of the dead letter topic, use this Java client example: + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .deadLetterTopic("your-topic-name") + .build()) + .subscribe(); + +``` + +Dead letter topic depends on message redelivery. Messages are redelivered either due to [acknowledgement timeout](#acknowledgement-timeout) or [negative acknowledgement](#negative-acknowledgement). If you are going to use negative acknowledgement on a message, make sure it is negatively acknowledged before the acknowledgement timeout. 
+ +:::note + +Currently, dead letter topic is enabled in the Shared and Key_Shared subscription modes. + +::: + +### Retry letter topic + +For many online business systems, a message is re-consumed because an exception occurs in the business logic processing. To configure the delay time for re-consuming the failed messages, you can configure the producer to send messages to both the business topic and the retry letter topic, and enable automatic retry on the consumer. When automatic retry is enabled on the consumer, a message is stored in the retry letter topic if it is not consumed, and therefore the consumer automatically consumes the failed messages from the retry letter topic after a specified delay time. + +By default, automatic retry is disabled. You can set `enableRetry` to `true` to enable automatic retry on the consumer. + +This example shows how to consume messages from a retry letter topic. + +```java + +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .enableRetry(true) + .receiverQueueSize(100) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .retryLetterTopic("persistent://my-property/my-ns/my-subscription-custom-Retry") + .build()) + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + +``` + +If you want to put messages into a retrial queue, you can use the following API. + +```java + +consumer.reconsumeLater(msg,3,TimeUnit.SECONDS); + +``` + +## Topics + +As in other pub-sub systems, topics in Pulsar are named channels for transmitting messages from producers to consumers. Topic names are URLs that have a well-defined structure: + +```http + +{persistent|non-persistent}://tenant/namespace/topic + +``` + +Topic name component | Description +:--------------------|:----------- +`persistent` / `non-persistent` | This identifies the type of topic. 
Pulsar supports two kinds of topics: [persistent](concepts-architecture-overview.md#persistent-storage) and [non-persistent](#non-persistent-topics). The default is persistent, so if you do not specify a type, the topic is persistent. With persistent topics, all messages are durably persisted on disks (if the broker is not standalone, messages are durably persisted on multiple disks), whereas data for non-persistent topics is not persisted to storage disks. +`tenant` | The topic tenant within the instance. Tenants are essential to multi-tenancy in Pulsar, and spread across clusters. +`namespace` | The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the [namespace](#namespaces) level. Each tenant has one or multiple namespaces. +`topic` | The final part of the name. Topic names have no special meaning in a Pulsar instance. + +> **No need to explicitly create new topics** +> You do not need to explicitly create topics in Pulsar. If a client attempts to write or receive messages to/from a topic that does not yet exist, Pulsar creates that topic under the namespace provided in the [topic name](#topics) automatically. +> If no tenant or namespace is specified when a client creates a topic, the topic is created in the default tenant and namespace. You can also create a topic in a specified tenant and namespace, such as `persistent://my-tenant/my-namespace/my-topic`. `persistent://my-tenant/my-namespace/my-topic` means the `my-topic` topic is created in the `my-namespace` namespace of the `my-tenant` tenant. + +## Namespaces + +A namespace is a logical nomenclature within a tenant. A tenant creates multiple namespaces via the [admin API](admin-api-namespaces.md#create). For instance, a tenant with different applications can create a separate namespace for each application. A namespace allows the application to create and manage a hierarchy of topics. 
The topic `my-tenant/app1` is a namespace for the application `app1` for `my-tenant`. You can create any number of [topics](#topics) under the namespace. + +## Subscriptions + +A subscription is a named configuration rule that determines how messages are delivered to consumers. Four subscription modes are available in Pulsar: [exclusive](#exclusive), [shared](#shared), [failover](#failover), and [key_shared](#key_shared). These modes are illustrated in the figure below. + +![Subscription modes](/assets/pulsar-subscription-types.png) + +> **Pub-Sub or Queuing** +> In Pulsar, you can use different subscriptions flexibly. +> * If you want to achieve traditional "fan-out pub-sub messaging" among consumers, specify a unique subscription name for each consumer. It is exclusive subscription mode. +> * If you want to achieve "message queuing" among consumers, share the same subscription name among multiple consumers(shared, failover, key_shared). +> * If you want to achieve both effects simultaneously, combine exclusive subscription mode with other subscription modes for consumers. + +### Consumerless Subscriptions and Their Corresponding Modes +When a subscription has no consumers, its subscription mode is undefined. A subscription's mode is defined when a consumer connects to the subscription, and the mode can be changed by restarting all consumers with a different configuration. + +### Exclusive + +In *exclusive* mode, only a single consumer is allowed to attach to the subscription. If multiple consumers subscribe to a topic using the same subscription, an error occurs. + +In the diagram below, only **Consumer A-0** is allowed to consume messages. + +> Exclusive mode is the default subscription mode. + +![Exclusive subscriptions](/assets/pulsar-exclusive-subscriptions.png) + +### Failover + +In *failover* mode, multiple consumers can attach to the same subscription. 
A master consumer is picked for a non-partitioned topic or for each partition of a partitioned topic, and it receives messages. When the master consumer disconnects, all (non-acknowledged and subsequent) messages are delivered to the next consumer in line. + +For partitioned topics, the broker sorts consumers by priority level and lexicographical order of consumer name. Then the broker tries to evenly assign topics to consumers with the highest priority level. + +For non-partitioned topics, the broker picks consumers in the order they subscribe to the non-partitioned topic. + +In the diagram below, **Consumer-B-0** is the master consumer while **Consumer-B-1** would be the next consumer in line to receive messages if **Consumer-B-0** is disconnected. + +![Failover subscriptions](/assets/pulsar-failover-subscriptions.png) + +### Shared + +In *shared* or *round robin* mode, multiple consumers can attach to the same subscription. Messages are delivered in a round robin distribution across consumers, and any given message is delivered to only one consumer. When a consumer disconnects, all the messages that were sent to it and not acknowledged will be rescheduled for sending to the remaining consumers. + +In the diagram below, **Consumer-C-1** and **Consumer-C-2** are able to subscribe to the topic, but **Consumer-C-3** and others could as well. + +> **Limitations of shared mode** +> When using shared mode, be aware that: +> * Message ordering is not guaranteed. +> * You cannot use cumulative acknowledgment with shared mode. + +![Shared subscriptions](/assets/pulsar-shared-subscriptions.png) + +### Key_Shared + +In *Key_Shared* mode, multiple consumers can attach to the same subscription. Messages are delivered in a distribution across consumers, and messages with the same key or same ordering key are delivered to only one consumer. No matter how many times the message is re-delivered, it is delivered to the same consumer. 
When a consumer connects or disconnects, the consumer that serves some message keys changes. + +![Key_Shared subscriptions](/assets/pulsar-key-shared-subscriptions.png) + +Note that when the consumers are using the Key_Shared subscription mode, you need to **disable batching** or **use key-based batching** for the producers. There are two reasons why the key-based batching is necessary for Key_Shared subscription mode: +1. The broker dispatches messages according to the keys of the messages, but the default batching approach might fail to pack the messages with the same key to the same batch. +2. Since it is the consumers instead of the broker who dispatch the messages from the batches, the key of the first message in one batch is considered as the key of all messages in this batch, thereby leading to context errors. + +The key-based batching aims at resolving the above-mentioned issues. This batching method ensures that the producers pack the messages with the same key to the same batch. The messages without a key are packed into one batch and this batch has no key. When the broker dispatches messages from this batch, it uses `NON_KEY` as the key. In addition, each consumer is associated with **only one** key and should receive **only one message batch** for the connected key. By default, you can limit batching by configuring the number of messages that producers are allowed to send. + +Below are examples of enabling the key-based batching under the Key_Shared subscription mode, with `client` being the Pulsar client that you created. 
+ +````mdx-code-block + + + +``` + +Producer producer = client.newProducer() + .topic("my-topic") + .batcherBuilder(BatcherBuilder.KEY_BASED) + .create(); + +``` + + + + +``` + +ProducerConfiguration producerConfig; +producerConfig.setBatchingType(ProducerConfiguration::BatchingType::KeyBasedBatching); +Producer producer; +client.createProducer("my-topic", producerConfig, producer); + +``` + + + + +``` + +producer = client.create_producer(topic='my-topic', batching_type=pulsar.BatchingType.KeyBased) + +``` + + + + +```` + +> **Limitations of Key_Shared mode** +> When you use Key_Shared mode, be aware that: +> * You need to specify a key or orderingKey for messages. +> * You cannot use cumulative acknowledgment with Key_Shared mode. + +## Multi-topic subscriptions + +When a consumer subscribes to a Pulsar topic, by default it subscribes to one specific topic, such as `persistent://public/default/my-topic`. As of Pulsar version 1.23.0-incubating, however, Pulsar consumers can simultaneously subscribe to multiple topics. You can define a list of topics in two ways: + +* On the basis of a [**reg**ular **ex**pression](https://en.wikipedia.org/wiki/Regular_expression) (regex), for example `persistent://public/default/finance-.*` +* By explicitly defining a list of topics + +> When subscribing to multiple topics by regex, all topics must be in the same [namespace](#namespaces). + +When subscribing to multiple topics, the Pulsar client automatically makes a call to the Pulsar API to discover the topics that match the regex pattern/list, and then subscribe to all of them. If any of the topics do not exist, the consumer auto-subscribes to them once the topics are created. + +> **No ordering guarantees across multiple topics** +> When a producer sends messages to a single topic, all messages are guaranteed to be read from that topic in the same order. However, these guarantees do not hold across multiple topics. 
So when a producer sends message to multiple topics, the order in which messages are read from those topics is not guaranteed to be the same. + +The following are multi-topic subscription examples for Java. + +```java + +import java.util.regex.Pattern; + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient pulsarClient = // Instantiate Pulsar client object + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*"); +Consumer allTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(allTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*"); +Consumer someTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(someTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +``` + +For code examples, see [Java](client-libraries-java.md#multi-topic-subscriptions). + +## Partitioned topics + +Normal topics are served only by a single broker, which limits the maximum throughput of the topic. *Partitioned topics* are a special type of topic that are handled by multiple brokers, thus allowing for higher throughput. + +A partitioned topic is actually implemented as N internal topics, where N is the number of partitions. When publishing messages to a partitioned topic, each message is routed to one of several brokers. The distribution of partitions across brokers is handled automatically by Pulsar. + +The diagram below illustrates this: + +![](/assets/partitioning.png) + +The **Topic1** topic has five partitions (**P0** through **P4**) split across three brokers. 
Because there are more partitions than brokers, two brokers handle two partitions apiece, while the third handles only one (again, Pulsar handles this distribution of partitions automatically). + +Messages for this topic are broadcast to two consumers. The [routing mode](#routing-modes) determines which partition each message should be published to, while the [subscription mode](#subscription-modes) determines which messages go to which consumers. + +Decisions about routing and subscription modes can be made separately in most cases. In general, throughput concerns should guide partitioning/routing decisions while subscription decisions should be guided by application semantics. + +There is no difference between partitioned topics and normal topics in terms of how subscription modes work, as partitioning only determines what happens between when a message is published by a producer and processed and acknowledged by a consumer. + +Partitioned topics need to be explicitly created via the [admin API](admin-api-overview.md). The number of partitions can be specified when creating the topic. + +### Routing modes + +When publishing to partitioned topics, you must specify a *routing mode*. The routing mode determines which partition---that is, which internal topic---each message should be published to. + +There are three {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} available: + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer will publish messages across all partitions in round-robin fashion to achieve maximum throughput. Please note that round-robin is not done per individual message but rather it's set to the same boundary of batching delay, to ensure batching is effective. While if a key is specified on the message, the partitioned producer will hash the key and assign the message to a particular partition. This is the default mode. 
+`SinglePartition` | If no key is provided, the producer will randomly pick one single partition and publish all the messages into that partition. While if a key is specified on the message, the partitioned producer will hash the key and assign message to a particular partition. +`CustomPartition` | Use custom message router implementation that will be called to determine the partition for a particular message. User can create a custom routing mode by using the [Java client](client-libraries-java.md) and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface. + +### Ordering guarantee + +The ordering of messages is related to MessageRoutingMode and Message Key. Usually, user would want an ordering of Per-key-partition guarantee. + +If there is a key attached to message, the messages will be routed to corresponding partitions based on the hashing scheme specified by {@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} in {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder}, when using either `SinglePartition` or `RoundRobinPartition` mode. + +Ordering guarantee | Description | Routing Mode and Key +:------------------|:------------|:------------ +Per-key-partition | All the messages with the same key will be in order and be placed in same partition. | Use either `SinglePartition` or `RoundRobinPartition` mode, and Key is provided by each message. +Per-producer | All the messages from the same producer will be in order. | Use `SinglePartition` mode, and no Key is provided for each message. + +### Hashing scheme + +{@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} is an enum that represent sets of standard hashing functions available when choosing the partition to use for a particular message. + +There are 2 types of standard hashing functions available: `JavaStringHash` and `Murmur3_32Hash`. 
+The default hashing function for producers is `JavaStringHash`. +Note that `JavaStringHash` is not useful when producers come from clients written in different languages; in that case, it is recommended to use `Murmur3_32Hash`. + + + +## Non-persistent topics + + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](concepts-architecture-overview.md#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. When using non-persistent delivery, killing a Pulsar broker or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http + +non-persistent://tenant/namespace/topic + +``` + +> For more info on using non-persistent topics, see the [Non-persistent messaging cookbook](cookbooks-non-persistent.md). + +In non-persistent topics, brokers immediately deliver messages to all connected subscribers *without persisting them* in [BookKeeper](concepts-architecture-overview.md#persistent-storage). If a subscriber is disconnected, the broker will not be able to deliver those in-transit messages, and subscribers will never be able to receive those messages again. Eliminating the persistent storage step makes messaging on non-persistent topics slightly faster than on persistent topics in some cases, but with the caveat that some of the core benefits of Pulsar are lost. + +> With non-persistent topics, message data lives only in memory. If a message broker fails or message data can otherwise not be retrieved from memory, your message data may be lost. 
Use non-persistent topics only if you're *certain* that your use case requires it and can sustain it. + +By default, non-persistent topics are enabled on Pulsar brokers. You can disable them in the broker's [configuration](reference-configuration.md#broker-enableNonPersistentTopics). You can manage non-persistent topics using the `pulsar-admin topics` command. For more information, see [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/). + +### Performance + +Non-persistent messaging is usually faster than persistent messaging because brokers don't persist messages and immediately send acks back to the producer as soon as that message is delivered to connected brokers. Producers thus see comparatively low publish latency with non-persistent topic. + +### Client API + +Producers and consumers can connect to non-persistent topics in the same way as persistent topics, with the crucial difference that the topic name must start with `non-persistent`. All three subscription modes---[exclusive](#exclusive), [shared](#shared), and [failover](#failover)---are supported for non-persistent topics. + +Here's an example [Java consumer](client-libraries-java.md#consumers) for a non-persistent topic: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +String npTopic = "non-persistent://public/default/my-topic"; +String subscriptionName = "my-subscription-name"; + +Consumer consumer = client.newConsumer() + .topic(npTopic) + .subscriptionName(subscriptionName) + .subscribe(); + +``` + +Here's an example [Java producer](client-libraries-java.md#producer) for the same non-persistent topic: + +```java + +Producer producer = client.newProducer() + .topic(npTopic) + .create(); + +``` + + +## System topic + +System topic is a predefined topic for internal use within Pulsar. It can be either persistent or non-persistent topic. 
+ +System topics serve to implement certain features and eliminate dependencies on third-party components, such as transactions, heartbeat detections, topic-level policies, and resource group services. System topics empower the implementation of these features to be simplified, dependent, and flexible. Take heartbeat detection as an example: you can leverage the system topic for health checks to internally enable a producer/reader to produce/consume messages under the heartbeat namespace, which makes it possible to detect whether the current service is still alive. + +There are diverse system topics depending on namespaces. The following table outlines the available system topics for each specific namespace. + +| Namespace | TopicName | Domain | Count | Usage | +|-----------|-----------|--------|-------|-------| +| pulsar/system | `transaction_coordinator_assign_${id}` | Persistent | Default 16 | Transaction coordinator | +| pulsar/system | `_transaction_log${tc_id}` | Persistent | Default 16 | Transaction log | +| pulsar/system | `resource-usage` | Non-persistent | Default 4 | Resource group service | +| host/port | `heartbeat` | Persistent | 1 | Heartbeat detection | +| User-defined-ns | [`__change_events`](concepts-multi-tenancy.md#namespace-change-events-and-topic-level-policies) | Persistent | Default 4 | Topic events | +| User-defined-ns | `__transaction_buffer_snapshot` | Persistent | One per namespace | Transaction buffer snapshots | +| User-defined-ns | `${topicName}__transaction_pending_ack` | Persistent | One per every topic subscription acknowledged with transactions | Acknowledgements with transactions | + +:::note + +* You cannot create any system topics. +* By default, system topics are disabled. To enable system topics, you need to change the following configurations in the `conf/broker.conf` or `conf/standalone.conf` file. 
+ + ```conf + systemTopicEnabled=true + topicLevelPoliciesEnabled=true + ``` + +::: + + +## Message retention and expiry + +By default, Pulsar message brokers: + +* immediately delete *all* messages that have been acknowledged by a consumer, and +* [persistently store](concepts-architecture-overview.md#persistent-storage) all unacknowledged messages in a message backlog. + +Pulsar has two features, however, that enable you to override this default behavior: + +* Message **retention** enables you to store messages that have been acknowledged by a consumer +* Message **expiry** enables you to set a time to live (TTL) for messages that have not yet been acknowledged + +> All message retention and expiry is managed at the [namespace](#namespaces) level. For a how-to, see the [Message retention and expiry](cookbooks-retention-expiry.md) cookbook. + +The diagram below illustrates both concepts: + +![Message retention and expiry](/assets/retention-expiry.png) + +With message retention, shown at the top, a retention policy applied to all topics in a namespace dictates that some messages are durably stored in Pulsar even though they've already been acknowledged. Acknowledged messages that are not covered by the retention policy are deleted. Without a retention policy, *all* of the acknowledged messages would be deleted. + +With message expiry, shown at the bottom, some messages are deleted, even though they haven't been acknowledged, because they've expired according to the TTL applied to the namespace (for example because a TTL of 5 minutes has been applied and the messages haven't been acknowledged but are 10 minutes old). + +## Message deduplication + +Message duplication occurs when a message is [persisted](concepts-architecture-overview.md#persistent-storage) by Pulsar more than once. Message deduplication is an optional Pulsar feature that prevents unnecessary message duplication by processing each message only once, even if the message is received more than once. 
+ +The following diagram illustrates what happens when message deduplication is disabled vs. enabled: + +![Pulsar message deduplication](/assets/message-deduplication.png) + + +Message deduplication is disabled in the scenario shown at the top. Here, a producer publishes message 1 on a topic; the message reaches a Pulsar broker and is [persisted](concepts-architecture-overview.md#persistent-storage) to BookKeeper. The producer then sends message 1 again (in this case due to some retry logic), and the message is received by the broker and stored in BookKeeper again, which means that duplication has occurred. + +In the second scenario at the bottom, the producer publishes message 1, which is received by the broker and persisted, as in the first scenario. When the producer attempts to publish the message again, however, the broker knows that it has already seen message 1 and thus does not persist the message. + +> Message deduplication is handled at the namespace level or the topic level. For more instructions, see the [message deduplication cookbook](cookbooks-deduplication.md). + + +### Producer idempotency + +The other available approach to message deduplication is to ensure that each message is *only produced once*. This approach is typically called **producer idempotency**. The drawback of this approach is that it defers the work of message deduplication to the application. In Pulsar, this is handled at the [broker](reference-terminology.md#broker) level, so you do not need to modify your Pulsar client code. Instead, you only need to make administrative changes. For details, see [Managing message deduplication](cookbooks-deduplication.md). + +### Deduplication and effectively-once semantics + +Message deduplication makes Pulsar an ideal messaging system to be used in conjunction with stream processing engines (SPEs) and other systems seeking to provide effectively-once processing semantics. 
Messaging systems that do not offer automatic message deduplication require the SPE or other system to guarantee deduplication, which means that strict message ordering comes at the cost of burdening the application with the responsibility of deduplication. With Pulsar, strict ordering guarantees come at no application-level cost. + +> You can find more in-depth information in [this post](https://www.splunk.com/en_us/blog/it/exactly-once-is-not-exactly-the-same.html). + +## Delayed message delivery +Delayed message delivery enables you to consume a message later rather than immediately. In this mechanism, a message is stored in BookKeeper, `DelayedDeliveryTracker` maintains the time index(time -> messageId) in memory after published to a broker, and it is delivered to a consumer once the specific delayed time is passed. + +Delayed message delivery only works in Shared subscription mode. In Exclusive and Failover subscription modes, the delayed message is dispatched immediately. + +The diagram below illustrates the concept of delayed message delivery: + +![Delayed Message Delivery](/assets/message_delay.png) + +A broker saves a message without any check. When a consumer consumes a message, if the message is set to delay, then the message is added to `DelayedDeliveryTracker`. A subscription checks and gets timeout messages from `DelayedDeliveryTracker`. + +### Broker +Delayed message delivery is enabled by default. You can change it in the broker configuration file as below: + +``` + +# Whether to enable the delayed delivery for messages. +# If disabled, messages are immediately delivered and there is no tracking overhead. +delayedDeliveryEnabled=true + +# Control the ticking time for the retry of delayed message delivery, +# affecting the accuracy of the delivery time compared to the scheduled time. +# Default is 1 second. 
+delayedDeliveryTickTimeMillis=1000 + +``` + +### Producer +The following is an example of delayed message delivery for a producer in Java: + +```java + +// message to be delivered at the configured delay interval +producer.newMessage().deliverAfter(3L, TimeUnit.MINUTES).value("Hello Pulsar!").send(); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-multi-tenancy.md b/site2/website/versioned_docs/version-2.9.x/concepts-multi-tenancy.md new file mode 100644 index 0000000000000..93a59557b2efc --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-multi-tenancy.md @@ -0,0 +1,67 @@ +--- +id: concepts-multi-tenancy +title: Multi Tenancy +sidebar_label: "Multi Tenancy" +original_id: concepts-multi-tenancy +--- + +Pulsar was created from the ground up as a multi-tenant system. To support multi-tenancy, Pulsar has a concept of tenants. Tenants can be spread across clusters and can each have their own [authentication and authorization](security-overview.md) scheme applied to them. They are also the administrative unit at which storage quotas, [message TTL](cookbooks-retention-expiry.md#time-to-live-ttl), and isolation policies can be managed. + +The multi-tenant nature of Pulsar is reflected most visibly in topic URLs, which have this structure: + +```http + +persistent://tenant/namespace/topic + +``` + +As you can see, the tenant is the most basic unit of categorization for topics (more fundamental than the namespace and topic name). + +## Tenants + +To each tenant in a Pulsar instance you can assign: + +* An [authorization](security-authorization.md) scheme +* The set of [clusters](reference-terminology.md#cluster) to which the tenant's configuration applies + +## Namespaces + +Tenants and namespaces are two key concepts of Pulsar to support multi-tenancy. + +* Pulsar is provisioned for specified tenants with appropriate capacity allocated to the tenant. +* A namespace is the administrative unit nomenclature within a tenant. 
The configuration policies set on a namespace apply to all the topics created in that namespace. A tenant may create multiple namespaces via self-administration using the REST API and the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. For instance, a tenant with different applications can create a separate namespace for each application. + +Names for topics in the same namespace will look like this: + +```http + +persistent://tenant/app1/topic-1 + +persistent://tenant/app1/topic-2 + +persistent://tenant/app1/topic-3 + +``` + +### Namespace change events and topic-level policies + +Pulsar is a multi-tenant event streaming system. Administrators can manage the tenants and namespaces by setting policies at different levels. However, the policies, such as retention policy and storage quota policy, are only available at a namespace level. In many use cases, users need to set a policy at the topic level. The namespace change events approach is proposed for supporting topic-level policies in an efficient way. In this approach, Pulsar is used as an event log to store namespace change events (such as topic policy changes). This approach has a few benefits: +- Avoid using ZooKeeper and introducing more loads to ZooKeeper. +- Use Pulsar as an event log for propagating the policy cache. It can scale efficiently. +- Use Pulsar SQL to query the namespace changes and audit the system. + +Each namespace has a [system topic](concepts-messaging.md#system-topic) named `__change_events`. This system topic stores change events for a given namespace. The following figure illustrates how to leverage it to update topic-level policies. + +![Leverage the system topic to update topic-level policies](/assets/system-topic-for-topic-level-policies.svg) + +1. Pulsar Admin clients communicate with the Admin Restful API to update topic-level policies. +2. 
Any broker that receives the Admin HTTP request publishes a topic policy change event to the corresponding system topic (`__change_events`) of the namespace. +3. Each broker that owns a namespace bundle(s) subscribes to the system topic (`__change_events`) to receive the change events of the namespace. +4. Each broker applies the change events to its policy cache. +5. Once the policy cache is updated, the broker sends the response back to the Pulsar Admin clients. + +:::note + +By default, the system topic is disabled. To enable topic-level policy (`topicLevelPoliciesEnabled`=`true`), you need to enable the system topic by setting `systemTopicEnabled` to `true` in the `conf/broker.conf` or `conf/standalone.conf` file. + +::: \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-multiple-advertised-listeners.md b/site2/website/versioned_docs/version-2.9.x/concepts-multiple-advertised-listeners.md new file mode 100644 index 0000000000000..f2e1ae0aadc7c --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-multiple-advertised-listeners.md @@ -0,0 +1,44 @@ +--- +id: concepts-multiple-advertised-listeners +title: Multiple advertised listeners +sidebar_label: "Multiple advertised listeners" +original_id: concepts-multiple-advertised-listeners +--- + +When a Pulsar cluster is deployed in the production environment, you may need to expose multiple advertised addresses for the broker. For example, when you deploy a Pulsar cluster in Kubernetes and want other clients, which are not in the same Kubernetes cluster, to connect to the Pulsar cluster, you need to assign a broker URL to external clients. But clients in the same Kubernetes cluster can still connect to the Pulsar cluster through the internal network of Kubernetes. 
+ +## Advertised listeners + +To ensure clients in both internal and external networks can connect to a Pulsar cluster, Pulsar introduces `advertisedListeners` and `internalListenerName` configuration options into the [broker configuration file](reference-configuration.md#broker) to ensure that the broker supports exposing multiple advertised listeners and supports the separation of internal and external network traffic. + +- The `advertisedListeners` is used to specify multiple advertised listeners. The broker uses the listener as the broker identifier in the load manager and the bundle owner data. The `advertisedListeners` is formatted as `<listener_name>:pulsar://<host>:<port>, <listener_name>:pulsar+ssl://<host>:<port>`. You can set up the `advertisedListeners` like +`advertisedListeners=internal:pulsar://192.168.1.11:6660,internal:pulsar+ssl://192.168.1.11:6651`. + +- The `internalListenerName` is used to specify the internal service URL that the broker uses. You can specify the `internalListenerName` by choosing one of the `advertisedListeners`. The broker uses the listener name of the first advertised listener as the `internalListenerName` if the `internalListenerName` is absent. + +After setting up the `advertisedListeners`, clients can choose one of the listeners as the service URL to create a connection to the broker as long as the network is accessible. However, if the client creates producers or consumers on a topic, the client must send a lookup request to the broker to get the owner broker, then connect to the owner broker to publish or consume messages. Therefore, you must allow the client to get the corresponding service URL with the same advertised listener name as the one used by the client. This helps keep the client side simple and secure. + +## Use multiple advertised listeners + +This example shows how a Pulsar client uses multiple advertised listeners. + +1. Configure multiple advertised listeners in the broker configuration file. 
+ +```shell + +advertisedListeners={listenerName}:pulsar://xxxx:6650, +{listenerName}:pulsar+ssl://xxxx:6651 + +``` + +2. Specify the listener name for the client. + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://xxxx:6650") + .listenerName("external") + .build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-overview.md b/site2/website/versioned_docs/version-2.9.x/concepts-overview.md new file mode 100644 index 0000000000000..c643aa0ce7bbc --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-overview.md @@ -0,0 +1,31 @@ +--- +id: concepts-overview +title: Pulsar Overview +sidebar_label: "Overview" +original_id: concepts-overview +--- + +Pulsar is a multi-tenant, high-performance solution for server-to-server messaging. Originally developed by Yahoo, Pulsar is under the stewardship of the [Apache Software Foundation](https://www.apache.org/). + +Key features of Pulsar are listed below: + +* Native support for multiple clusters in a Pulsar instance, with seamless [geo-replication](administration-geo.md) of messages across clusters. +* Very low publish and end-to-end latency. +* Seamless scalability to over a million topics. +* A simple [client API](concepts-clients.md) with bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). +* Multiple [subscription modes](concepts-messaging.md#subscription-modes) ([exclusive](concepts-messaging.md#exclusive), [shared](concepts-messaging.md#shared), and [failover](concepts-messaging.md#failover)) for topics. +* Guaranteed message delivery with [persistent message storage](concepts-architecture-overview.md#persistent-storage) provided by [Apache BookKeeper](http://bookkeeper.apache.org/). +* A serverless light-weight computing framework [Pulsar Functions](functions-overview.md) offers the capability for stream-native data processing. 
+* A serverless connector framework [Pulsar IO](io-overview.md), which is built on Pulsar Functions, makes it easier to move data in and out of Apache Pulsar. +* [Tiered Storage](concepts-tiered-storage.md) offloads data from hot/warm storage to cold/long-term storage (such as S3 and GCS) when the data is aging out. + +## Contents + +- [Messaging Concepts](concepts-messaging.md) +- [Architecture Overview](concepts-architecture-overview.md) +- [Pulsar Clients](concepts-clients.md) +- [Geo Replication](concepts-replication.md) +- [Multi Tenancy](concepts-multi-tenancy.md) +- [Authentication and Authorization](concepts-authentication.md) +- [Topic Compaction](concepts-topic-compaction.md) +- [Tiered Storage](concepts-tiered-storage.md) diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-proxy-sni-routing.md b/site2/website/versioned_docs/version-2.9.x/concepts-proxy-sni-routing.md new file mode 100644 index 0000000000000..7eee6df5512a2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-proxy-sni-routing.md @@ -0,0 +1,180 @@ +--- +id: concepts-proxy-sni-routing +title: Proxy support with SNI routing +sidebar_label: "Proxy support with SNI routing" +original_id: concepts-proxy-sni-routing +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +A proxy server is an intermediary server that forwards requests from multiple clients to different servers across the Internet. The proxy server acts as a "traffic cop" in both forward and reverse proxy scenarios, and benefits your system such as load balancing, performance, security, auto-scaling, and so on. + +The proxy in Pulsar acts as a reverse proxy, and creates a gateway in front of brokers. Proxies such as Apache Traffic Server (ATS), HAProxy, Nginx, and Envoy are not supported in Pulsar. These proxy-servers support **SNI routing**. SNI routing is used to route traffic to a destination without terminating the SSL connection. 
Layer 4 routing provides greater transparency because the outbound connection is determined by examining the destination address in the client TCP packets. + +Pulsar clients (Java, C++, Python) support [SNI routing protocol](https://github.com/apache/pulsar/wiki/PIP-60:-Support-Proxy-server-with-SNI-routing), so you can connect to brokers through the proxy. This document walks you through how to set up the ATS proxy, enable SNI routing, and connect Pulsar client to the broker through the ATS proxy. + +## ATS-SNI Routing in Pulsar +To support [layer-4 SNI routing](https://docs.trafficserver.apache.org/en/latest/admin-guide/layer-4-routing.en.html) with ATS, the inbound connection must be a TLS connection. Pulsar client supports SNI routing protocol on TLS connection, so when Pulsar clients connect to broker through ATS proxy, Pulsar uses ATS as a reverse proxy. + +Pulsar supports SNI routing for geo-replication, so brokers can connect to brokers in other clusters through the ATS proxy. + +This section explains how to set up and use ATS as a reverse proxy, so Pulsar clients can connect to brokers through the ATS proxy using the SNI routing protocol on TLS connection. + +### Set up ATS Proxy for layer-4 SNI routing +To support layer 4 SNI routing, you need to configure the `records.conf` and `ssl_server_name.conf` files. + +![Pulsar client SNI](/assets/pulsar-sni-client.png) + +The [records.config](https://docs.trafficserver.apache.org/en/latest/admin-guide/files/records.config.en.html) file is located in the `/usr/local/etc/trafficserver/` directory by default. The file lists configurable variables used by the ATS. + +To configure the `records.config` files, complete the following steps. +1. Update TLS port (`http.server_ports`) on which proxy listens, and update proxy certs (`ssl.client.cert.path` and `ssl.client.cert.filename`) to secure TLS tunneling. +2. Configure server ports (`http.connect_ports`) used for tunneling to the broker. 
If Pulsar brokers are listening on `4443` and `6651` ports, add the brokers service port in the `http.connect_ports` configuration. + +The following is an example. + +``` + +# PROXY TLS PORT +CONFIG proxy.config.http.server_ports STRING 4443:ssl 4080 +# PROXY CERTS FILE PATH +CONFIG proxy.config.ssl.client.cert.path STRING /proxy-cert.pem +# PROXY KEY FILE PATH +CONFIG proxy.config.ssl.client.cert.filename STRING /proxy-key.pem + + +# The range of origin server ports that can be used for tunneling via CONNECT. # Traffic Server allows tunnels only to the specified ports. Supports both wildcards (*) and ranges (e.g. 0-1023). +CONFIG proxy.config.http.connect_ports STRING 4443 6651 + +``` + +The `ssl_server_name` file is used to configure TLS connection handling for inbound and outbound connections. The configuration is determined by the SNI values provided by the inbound connection. The file consists of a set of configuration items, and each is identified by an SNI value (`fqdn`). When an inbound TLS connection is made, the SNI value from the TLS negotiation is matched with the items specified in this file. If the values match, the values specified in that item override the default values. + +The following example shows mapping of the inbound SNI hostname coming from the client, and the actual broker service URL where request should be redirected. For example, if the client sends the SNI header `pulsar-broker1`, the proxy creates a TLS tunnel by redirecting request to the `pulsar-broker1:6651` service URL. 
+ +``` + +server_config = { + { + fqdn = 'pulsar-broker-vip', + # Forward to Pulsar broker which is listening on 6651 + tunnel_route = 'pulsar-broker-vip:6651' + }, + { + fqdn = 'pulsar-broker1', + # Forward to Pulsar broker-1 which is listening on 6651 + tunnel_route = 'pulsar-broker1:6651' + }, + { + fqdn = 'pulsar-broker2', + # Forward to Pulsar broker-2 which is listening on 6651 + tunnel_route = 'pulsar-broker2:6651' + }, +} + +``` + +After you configure the `ssl_server_name.config` and `records.config` files, the ATS-proxy server handles SNI routing and creates TCP tunnel between the client and the broker. + +### Configure Pulsar-client with SNI routing +ATS SNI-routing works only with TLS. You need to enable TLS for the ATS proxy and brokers first, configure the SNI routing protocol, and then connect Pulsar clients to brokers through ATS proxy. Pulsar clients support SNI routing by connecting to the proxy, and sending the target broker URL to the SNI header. This process is processed internally. You only need to configure the following proxy configuration initially when you create a Pulsar client to use the SNI routing protocol. 
+ +````mdx-code-block + + + + +```java + +String brokerServiceUrl = "pulsar+ssl://pulsar-broker-vip:6651/"; +String proxyUrl = "pulsar+ssl://ats-proxy:443"; +ClientBuilder clientBuilder = PulsarClient.builder() + .serviceUrl(brokerServiceUrl) + .tlsTrustCertsFilePath(TLS_TRUST_CERT_FILE_PATH) + .enableTls(true) + .allowTlsInsecureConnection(false) + .proxyServiceUrl(proxyUrl, ProxyProtocol.SNI) + .operationTimeout(1000, TimeUnit.MILLISECONDS); + +Map authParams = new HashMap(); +authParams.put("tlsCertFile", TLS_CLIENT_CERT_FILE_PATH); +authParams.put("tlsKeyFile", TLS_CLIENT_KEY_FILE_PATH); +clientBuilder.authentication(AuthenticationTls.class.getName(), authParams); + +PulsarClient pulsarClient = clientBuilder.build(); + +``` + + + + +```c++ + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/cacert.pem"); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create( + "/path/to/client-cert.pem", "/path/to/client-key.pem")); + +Client client("pulsar+ssl://ats-proxy:443", config); + +``` + + + + +```python + +from pulsar import Client, AuthenticationTLS + +auth = AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://ats-proxy:443", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) + +``` + + + + +```` + +### Pulsar geo-replication with SNI routing +You can use the ATS proxy for geo-replication. Pulsar brokers can connect to brokers in geo-replication by using SNI routing. To enable SNI routing for broker connections across clusters, you need to configure the SNI proxy URL in the cluster metadata. If you have configured the SNI proxy URL in the cluster metadata, you can connect to brokers across clusters through the proxy over SNI routing. 
+ +![Pulsar client SNI](/assets/pulsar-sni-geo.png) + +In this example, a Pulsar cluster is deployed into two separate regions, `us-west` and `us-east`. Both regions are configured with ATS proxy, and brokers in each region run behind the ATS proxy. We configure the cluster metadata for both clusters, so brokers in one cluster can use SNI routing and connect to brokers in other clusters through the ATS proxy. + +(a) Configure the cluster metadata for `us-east` with `us-east` broker service URL and `us-east` ATS proxy URL with SNI proxy-protocol. + +``` + +./pulsar-admin clusters update \ +--broker-url-secure pulsar+ssl://east-broker-vip:6651 \ +--url http://east-broker-vip:8080 \ +--proxy-protocol SNI \ +--proxy-url pulsar+ssl://east-ats-proxy:443 + +``` + +(b) Configure the cluster metadata for `us-west` with `us-west` broker service URL and `us-west` ATS proxy URL with SNI proxy-protocol. + +``` + +./pulsar-admin clusters update \ +--broker-url-secure pulsar+ssl://west-broker-vip:6651 \ +--url http://west-broker-vip:8080 \ +--proxy-protocol SNI \ +--proxy-url pulsar+ssl://west-ats-proxy:443 + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-replication.md b/site2/website/versioned_docs/version-2.9.x/concepts-replication.md new file mode 100644 index 0000000000000..799f0eb4d92c6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-replication.md @@ -0,0 +1,9 @@ +--- +id: concepts-replication +title: Geo Replication +sidebar_label: "Geo Replication" +original_id: concepts-replication +--- + +Pulsar enables messages to be produced and consumed in different geo-locations. For instance, your application may be publishing data in one region or market and you would like to process it for consumption in other regions or markets. [Geo-replication](administration-geo.md) in Pulsar enables you to do that. 
+ diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-tiered-storage.md b/site2/website/versioned_docs/version-2.9.x/concepts-tiered-storage.md new file mode 100644 index 0000000000000..f6988e53a8cd4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-tiered-storage.md @@ -0,0 +1,18 @@ +--- +id: concepts-tiered-storage +title: Tiered Storage +sidebar_label: "Tiered Storage" +original_id: concepts-tiered-storage +--- + +Pulsar's segment oriented architecture allows for topic backlogs to grow very large, effectively without limit. However, this can become expensive over time. + +One way to alleviate this cost is to use Tiered Storage. With tiered storage, older messages in the backlog can be moved from BookKeeper to a cheaper storage mechanism, while still allowing clients to access the backlog as if nothing had changed. + +![Tiered Storage](/assets/pulsar-tiered-storage.png) + +> Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. + +Pulsar currently supports S3, Google Cloud Storage (GCS), and filesystem for [long term store](https://pulsar.apache.org/docs/en/cookbooks-tiered-storage/). Offloading to long term storage is triggered via a REST API or command line interface. The user passes in the amount of topic data they wish to retain on BookKeeper, and the broker will copy the backlog data to long term storage. The original data will then be deleted from BookKeeper after a configured delay (4 hours by default). + +> For a guide for setting up tiered storage, see the [Tiered storage cookbook](cookbooks-tiered-storage.md).
diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-topic-compaction.md b/site2/website/versioned_docs/version-2.9.x/concepts-topic-compaction.md new file mode 100644 index 0000000000000..34b7ed7fbbd31 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-topic-compaction.md @@ -0,0 +1,37 @@ +--- +id: concepts-topic-compaction +title: Topic Compaction +sidebar_label: "Topic Compaction" +original_id: concepts-topic-compaction +--- + +Pulsar was built with highly scalable [persistent storage](concepts-architecture-overview.md#persistent-storage) of message data as a primary objective. Pulsar topics enable you to persistently store as many unacknowledged messages as you need while preserving message ordering. By default, Pulsar stores *all* unacknowledged/unprocessed messages produced on a topic. Accumulating many unacknowledged messages on a topic is necessary for many Pulsar use cases but it can also be very time intensive for Pulsar consumers to "rewind" through the entire log of messages. + +> For a more practical guide to topic compaction, see the [Topic compaction cookbook](cookbooks-compaction.md). + +For some use cases consumers don't need a complete "image" of the topic log. They may only need a few values to construct a more "shallow" image of the log, perhaps even just the most recent value. For these kinds of use cases Pulsar offers **topic compaction**. When you run compaction on a topic, Pulsar goes through a topic's backlog and removes messages that are *obscured* by later messages, i.e. it goes through the topic on a per-key basis and leaves only the most recent message associated with that key. + +Pulsar's topic compaction feature: + +* Allows for faster "rewind" through topic logs +* Applies only to [persistent topics](concepts-architecture-overview.md#persistent-storage) +* Triggered automatically when the backlog reaches a certain size or can be triggered manually via the command line. 
See the [Topic compaction cookbook](cookbooks-compaction.md) +* Is conceptually and operationally distinct from [retention and expiry](concepts-messaging.md#message-retention-and-expiry). Topic compaction *does*, however, respect retention. If retention has removed a message from the message backlog of a topic, the message will also not be readable from the compacted topic ledger. + +> #### Topic compaction example: the stock ticker +> An example use case for a compacted Pulsar topic would be a stock ticker topic. On a stock ticker topic, each message bears a timestamped dollar value for stocks for purchase (with the message key holding the stock symbol, e.g. `AAPL` or `GOOG`). With a stock ticker you may care only about the most recent value(s) of the stock and have no interest in historical data (i.e. you don't need to construct a complete image of the topic's sequence of messages per key). Compaction would be highly beneficial in this case because it would keep consumers from needing to rewind through obscured messages. + + +## How topic compaction works + +When topic compaction is triggered [via the CLI](cookbooks-compaction.md), Pulsar will iterate over the entire topic from beginning to end. For each key that it encounters the compaction routine will keep a record of the latest occurrence of that key. + +After that, the broker will create a new [BookKeeper ledger](concepts-architecture-overview.md#ledgers) and make a second iteration through each message on the topic. For each message, if the key matches the latest occurrence of that key, then the key's data payload, message ID, and metadata will be written to the newly created ledger. If the key doesn't match the latest then the message will be skipped and left alone. If any given message has an empty payload, it will be skipped and considered deleted (akin to the concept of [tombstones](https://en.wikipedia.org/wiki/Tombstone_(data_store)) in key-value databases). 
At the end of this second iteration through the topic, the newly created BookKeeper ledger is closed and two things are written to the topic's metadata: the ID of the BookKeeper ledger and the message ID of the last compacted message (this is known as the **compaction horizon** of the topic). Once this metadata is written compaction is complete. + +After the initial compaction operation, the Pulsar [broker](reference-terminology.md#broker) that owns the topic is notified whenever any future changes are made to the compaction horizon and compacted backlog. When such changes occur: + +* Clients (consumers and readers) that have read compacted enabled will attempt to read messages from a topic and either: + * Read from the topic like normal (if the message ID is greater than or equal to the compaction horizon) or + * Read beginning at the compaction horizon (if the message ID is lower than the compaction horizon) + + diff --git a/site2/website/versioned_docs/version-2.9.x/concepts-transactions.md b/site2/website/versioned_docs/version-2.9.x/concepts-transactions.md new file mode 100644 index 0000000000000..08490ba06b5d7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/concepts-transactions.md @@ -0,0 +1,30 @@ +--- +id: transactions +title: Transactions +sidebar_label: "Overview" +original_id: transactions +--- + +Transactional semantics enable event streaming applications to consume, process, and produce messages in one atomic operation. In Pulsar, a producer or consumer can work with messages across multiple topics and partitions and ensure those messages are processed as a single unit. + +The following concepts help you understand Pulsar transactions. + +## Transaction coordinator and transaction log +The transaction coordinator maintains the topics and subscriptions that interact in a transaction. When a transaction is committed, the transaction coordinator interacts with the topic owner broker to complete the transaction. 
+ +The transaction coordinator maintains the entire life cycle of transactions, and prevents a transaction from entering an incorrect status. + +The transaction coordinator handles transaction timeout, and ensures that the transaction is aborted after a transaction timeout. + +All the transaction metadata is persisted in the transaction log. The transaction log is backed by a Pulsar topic. After the transaction coordinator crashes, it can restore the transaction metadata from the transaction log. + +## Transaction ID +The transaction ID (TxnID) identifies a unique transaction in Pulsar. The transaction ID is 128-bit. The highest 16 bits are reserved for the ID of the transaction coordinator, and the remaining bits are used for monotonically increasing numbers in each transaction coordinator. It is easy to locate the transaction crash with the TxnID. + +## Transaction buffer +Messages produced within a transaction are stored in the transaction buffer. The messages in the transaction buffer are not materialized (visible) to consumers until the transactions are committed. The messages in the transaction buffer are discarded when the transactions are aborted. + +## Pending acknowledge state +Message acknowledgements within a transaction are maintained by the pending acknowledge state before the transaction completes. If a message is in the pending acknowledge state, the message cannot be acknowledged by other transactions until the message is removed from the pending acknowledge state. + +The pending acknowledge state is persisted to the pending acknowledge log. The pending acknowledge log is backed by a Pulsar topic. A new broker can restore the state from the pending acknowledge log to ensure the acknowledgement is not lost.
diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-bookkeepermetadata.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-bookkeepermetadata.md new file mode 100644 index 0000000000000..b0fa98dc3b65d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-bookkeepermetadata.md @@ -0,0 +1,21 @@ +--- +id: cookbooks-bookkeepermetadata +title: BookKeeper Ledger Metadata +original_id: cookbooks-bookkeepermetadata +--- + +Pulsar stores data on BookKeeper ledgers, you can understand the contents of a ledger by inspecting the metadata attached to the ledger. +Such metadata are stored on ZooKeeper and they are readable using BookKeeper APIs. + +Description of current metadata: + +| Scope | Metadata name | Metadata value | +| ------------- | ------------- | ------------- | +| All ledgers | application | 'pulsar' | +| All ledgers | component | 'managed-ledger', 'schema', 'compacted-topic' | +| Managed ledgers | pulsar/managed-ledger | name of the ledger | +| Cursor | pulsar/cursor | name of the cursor | +| Compacted topic | pulsar/compactedTopic | name of the original topic | +| Compacted topic | pulsar/compactedTo | id of the last compacted message | + + diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-compaction.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-compaction.md new file mode 100644 index 0000000000000..dfa314727241a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-compaction.md @@ -0,0 +1,142 @@ +--- +id: cookbooks-compaction +title: Topic compaction +sidebar_label: "Topic compaction" +original_id: cookbooks-compaction +--- + +Pulsar's [topic compaction](concepts-topic-compaction.md#compaction) feature enables you to create **compacted** topics in which older, "obscured" entries are pruned from the topic, allowing for faster reads through the topic's history (which messages are deemed obscured/outdated/irrelevant will depend on your use case). 
+ +To use compaction: + +* You need to give messages keys, as topic compaction in Pulsar takes place on a *per-key basis* (i.e. messages are compacted based on their key). For a stock ticker use case, the stock symbol---e.g. `AAPL` or `GOOG`---could serve as the key (more on this [below](#when-should-i-use-compacted-topics)). Messages without keys will be left alone by the compaction process. +* Compaction can be configured to run [automatically](#configuring-compaction-to-run-automatically), or you can manually [trigger](#triggering-compaction-manually) compaction using the Pulsar administrative API. +* Your consumers must be [configured](#consumer-configuration) to read from compacted topics ([Java consumers](#java), for example, have a `readCompacted` setting that must be set to `true`). If this configuration is not set, consumers will still be able to read from the non-compacted topic. + + +> Compaction only works on messages that have keys (as in the stock ticker example the stock symbol serves as the key for each message). Keys can thus be thought of as the axis along which compaction is applied. Messages that don't have keys are simply ignored by compaction. + +## When should I use compacted topics? + +The classic example of a topic that could benefit from compaction would be a stock ticker topic through which consumers can access up-to-date values for specific stocks. Imagine a scenario in which messages carrying stock value data use the stock symbol as the key (`GOOG`, `AAPL`, `TWTR`, etc.). Compacting this topic would give consumers on the topic two options: + +* They can read from the "original," non-compacted topic in case they need access to "historical" values, i.e. the entirety of the topic's messages. +* They can read from the compacted topic if they only want to see the most up-to-date messages. 
+ +Thus, if you're using a Pulsar topic called `stock-values`, some consumers could have access to all messages in the topic (perhaps because they're performing some kind of number crunching of all values in the last hour) while the consumers used to power the real-time stock ticker only see the compacted topic (and thus aren't forced to process outdated messages). Which variant of the topic any given consumer pulls messages from is determined by the consumer's [configuration](#consumer-configuration). + +> One of the benefits of compaction in Pulsar is that you aren't forced to choose between compacted and non-compacted topics, as the compaction process leaves the original topic as-is and essentially adds an alternate topic. In other words, you can run compaction on a topic and consumers that need access to the non-compacted version of the topic will not be adversely affected. + + +## Configuring compaction to run automatically + +Tenant administrators can configure a policy for compaction at the namespace level. The policy specifies how large the topic backlog can grow before compaction is triggered. + +For example, to trigger compaction when the backlog reaches 100MB: + +```bash + +$ bin/pulsar-admin namespaces set-compaction-threshold \ + --threshold 100M my-tenant/my-namespace + +``` + +Configuring the compaction threshold on a namespace will apply to all topics within that namespace. + +## Triggering compaction manually + +In order to run compaction on a topic, you need to use the [`topics compact`](reference-pulsar-admin.md#topics-compact) command for the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. Here's an example: + +```bash + +$ bin/pulsar-admin topics compact \ + persistent://my-tenant/my-namespace/my-topic + +``` + +The `pulsar-admin` tool runs compaction via the Pulsar {@inject: rest:REST:/} API. To run compaction in its own dedicated process, i.e. 
*not* through the REST API, you can use the [`pulsar compact-topic`](reference-cli-tools.md#pulsar-compact-topic) command. Here's an example: + +```bash + +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic + +``` + +> Running compaction in its own process is recommended when you want to avoid interfering with the broker's performance. Broker performance should only be affected, however, when running compaction on topics with a large keyspace (i.e., when there are many keys on the topic). The first phase of the compaction process keeps a copy of each key in the topic, which can create memory pressure as the number of keys grows. Using the `pulsar-admin topics compact` command to run compaction through the REST API should present no issues in the overwhelming majority of cases; using `pulsar compact-topic` should correspondingly be considered an edge case. + +The `pulsar compact-topic` command communicates with [ZooKeeper](https://zookeeper.apache.org) directly. In order to establish communication with ZooKeeper, though, the `pulsar` CLI tool will need to have a valid [broker configuration](reference-configuration.md#broker). You can either supply a proper configuration in `conf/broker.conf` or specify a non-default location for the configuration: + +```bash + +$ bin/pulsar compact-topic \ + --broker-conf /path/to/broker.conf \ + --topic persistent://my-tenant/my-namespace/my-topic + +# If the configuration is in conf/broker.conf +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic + +``` + +#### When should I trigger compaction? + +How often you [trigger compaction](#triggering-compaction-manually) will vary widely based on the use case. If you want a compacted topic to be extremely speedy on read, then you should run compaction fairly frequently. + +## Consumer configuration + +Pulsar consumers and readers need to be configured to read from compacted topics.
The sections below show you how to enable compacted topic reads for Pulsar's language clients. + +### Java + +In order to read from a compacted topic using a Java consumer, the `readCompacted` parameter must be set to `true`. Here's an example consumer for a compacted topic: + +```java + +Consumer compactedTopicConsumer = client.newConsumer() + .topic("some-compacted-topic") + .readCompacted(true) + .subscribe(); + +``` + +As mentioned above, topic compaction in Pulsar works on a *per-key basis*. That means that messages that you produce on compacted topics need to have keys (the content of the key will depend on your use case). Messages that don't have keys will be ignored by the compaction process. Here's an example Pulsar message with a key: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); + +``` + +The example below shows a message with a key being produced on a compacted Pulsar topic: + +```java + +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer compactedTopicProducer = client.newProducer() + .topic("some-compacted-topic") + .create(); + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); + +compactedTopicProducer.send(msg); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-deduplication.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-deduplication.md new file mode 100644 index 0000000000000..f7f9e3d7bb425 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-deduplication.md @@ -0,0 +1,151 @@ +--- +id: cookbooks-deduplication +title: 
Message deduplication +sidebar_label: "Message deduplication" +original_id: cookbooks-deduplication +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +When **Message deduplication** is enabled, it ensures that each message produced on Pulsar topics is persisted to disk *only once*, even if the message is produced more than once. Message deduplication is handled automatically on the server side. + +To use message deduplication in Pulsar, you need to configure your Pulsar brokers and clients. + +## How it works + +You can enable or disable message deduplication at the namespace level or the topic level. By default, it is disabled on all namespaces or topics. You can enable it in the following ways: + +* Enable deduplication for all namespaces/topics at the broker-level. +* Enable deduplication for a specific namespace with the `pulsar-admin namespaces` interface. +* Enable deduplication for a specific topic with the `pulsar-admin topics` interface. + +## Configure message deduplication + +You can configure message deduplication in Pulsar using the [`broker.conf`](reference-configuration.md#broker) configuration file. The following deduplication-related parameters are available. + +Parameter | Description | Default +:---------|:------------|:------- +`brokerDeduplicationEnabled` | Sets the default behavior for message deduplication in the Pulsar broker. If it is set to `true`, message deduplication is enabled on all namespaces/topics. If it is set to `false`, you have to enable or disable deduplication at the namespace level or the topic level. | `false` +`brokerDeduplicationMaxNumberOfProducers` | The maximum number of producers for which information is stored for deduplication purposes. | `10000` +`brokerDeduplicationEntriesInterval` | The number of entries after which a deduplication informational snapshot is taken. 
A larger interval leads to fewer snapshots being taken, though this lengthens the topic recovery time (the time required for entries published after the snapshot to be replayed). | `1000` +`brokerDeduplicationSnapshotIntervalSeconds`| The time period after which a deduplication informational snapshot is taken. It runs simultaneously with `brokerDeduplicationEntriesInterval`. |`120` +`brokerDeduplicationProducerInactivityTimeoutMinutes` | The time of inactivity (in minutes) after which the broker discards deduplication information related to a disconnected producer. | `360` (6 hours) + +### Set default value at the broker-level + +By default, message deduplication is *disabled* on all Pulsar namespaces/topics. To enable it on all namespaces/topics, set the `brokerDeduplicationEnabled` parameter to `true` and re-start the broker. + +Even if you set the value for `brokerDeduplicationEnabled`, enabling or disabling via Pulsar admin CLI overrides the default settings at the broker-level. + +### Enable message deduplication + +Though message deduplication is disabled by default at the broker level, you can enable message deduplication for a specific namespace or topic using the [`pulsar-admin namespaces set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) or the [`pulsar-admin topics set-deduplication`](reference-pulsar-admin.md#topic-set-deduplication) command. You can use the `--enable`/`-e` flag and specify the namespace/topic. + +The following example shows how to enable message deduplication at the namespace level. 
+ +```bash + +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --enable # or just -e + +``` + +### Disable message deduplication + +Even if you enable message deduplication at the broker level, you can disable message deduplication for a specific namespace or topic using the [`pulsar-admin namespaces set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) or the [`pulsar-admin topics set-deduplication`](reference-pulsar-admin.md#topic-set-deduplication) command. Use the `--disable`/`-d` flag and specify the namespace/topic. + +The following example shows how to disable message deduplication at the namespace level. + +```bash + +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --disable # or just -d + +``` + +## Pulsar clients + +If you enable message deduplication in Pulsar brokers, you need to complete the following tasks for your client producers: + +1. Specify a name for the producer. +1. Set the message timeout to `0` (namely, no timeout). + +The instructions for Java, Python, and C++ clients are different. + +````mdx-code-block + + + +To enable message deduplication on a [Java producer](client-libraries-java.md#producers), set the producer name using the `producerName` setter, and set the timeout to `0` using the `sendTimeout` setter. + +```java + +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; +import java.util.concurrent.TimeUnit; + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +Producer producer = pulsarClient.newProducer() + .producerName("producer-1") + .topic("persistent://public/default/topic-1") + .sendTimeout(0, TimeUnit.SECONDS) + .create(); + +``` + + + + +To enable message deduplication on a [Python producer](client-libraries-python.md#producers), set the producer name using `producer_name`, and set the timeout to `0` using `send_timeout_millis`.
+ +```python + +import pulsar + +client = pulsar.Client("pulsar://localhost:6650") +producer = client.create_producer( + "persistent://public/default/topic-1", + producer_name="producer-1", + send_timeout_millis=0) + +``` + + + + +To enable message deduplication on a [C++ producer](client-libraries-cpp.md#producer), set the producer name using `setProducerName`, and set the timeout to `0` using `setSendTimeout`. + +```cpp + +#include <pulsar/Client.h> + +std::string serviceUrl = "pulsar://localhost:6650"; +std::string topic = "persistent://some-tenant/ns1/topic-1"; +std::string producerName = "producer-1"; + +Client client(serviceUrl); + +ProducerConfiguration producerConfig; +producerConfig.setSendTimeout(0); +producerConfig.setProducerName(producerName); + +Producer producer; + +Result result = client.createProducer(topic, producerConfig, producer); + +``` + + + + +```` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-encryption.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-encryption.md new file mode 100644 index 0000000000000..f0d8fb8735eb6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-encryption.md @@ -0,0 +1,184 @@ +--- +id: cookbooks-encryption +title: Pulsar Encryption +sidebar_label: "Encryption" +original_id: cookbooks-encryption +--- + +Pulsar encryption allows applications to encrypt messages at the producer and decrypt at the consumer. Encryption is performed using the public/private key pair configured by the application. Encrypted messages can only be decrypted by consumers with a valid key. + +## Asymmetric and symmetric encryption + +Pulsar uses a dynamically generated symmetric AES key to encrypt messages (data). The AES key (data key) is encrypted using an application-provided ECDSA/RSA key pair; as a result, there is no need to share the secret with everyone. + +Key is a public/private key pair used for encryption/decryption.
The producer key is the public key, and the consumer key is the private key of the key pair. + +The application configures the producer with the public key. This key is used to encrypt the AES data key. The encrypted data key is sent as part of the message header. Only entities with the private key (in this case, the consumer) will be able to decrypt the data key which is used to decrypt the message. + +A message can be encrypted with more than one key. Any one of the keys used for encrypting the message is sufficient to decrypt the message. + +Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose/delete the private key, your message is irretrievably lost, and is unrecoverable. + +## Producer +![alt text](/assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer") + +## Consumer +![alt text](/assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer") + +## Here are the steps to get started: + +1. Create your ECDSA or RSA public/private key pair. + +```shell + +openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem +openssl ec -in test_ecdsa_privkey.pem -pubout -outform pkcs8 -out test_ecdsa_pubkey.pem + +``` + +2. Add the public and private key to the key management and configure your producers to retrieve public keys and consumer clients to retrieve private keys. +3. Implement CryptoKeyReader::getPublicKey() interface from producer and CryptoKeyReader::getPrivateKey() interface from consumer, which will be invoked by Pulsar client to load the key. +4. Add encryption key to producer configuration: conf.addEncryptionKey("myapp.key") +5. Add CryptoKeyReader implementation to producer/consumer config: conf.setCryptoKeyReader(keyReader) +6.
Sample producer application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); + +ProducerConfiguration prodConf = new ProducerConfiguration(); +prodConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +prodConf.addEncryptionKey("myappkey"); + +Producer producer = pulsarClient.createProducer("persistent://my-tenant/my-ns/my-topic", prodConf); + +for (int i = 0; i < 10; i++) { + producer.send("my-message".getBytes()); +} + +pulsarClient.close(); + +``` + +7. 
Sample Consumer Application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +ConsumerConfiguration consConf = new ConsumerConfiguration(); +consConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); +Consumer consumer = pulsarClient.subscribe("persistent://my-tenant/my-ns/my-topic", "my-subscriber-name", consConf); +Message msg = null; + +for (int i = 0; i < 10; i++) { + msg = consumer.receive(); + // do something + System.out.println("Received: " + new String(msg.getData())); +} + +// Acknowledge the consumption of all messages at once +consumer.acknowledgeCumulative(msg); +pulsarClient.close(); + +``` + +## Key rotation +Pulsar generates a new AES data key every 4 hours or after a certain number of messages are published. The asymmetric public key is automatically fetched by the producer every 4 hours by calling CryptoKeyReader::getPublicKey() to retrieve the latest version.
+ +## Enabling encryption at the producer application: +If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. This can be done in two ways: +1. The consumer application provides you access to their public key, which you add to your producer keys +1. You grant access to one of the private keys from the pairs used by the producer + +In some cases, the producer may want to encrypt the messages with multiple keys. For this, add all such keys to the config. The consumer will be able to decrypt the message, as long as it has access to at least one of the keys. + +E.g: If messages need to be encrypted using 2 keys myapp.messagekey1 and myapp.messagekey2, + +```java + +conf.addEncryptionKey("myapp.messagekey1"); +conf.addEncryptionKey("myapp.messagekey2"); + +``` + +## Decrypting encrypted messages at the consumer application: +Consumers require access to one of the private keys to decrypt messages produced by the producer. If you would like to receive encrypted messages, create a public/private key pair and give your public key to the producer application to encrypt messages using your public key. + +## Handling Failures: +* Producer/Consumer loses access to the key + * Producer action will fail indicating the cause of the failure. The application has the option to proceed with sending unencrypted messages in such cases. Call conf.setCryptoFailureAction(ProducerCryptoFailureAction) to control the producer behavior. The default behavior is to fail the request. + * If consumption failed due to decryption failure or missing keys in the consumer, the application has the option to consume the encrypted message or discard it. Call conf.setCryptoFailureAction(ConsumerCryptoFailureAction) to control the consumer behavior. The default behavior is to fail the request. +The application will never be able to decrypt the messages if the private key is permanently lost.
+* Batch messaging + * If decryption fails and the message contains batch messages, client will not be able to retrieve individual messages in the batch, hence message consumption fails even if conf.setCryptoFailureAction() is set to CONSUME. +* If decryption fails, the message consumption stops and application will notice backlog growth in addition to decryption failure messages in the client log. If application does not have access to the private key to decrypt the message, the only option is to skip/discard backlogged messages. + diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-message-queue.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-message-queue.md new file mode 100644 index 0000000000000..eb43cbde5fb81 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-message-queue.md @@ -0,0 +1,127 @@ +--- +id: cookbooks-message-queue +title: Using Pulsar as a message queue +sidebar_label: "Message queue" +original_id: cookbooks-message-queue +--- + +Message queues are essential components of many large-scale data architectures. If every single work object that passes through your system absolutely *must* be processed in spite of the slowness or downright failure of this or that system component, there's a good chance that you'll need a message queue to step in and ensure that unprocessed data is retained---with correct ordering---until the required actions are taken. + +Pulsar is a great choice for a message queue because: + +* it was built with [persistent message storage](concepts-architecture-overview.md#persistent-storage) in mind +* it offers automatic load balancing across [consumers](reference-terminology.md#consumer) for messages on a topic (or custom load balancing if you wish) + +> You can use the same Pulsar installation to act as a real-time message bus and as a message queue if you wish (or just one or the other). 
You can set aside some topics for real-time purposes and other topics for message queue purposes (or use specific namespaces for either purpose if you wish). + + +# Client configuration changes + +To use a Pulsar [topic](reference-terminology.md#topic) as a message queue, you should distribute the receiver load on that topic across several consumers (the optimal number of consumers will depend on the load). Each consumer must: + +* Establish a [shared subscription](concepts-messaging.md#shared) and use the same subscription name as the other consumers (otherwise the subscription is not shared and the consumers can't act as a processing ensemble) +* If you'd like to have tight control over message dispatching across consumers, set the consumers' **receiver queue** size very low (potentially even to 0 if necessary). Each Pulsar [consumer](reference-terminology.md#consumer) has a receiver queue that determines how many messages the consumer will attempt to fetch at a time. A receiver queue of 1000 (the default), for example, means that the consumer will attempt to process 1000 messages from the topic's backlog upon connection. Setting the receiver queue to zero essentially means ensuring that each consumer is only doing one thing at a time. + + The downside to restricting the receiver queue size of consumers is that it limits the potential throughput of those consumers and cannot be used with [partitioned topics](reference-terminology.md#partitioned-topic). Whether the performance/control trade-off is worthwhile will depend on your use case. 
+ +## Java clients + +Here's an example Java consumer configuration that uses a shared subscription: + +```java + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; +import org.apache.pulsar.client.api.SubscriptionType; + +String SERVICE_URL = "pulsar://localhost:6650"; +String TOPIC = "persistent://public/default/mq-topic-1"; +String subscription = "sub-1"; + +PulsarClient client = PulsarClient.builder() + .serviceUrl(SERVICE_URL) + .build(); + +Consumer consumer = client.newConsumer() + .topic(TOPIC) + .subscriptionName(subscription) + .subscriptionType(SubscriptionType.Shared) + // If you'd like to restrict the receiver queue size + .receiverQueueSize(10) + .subscribe(); + +``` + +## Python clients + +Here's an example Python consumer configuration that uses a shared subscription: + +```python + +from pulsar import Client, ConsumerType + +SERVICE_URL = "pulsar://localhost:6650" +TOPIC = "persistent://public/default/mq-topic-1" +SUBSCRIPTION = "sub-1" + +client = Client(SERVICE_URL) +consumer = client.subscribe( + TOPIC, + SUBSCRIPTION, + # If you'd like to restrict the receiver queue size + receiver_queue_size=10, + consumer_type=ConsumerType.Shared) + +``` + +## C++ clients + +Here's an example C++ consumer configuration that uses a shared subscription: + +```cpp + +#include <pulsar/Client.h> + +std::string serviceUrl = "pulsar://localhost:6650"; +std::string topic = "persistent://public/default/mq-topic-1"; +std::string subscription = "sub-1"; + +Client client(serviceUrl); + +ConsumerConfiguration consumerConfig; +consumerConfig.setConsumerType(ConsumerType.ConsumerShared); +// If you'd like to restrict the receiver queue size +consumerConfig.setReceiverQueueSize(10); + +Consumer consumer; + +Result result = client.subscribe(topic, subscription, consumerConfig, consumer); + +``` + +## Go clients + +Here is an example of a Go consumer configuration that uses a shared subscription: + +```go + +import 
"github.com/apache/pulsar-client-go/pulsar" + +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", +}) +if err != nil { + log.Fatal(err) +} +consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "persistent://public/default/mq-topic-1", + SubscriptionName: "sub-1", + Type: pulsar.Shared, + ReceiverQueueSize: 10, // If you'd like to restrict the receiver queue size +}) +if err != nil { + log.Fatal(err) +} + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-non-persistent.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-non-persistent.md new file mode 100644 index 0000000000000..178301e86eb8d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-non-persistent.md @@ -0,0 +1,63 @@ +--- +id: cookbooks-non-persistent +title: Non-persistent messaging +sidebar_label: "Non-persistent messaging" +original_id: cookbooks-non-persistent +--- + +**Non-persistent topics** are Pulsar topics in which message data is *never* [persistently stored](concepts-architecture-overview.md#persistent-storage) and kept only in memory. This cookbook provides: + +* A basic [conceptual overview](#overview) of non-persistent topics +* Information about [configurable parameters](#configuration) related to non-persistent topics +* A guide to the [CLI interface](#cli) for managing non-persistent topics + +## Overview + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. 
When using non-persistent delivery, killing a Pulsar [broker](reference-terminology.md#broker) or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http + +non-persistent://tenant/namespace/topic + +``` + +> For more high-level information about non-persistent topics, see the [Concepts and Architecture](concepts-messaging.md#non-persistent-topics) documentation. + +## Using + +> In order to use non-persistent topics, they must be [enabled](#enabling) in your Pulsar broker configuration. + +In order to use non-persistent topics, you only need to differentiate them by name when interacting with them. This [`pulsar-client produce`](reference-cli-tools.md#pulsar-client-produce) command, for example, would produce one message on a non-persistent topic in a standalone cluster: + +```bash + +$ bin/pulsar-client produce non-persistent://public/default/example-np-topic \ + --num-produce 1 \ + --messages "This message will be stored only in memory" + +``` + +> For a more thorough guide to non-persistent topics from an administrative perspective, see the [Non-persistent topics](admin-api-topics.md) guide. + +## Enabling + +In order to enable non-persistent topics in a Pulsar broker, the [`enableNonPersistentTopics`](reference-configuration.md#broker-enableNonPersistentTopics) must be set to `true`. This is the default, and so you won't need to take any action to enable non-persistent messaging. + + +> #### Configuration for standalone mode +> If you're running Pulsar in standalone mode, the same configurable parameters are available but in the [`standalone.conf`](reference-configuration.md#standalone) configuration file. 
+ +If you'd like to enable *only* non-persistent topics in a broker, you can set the [`enablePersistentTopics`](reference-configuration.md#broker-enablePersistentTopics) parameter to `false` and the `enableNonPersistentTopics` parameter to `true`. + +## Managing with cli + +Non-persistent topics can be managed using the [`pulsar-admin non-persistent`](reference-pulsar-admin.md#non-persistent) command-line interface. With that interface you can perform actions like [create a partitioned non-persistent topic](reference-pulsar-admin.md#non-persistent-create-partitioned-topic), get [stats](reference-pulsar-admin.md#non-persistent-stats) for a non-persistent topic, [list](reference-pulsar-admin.md) non-persistent topics under a namespace, and more. + +## Using with Pulsar clients + +You shouldn't need to make any changes to your Pulsar clients to use non-persistent messaging beyond making sure that you use proper [topic names](#using) with `non-persistent` as the topic type. + diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-partitioned.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-partitioned.md new file mode 100644 index 0000000000000..fb9ac354cc6d6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-partitioned.md @@ -0,0 +1,7 @@ +--- +id: cookbooks-partitioned +title: Partitioned topics +sidebar_label: "Partitioned Topics" +original_id: cookbooks-partitioned +--- +For details of the content, refer to [manage topics](admin-api-topics.md). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-retention-expiry.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-retention-expiry.md new file mode 100644 index 0000000000000..c8c46b3caa1be --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-retention-expiry.md @@ -0,0 +1,498 @@ +--- +id: cookbooks-retention-expiry +title: Message retention and expiry +sidebar_label: "Message retention and expiry" +original_id: cookbooks-retention-expiry +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar brokers are responsible for handling messages that pass through Pulsar, including [persistent storage](concepts-architecture-overview.md#persistent-storage) of messages. By default, for each topic, brokers only retain messages that are in at least one backlog. A backlog is the set of unacknowledged messages for a particular subscription. As a topic can have multiple subscriptions, a topic can have multiple backlogs. + +As a consequence, no messages are retained (by default) on a topic that has not had any subscriptions created for it. + +(Note that messages that are no longer being stored are not necessarily immediately deleted, and may in fact still be accessible until the next ledger rollover. Because clients cannot predict when rollovers may happen, it is not wise to rely on a rollover not happening at an inconvenient point in time.) + +In Pulsar, you can modify this behavior, with namespace granularity, in two ways: + +* You can persistently store messages that are not within a backlog (because they've been acknowledged on every existing subscription, or because there are no subscriptions) by setting [retention policies](#retention-policies). +* Messages that are not acknowledged within a specified timeframe can be automatically acknowledged, by specifying the [time to live](#time-to-live-ttl) (TTL). 
+ +Pulsar's [admin interface](admin-api-overview.md) enables you to manage both retention policies and TTL with namespace granularity (and thus within a specific tenant and either on a specific cluster or in the [`global`](concepts-architecture-overview.md#global-cluster) cluster). + + +> #### Retention and TTL solve two different problems +> * Message retention: Keep the data for at least X hours (even if acknowledged) +> * Time-to-live: Discard data after some time (by automatically acknowledging) +> +> Most applications will want to use at most one of these. + + +## Retention policies + +By default, when a Pulsar message arrives at a broker, the message is stored until it has been acknowledged on all subscriptions, at which point it is marked for deletion. You can override this behavior and retain messages that have already been acknowledged on all subscriptions by setting a *retention policy* for all topics in a given namespace. Retention is based on both a *size limit* and a *time limit*. + +Retention policies are useful when you use the Reader interface. The Reader interface does not use acknowledgements, and messages do not exist within backlogs. It is required to configure retention for Reader-only use cases. + +When you set a retention policy on topics in a namespace, you must set **both** a *size limit* and a *time limit*. You can refer to the following table to set retention policies in `pulsar-admin` and Java. + +|Time limit|Size limit| Message retention | +|----------|----------|------------------------| +| -1 | -1 | Infinite retention | +| -1 | >0 | Based on the size limit | +| >0 | -1 | Based on the time limit | +| 0 | 0 | Disable message retention (by default) | +| 0 | >0 | Invalid | +| >0 | 0 | Invalid | +| >0 | >0 | Acknowledged messages or messages with no active subscription will not be retained when either time or size reaches the limit. 
| + +The retention settings apply to all messages on topics that do not have any subscriptions, or to messages that have been acknowledged by all subscriptions. The retention policy settings do not affect unacknowledged messages on topics with subscriptions. The unacknowledged messages are controlled by the backlog quota. + +When a retention limit on a topic is exceeded, the oldest message is marked for deletion until the set of retained messages falls within the specified limits again. + +### Defaults + +You can set message retention at instance level with the following two parameters: `defaultRetentionTimeInMinutes` and `defaultRetentionSizeInMB`. Both parameters are set to `0` by default. + +For more information of the two parameters, refer to the [`broker.conf`](reference-configuration.md#broker) configuration file. + +### Set retention policy + +You can set a retention policy for a namespace by specifying the namespace, a size limit and a time limit in `pulsar-admin`, REST API and Java. + +````mdx-code-block + + + +You can use the [`set-retention`](reference-pulsar-admin.md#namespaces-set-retention) subcommand and specify a namespace, a size limit using the `-s`/`--size` flag, and a time limit using the `-t`/`--time` flag. + +In the following example, the size limit is set to 10 GB and the time limit is set to 3 hours for each topic within the `my-tenant/my-ns` namespace. +- When the size of messages reaches 10 GB on a topic within 3 hours, the acknowledged messages will not be retained. +- After 3 hours, even if the message size is less than 10 GB, the acknowledged messages will not be retained. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 10G \ + --time 3h + +``` + +In the following example, the time is not limited and the size limit is set to 1 TB. The size limit determines the retention. 
+ +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 1T \ + --time -1 + +``` + +In the following example, the size is not limited and the time limit is set to 3 hours. The time limit determines the retention. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size -1 \ + --time 3h + +``` + +To achieve infinite retention, set both values to `-1`. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size -1 \ + --time -1 + +``` + +To disable the retention policy, set both values to `0`. + +```shell + +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 0 \ + --time 0 + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/retention|operation/setRetention?version=@pulsar:version_number@} + +:::note + +To disable the retention policy, you need to set both the size and time limit to `0`. Set either size or time limit to `0` is invalid. + +::: + + + + +```java + +int retentionTime = 10; // 10 minutes +int retentionSize = 500; // 500 megabytes +RetentionPolicies policies = new RetentionPolicies(retentionTime, retentionSize); +admin.namespaces().setRetention(namespace, policies); + +``` + + + + +```` + +### Get retention policy + +You can fetch the retention policy for a namespace by specifying the namespace. The output will be a JSON object with two keys: `retentionTimeInMinutes` and `retentionSizeInMB`. + +````mdx-code-block + + + +Use the [`get-retention`](reference-pulsar-admin.md#namespaces) subcommand and specify the namespace. 
+ +##### Example + +```shell + +$ pulsar-admin namespaces get-retention my-tenant/my-ns +{ + "retentionTimeInMinutes": 10, + "retentionSizeInMB": 500 +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/retention|operation/getRetention?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getRetention(namespace); + +``` + + + + +```` + +## Backlog quotas + +*Backlogs* are sets of unacknowledged messages for a topic that have been stored by bookies. Pulsar stores all unacknowledged messages in backlogs until they are processed and acknowledged. + +You can control the allowable size of backlogs, at the namespace level, using *backlog quotas*. Setting a backlog quota involves setting: + +TODO: Expand on is this per backlog or per topic? + +* an allowable *size threshold* for each topic in the namespace +* a *retention policy* that determines which action the [broker](reference-terminology.md#broker) takes if the threshold is exceeded. + +The following retention policies are available: + +Policy | Action +:------|:------ +`producer_request_hold` | The broker will hold and not persist produce request payload +`producer_exception` | The broker will disconnect from the client by throwing an exception +`consumer_backlog_eviction` | The broker will begin discarding backlog messages + + +> #### Beware the distinction between retention policy types +> As you may have noticed, there are two definitions of the term "retention policy" in Pulsar, one that applies to persistent storage of messages not in backlogs, and one that applies to messages within backlogs. + + +Backlog quotas are handled at the namespace level. 
They can be managed via: + +### Set size/time thresholds and backlog retention policies + +You can set a size and/or time threshold and backlog retention policy for all of the topics in a [namespace](reference-terminology.md#namespace) by specifying the namespace, a size limit and/or a time limit in seconds, and a policy by name. + +````mdx-code-block + + + +Use the [`set-backlog-quota`](reference-pulsar-admin.md#namespaces) subcommand and specify a namespace, a size limit using the `-l`/`--limit` flag or a time limit using the `-lt`/`--limitTime` flag, a retention policy using the `-p`/`--policy` flag, and a policy type using the `-t`/`--type` flag (default is destination_storage). + +##### Example + +```shell + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ + --limit 2G \ + --policy producer_request_hold + +``` + +```shell + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns/my-topic \ +--limitTime 3600 \ +--policy producer_request_hold \ +--type message_age + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/setBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +long sizeLimit = 2147483648L; +BacklogQuota.RetentionPolicy policy = BacklogQuota.RetentionPolicy.producer_request_hold; +BacklogQuota quota = new BacklogQuota(sizeLimit, policy); +admin.namespaces().setBacklogQuota(namespace, quota); + +``` + + + + +```` + +### Get backlog threshold and backlog retention policy + +You can see which size threshold and backlog retention policy has been applied to a namespace. + +````mdx-code-block + + + +Use the [`get-backlog-quotas`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-backlog-quotas) subcommand and specify a namespace. 
Here's an example: + +```shell + +$ pulsar-admin namespaces get-backlog-quotas my-tenant/my-ns +{ + "destination_storage": { + "limit" : 2147483648, + "policy" : "producer_request_hold" + } +} + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap|operation/getBacklogQuotaMap?version=@pulsar:version_number@} + + + + +```java + +Map quotas = + admin.namespaces().getBacklogQuotas(namespace); + +``` + + + + +```` + +### Remove backlog quotas + +````mdx-code-block + + + +Use the [`remove-backlog-quota`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-backlog-quota) subcommand and specify a namespace, use `t`/`--type` to specify backlog type to remove(default is destination_storage). Here's an example: + +```shell + +$ pulsar-admin namespaces remove-backlog-quota my-tenant/my-ns + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/removeBacklogQuota?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeBacklogQuota(namespace); + +``` + + + + +```` + +### Clear backlog + +#### pulsar-admin + +Use the [`clear-backlog`](reference-pulsar-admin.md#pulsar-admin-namespaces-clear-backlog) subcommand. + +##### Example + +```shell + +$ pulsar-admin namespaces clear-backlog my-tenant/my-ns + +``` + +By default, you will be prompted to ensure that you really want to clear the backlog for the namespace. You can override the prompt using the `-f`/`--force` flag. + +## Time to live (TTL) + +By default, Pulsar stores all unacknowledged messages forever. This can lead to heavy disk space usage in cases where a lot of messages are going unacknowledged. If disk space is a concern, you can set a time to live (TTL) that determines how long unacknowledged messages will be retained. 
+ +### Set the TTL for a namespace + +````mdx-code-block + + + +Use the [`set-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-set-message-ttl) subcommand and specify a namespace and a TTL (in seconds) using the `-ttl`/`--messageTTL` flag. + +##### Example + +```shell + +$ pulsar-admin namespaces set-message-ttl my-tenant/my-ns \ + --messageTTL 120 # TTL of 2 minutes + +``` + + + + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/setNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().setNamespaceMessageTTL(namespace, ttlInSeconds); + +``` + + + + +```` + +### Get the TTL configuration for a namespace + +````mdx-code-block + + + +Use the [`get-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-message-ttl) subcommand and specify a namespace. + +##### Example + +```shell + +$ pulsar-admin namespaces get-message-ttl my-tenant/my-ns +60 + +``` + + + + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/getNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().getNamespaceMessageTTL(namespace) + +``` + + + + +```` + +### Remove the TTL configuration for a namespace + +````mdx-code-block + + + +Use the [`remove-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-message-ttl) subcommand and specify a namespace. 
+ +##### Example + +```shell + +$ pulsar-admin namespaces remove-message-ttl my-tenant/my-ns + +``` + + + + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/removeNamespaceMessageTTL?version=@pulsar:version_number@} + + + + +```java + +admin.namespaces().removeNamespaceMessageTTL(namespace) + +``` + + + + +```` + +## Delete messages from namespaces + +If you do not have any retention period and you never have much of a backlog, the upper limit for retaining messages, which are acknowledged, equals the Pulsar segment rollover period + entry log rollover period + (garbage collection interval * garbage collection ratios). + +- **Segment rollover period**: basically, the segment rollover period is how often a new segment is created. Once a new segment is created, the old segment will be deleted. By default, this happens either when you have written 50,000 entries (messages) or have waited 240 minutes. You can tune this in your broker. + +- **Entry log rollover period**: multiple ledgers in BookKeeper are interleaved into an [entry log](https://bookkeeper.apache.org/docs/4.11.1/getting-started/concepts/#entry-logs). Before the data of a deleted ledger can be removed, the entry log that contains it must be rolled over. +The entry log rollover period is configurable, but is purely based on the entry log size. For details, see [here](https://bookkeeper.apache.org/docs/4.11.1/reference/config/#entry-log-settings). Once the entry log is rolled over, the entry log can be garbage collected. + +- **Garbage collection interval**: because entry logs have interleaved ledgers, to free up space, the entry logs need to be rewritten. The garbage collection interval is how often BookKeeper performs garbage collection, which is related to minor compaction and major compaction of entry logs. For details, see [here](https://bookkeeper.apache.org/docs/4.11.1/reference/config/#entry-log-compaction-settings). 
diff --git a/site2/website/versioned_docs/version-2.9.x/cookbooks-tiered-storage.md b/site2/website/versioned_docs/version-2.9.x/cookbooks-tiered-storage.md new file mode 100644 index 0000000000000..3f87de62ca8a1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/cookbooks-tiered-storage.md @@ -0,0 +1,346 @@ +--- +id: cookbooks-tiered-storage +title: Tiered Storage +sidebar_label: "Tiered Storage" +original_id: cookbooks-tiered-storage +--- + +Pulsar's **Tiered Storage** feature allows older backlog data to be offloaded to long term storage, thereby freeing up space in BookKeeper and reducing storage costs. This cookbook walks you through using tiered storage in your Pulsar cluster. + +* Tiered storage uses [Apache jclouds](https://jclouds.apache.org) to support [Amazon S3](https://aws.amazon.com/s3/) and [Google Cloud Storage](https://cloud.google.com/storage/)(GCS for short) +for long term storage. With Jclouds, it is easy to add support for more [cloud storage providers](https://jclouds.apache.org/reference/providers/#blobstore-providers) in the future. + +* Tiered storage uses [Apache Hadoop](http://hadoop.apache.org/) to support filesystem for long term storage. +With Hadoop, it is easy to add support for more filesystems in the future. + +## When should I use Tiered Storage? + +Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm you can rerun it against your full user history. + +## The offloading mechanism + +A topic in Pulsar is backed by a log, known as a managed ledger. This log is composed of an ordered list of segments. Pulsar only ever writes to the final segment of the log. All previous segments are sealed. The data within the segment is immutable. 
This is known as a segment oriented architecture. + +![Tiered storage](/assets/pulsar-tiered-storage.png "Tiered Storage") + +The Tiered Storage offloading mechanism takes advantage of this segment oriented architecture. When offloading is requested, the segments of the log are copied, one-by-one, to tiered storage. All segments of the log, apart from the segment currently being written to can be offloaded. + +On the broker, the administrator must configure the bucket and credentials for the cloud storage service. +The configured bucket must exist before attempting to offload. If it does not exist, the offload operation will fail. + +Pulsar uses multi-part objects to upload the segment data. It is possible that a broker could crash while uploading the data. +We recommend you add a life cycle rule to your bucket to expire incomplete multi-part upload after a day or two to avoid +getting charged for incomplete uploads. + +When ledgers are offloaded to long term storage, you can still query data in the offloaded ledgers with Pulsar SQL. + +## Configuring the offload driver + +Offloading is configured in ```broker.conf```. + +At a minimum, the administrator must configure the driver, the bucket and the authenticating credentials. +There are also some other knobs to configure, like the bucket region, the max block size in backed storage, etc. + +Currently we support driver of types: + +- `aws-s3`: [Simple Cloud Storage Service](https://aws.amazon.com/s3/) +- `google-cloud-storage`: [Google Cloud Storage](https://cloud.google.com/storage/) +- `filesystem`: [Filesystem Storage](http://hadoop.apache.org/) + +> Driver names are case-insensitive. There is a third driver type, `s3`, which is identical to `aws-s3`, +> though it requires that you specify an endpoint url using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if +> using an S3-compatible data store, other than AWS. 
+ +```conf + +managedLedgerOffloadDriver=aws-s3 + +``` + +### "aws-s3" Driver configuration + +#### Bucket and Region + +Buckets are the basic containers that hold your data. +Everything that you store in Cloud Storage must be contained in a bucket. +You can use buckets to organize your data and control access to your data, +but unlike directories and folders, you cannot nest buckets. + +```conf + +s3ManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +Bucket Region is the region where the bucket is located. Bucket Region is not a required +but a recommended configuration. If it is not configured, the default region is used. + +With AWS S3, the default region is `US East (N. Virginia)`. Page [AWS Regions and Endpoints](https://docs.aws.amazon.com/general/latest/gr/rande.html) contains more information. + +```conf + +s3ManagedLedgerOffloadRegion=eu-west-3 + +``` + +#### Authentication with AWS + +To be able to access AWS S3, you need to authenticate with AWS S3. +Pulsar does not provide any direct means of configuring authentication for AWS S3, +but relies on the mechanisms supported by the [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). + +Once you have created a set of credentials in the AWS IAM console, they can be configured in a number of ways. + +1. Using ec2 instance metadata credentials + +If you are on AWS instance with an instance profile that provides credentials, Pulsar will use these credentials +if no other mechanism is provided. + +2. Set the environment variables **AWS_ACCESS_KEY_ID** and **AWS_SECRET_ACCESS_KEY** in ```conf/pulsar_env.sh```. + +```bash + +export AWS_ACCESS_KEY_ID=ABC123456789 +export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +> \"export\" is important so that the variables are made available in the environment of spawned processes. + + +3. 
Add the Java system properties *aws.accessKeyId* and *aws.secretKey* to **PULSAR_EXTRA_OPTS** in `conf/pulsar_env.sh`. + +```bash + +PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacityPerThread=4096" + +``` + +4. Set the access credentials in ```~/.aws/credentials```. + +```conf + +[default] +aws_access_key_id=ABC123456789 +aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +5. Assuming an IAM role + +If you want to assume an IAM role, this can be done via specifying the following: + +```conf + +s3ManagedLedgerOffloadRole= +s3ManagedLedgerOffloadRoleSessionName=pulsar-s3-offload + +``` + +This will use the `DefaultAWSCredentialsProviderChain` for assuming this role. + +> The broker must be rebooted for credentials specified in pulsar_env to take effect. + +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to AWS S3. + +- ```s3ManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of + a "part" sent during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```s3ManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for + each individual read when reading back data from AWS S3. Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. + +### "google-cloud-storage" Driver configuration + +Buckets are the basic containers that hold your data. Everything that you store in +Cloud Storage must be contained in a bucket. You can use buckets to organize your data and +control access to your data, but unlike directories and folders, you cannot nest buckets. + +```conf + +gcsManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +Bucket Region is the region where bucket located. 
Bucket Region is not a required but +a recommended configuration. If it is not configured, it will use the default region. + +Regarding GCS, buckets are created by default in the `us multi-regional location`, +page [Bucket Locations](https://cloud.google.com/storage/docs/bucket-locations) contains more information. + +```conf + +gcsManagedLedgerOffloadRegion=europe-west3 + +``` + +#### Authentication with GCS + +The administrator needs to configure `gcsManagedLedgerOffloadServiceAccountKeyFile` in `broker.conf` +for the broker to be able to access the GCS service. `gcsManagedLedgerOffloadServiceAccountKeyFile` is +a JSON file, containing the GCS credentials of a service account. +[Service Accounts section of this page](https://support.google.com/googleapi/answer/6158849) contains +more information about how to create this key file for authentication. More information about google cloud IAM +is available [here](https://cloud.google.com/storage/docs/access-control/iam). + +To generate service account credentials or view the public credentials that you've already generated, follow these steps: + +1. Open the [Service accounts page](https://console.developers.google.com/iam-admin/serviceaccounts). +2. Select a project or create a new one. +3. Click **Create service account**. +4. In the **Create service account** window, type a name for the service account, and select **Furnish a new private key**. If you want to [grant G Suite domain-wide authority](https://developers.google.com/identity/protocols/OAuth2ServiceAccount#delegatingauthority) to the service account, also select **Enable G Suite Domain-wide Delegation**. +5. Click **Create**. + +> Notes: Make sure that the service account you create has permission to operate GCS, you need to assign **Storage Admin** permission to your service account in [here](https://cloud.google.com/storage/docs/access-control/iam).
+ +```conf + +gcsManagedLedgerOffloadServiceAccountKeyFile="/Users/hello/Downloads/project-804d5e6a6f33.json" + +``` + +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to GCS. + +- ```gcsManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of a "part" sent + during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```gcsManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for each individual + read when reading back data from GCS. Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. + +### "filesystem" Driver configuration + + +#### Configure connection address + +You can configure the connection address in the `broker.conf` file. + +```conf + +fileSystemURI="hdfs://127.0.0.1:9000" + +``` + +#### Configure Hadoop profile path + +The configuration file is stored in the Hadoop profile path. It contains various settings, such as base path, authentication, and so on. + +```conf + +fileSystemProfilePath="../conf/filesystem_offload_core_site.xml" + +``` + +The model for storing topic data uses `org.apache.hadoop.io.MapFile`. You can use all of the configurations in `org.apache.hadoop.io.MapFile` for Hadoop. + +**Example** + +```conf + + + fs.defaultFS + + + + + hadoop.tmp.dir + pulsar + + + + io.file.buffer.size + 4096 + + + + io.seqfile.compress.blocksize + 1000000 + + + + io.seqfile.compression.type + BLOCK + + + + io.map.index.interval + 128 + + +``` + +For more information about the configurations in `org.apache.hadoop.io.MapFile`, see [Filesystem Storage](http://hadoop.apache.org/). +## Configuring offload to run automatically + +Namespace policies can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that the topic has stored on the pulsar cluster. Once the topic reaches the threshold, an offload operation will be triggered. 
Setting a negative value to the threshold will disable automatic offloading. Setting the threshold to 0 will cause the broker to offload data as soon as it possibly can. + +```bash + +$ bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +> Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offload will not run until the current segment is full. + +## Configuring read priority for offloaded messages + +By default, once messages are offloaded to long term storage, brokers will read them from long term storage, but messages still exist in bookkeeper for a period that depends on the administrator's configuration. For +messages that exist in both bookkeeper and long term storage, if you prefer to read them from bookkeeper, you can use the following commands to change this configuration. + +```bash + +# default value for -orp is tiered-storage-first +$ bin/pulsar-admin namespaces set-offload-policies my-tenant/my-namespace -orp bookkeeper-first +$ bin/pulsar-admin topics set-offload-policies my-tenant/my-namespace/topic1 -orp bookkeeper-first + +``` + +## Triggering offload manually + +Offloading can be manually triggered through a REST endpoint on the Pulsar broker. We provide a CLI which will call this rest endpoint for you. + +When triggering offload, you must specify the maximum size, in bytes, of backlog which will be retained locally on the bookkeeper. The offload mechanism will offload segments from the start of the topic backlog until this condition is met. + +```bash + +$ bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 +Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + +``` + +The command that triggers an offload will not wait until the offload operation has completed. To check the status of the offload, use offload-status.
+ +```bash + +$ bin/pulsar-admin topics offload-status my-tenant/my-namespace/topic1 +Offload is currently running + +``` + +To wait for offload to complete, add the -w flag. + +```bash + +$ bin/pulsar-admin topics offload-status -w my-tenant/my-namespace/topic1 +Offload was a success + +``` + +If there is an error offloading, the error will be propagated to the offload-status command. + +```bash + +$ bin/pulsar-admin topics offload-status persistent://public/default/topic1 +Error in offload +null + +Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + +``` + +` + diff --git a/site2/website/versioned_docs/version-2.9.x/deploy-aws.md b/site2/website/versioned_docs/version-2.9.x/deploy-aws.md new file mode 100644 index 0000000000000..93c389b56e2cf --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/deploy-aws.md @@ -0,0 +1,271 @@ +--- +id: deploy-aws +title: Deploying a Pulsar cluster on AWS using Terraform and Ansible +sidebar_label: "Amazon Web Services" +original_id: deploy-aws +--- + +> For instructions on deploying a single Pulsar cluster manually rather than using Terraform and Ansible, see [Deploying a Pulsar cluster on bare metal](deploy-bare-metal.md). For instructions on manually deploying a multi-cluster Pulsar instance, see [Deploying a Pulsar instance on bare metal](deploy-bare-metal-multi-cluster.md). 
+ +One of the easiest ways to get a Pulsar [cluster](reference-terminology.md#cluster) running on [Amazon Web Services](https://aws.amazon.com/) (AWS) is to use the [Terraform](https://terraform.io) infrastructure provisioning tool and the [Ansible](https://www.ansible.com) server automation tool. Terraform can create the resources necessary for running the Pulsar cluster---[EC2](https://aws.amazon.com/ec2/) instances, networking and security infrastructure, etc.---While Ansible can install and run Pulsar on the provisioned resources. + +## Requirements and setup + +In order to install a Pulsar cluster on AWS using Terraform and Ansible, you need to prepare the following things: + +* An [AWS account](https://aws.amazon.com/account/) and the [`aws`](https://aws.amazon.com/cli/) command-line tool +* Python and [pip](https://pip.pypa.io/en/stable/) +* The [`terraform-inventory`](https://github.com/adammck/terraform-inventory) tool, which enables Ansible to use Terraform artifacts + +You also need to make sure that you are currently logged into your AWS account via the `aws` tool: + +```bash + +$ aws configure + +``` + +## Installation + +You can install Ansible on Linux or macOS using pip. + +```bash + +$ pip install ansible + +``` + +You can install Terraform using the instructions [here](https://learn.hashicorp.com/tutorials/terraform/install-cli). + +You also need to have the Terraform and Ansible configuration for Pulsar locally on your machine. You can find them in the [GitHub repository](https://github.com/apache/pulsar) of Pulsar, which you can fetch using Git commands: + +```bash + +$ git clone https://github.com/apache/pulsar +$ cd pulsar/deployment/terraform-ansible/aws + +``` + +## SSH setup + +> If you already have an SSH key and want to use it, you can skip the step of generating an SSH key and update `private_key_file` setting +> in `ansible.cfg` file and `public_key_path` setting in `terraform.tfvars` file. 
+> +> For example, if you already have a private SSH key in `~/.ssh/pulsar_aws` and a public key in `~/.ssh/pulsar_aws.pub`, +> follow the steps below: +> +> 1. update `ansible.cfg` with following values: +> + +> ```shell +> +> private_key_file=~/.ssh/pulsar_aws +> +> +> ``` + +> +> 2. update `terraform.tfvars` with following values: +> + +> ```shell +> +> public_key_path=~/.ssh/pulsar_aws.pub +> +> +> ``` + + +In order to create the necessary AWS resources using Terraform, you need to create an SSH key. Enter the following commands to create a private SSH key in `~/.ssh/id_rsa` and a public key in `~/.ssh/id_rsa.pub`: + +```bash + +$ ssh-keygen -t rsa + +``` + +Do *not* enter a passphrase (hit **Enter** instead when the prompt comes out). Enter the following command to verify that a key has been created: + +```bash + +$ ls ~/.ssh +id_rsa id_rsa.pub + +``` + +## Create AWS resources using Terraform + +To start building AWS resources with Terraform, you need to install all Terraform dependencies. Enter the following command: + +```bash + +$ terraform init +# This will create a .terraform folder + +``` + +After that, you can apply the default Terraform configuration by entering this command: + +```bash + +$ terraform apply + +``` + +Then you see this prompt below: + +```bash + +Do you want to perform these actions? + Terraform will perform the actions described above. + Only 'yes' will be accepted to approve. + + Enter a value: + +``` + +Type `yes` and hit **Enter**. Applying the configuration could take several minutes. When the configuration applying finishes, you can see `Apply complete!` along with some other information, including the number of resources created. + +### Apply a non-default configuration + +You can apply a non-default Terraform configuration by changing the values in the `terraform.tfvars` file. 
The following variables are available: + +Variable name | Description | Default +:-------------|:------------|:------- +`public_key_path` | The path of the public key that you have generated. | `~/.ssh/id_rsa.pub` +`region` | The AWS region in which the Pulsar cluster runs | `us-west-2` +`availability_zone` | The AWS availability zone in which the Pulsar cluster runs | `us-west-2a` +`aws_ami` | The [Amazon Machine Image](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) (AMI) that the cluster uses | `ami-9fa343e7` +`num_zookeeper_nodes` | The number of [ZooKeeper](https://zookeeper.apache.org) nodes in the ZooKeeper cluster | 3 +`num_bookie_nodes` | The number of bookies that run in the cluster | 3 +`num_broker_nodes` | The number of Pulsar brokers that run in the cluster | 2 +`num_proxy_nodes` | The number of Pulsar proxies that run in the cluster | 1 +`base_cidr_block` | The root [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) that network assets use for the cluster | `10.0.0.0/16` +`instance_types` | The EC2 instance types to be used. This variable is a map with four keys: `zookeeper` for the ZooKeeper instances, `bookie` for the BookKeeper bookies, and `broker` and `proxy` for Pulsar brokers and proxies | `t2.small` (ZooKeeper), `i3.xlarge` (BookKeeper) and `c5.2xlarge` (Brokers/Proxies) + +### What is installed + +When you run the Ansible playbook, the following AWS resources are used: + +* 9 total [Elastic Compute Cloud](https://aws.amazon.com/ec2) (EC2) instances running the [ami-9fa343e7](https://access.redhat.com/articles/3135091) Amazon Machine Image (AMI), which runs [Red Hat Enterprise Linux (RHEL) 7.4](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html-single/7.4_release_notes/index).
By default, that includes: + * 3 small VMs for ZooKeeper ([t2.small](https://www.ec2instances.info/?selected=t2.small) instances) + * 3 larger VMs for BookKeeper [bookies](reference-terminology.md#bookie) ([i3.xlarge](https://www.ec2instances.info/?selected=i3.xlarge) instances) + * 2 larger VMs for Pulsar [brokers](reference-terminology.md#broker) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instances) + * 1 larger VM for Pulsar [proxy](reference-terminology.md#proxy) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instance) +* An EC2 [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) +* A [virtual private cloud](https://aws.amazon.com/vpc/) (VPC) for security +* An [API Gateway](https://aws.amazon.com/api-gateway/) for connections from the outside world +* A [route table](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Route_Tables.html) for the Pulsar cluster's VPC +* A [subnet](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Subnets.html) for the VPC + +All EC2 instances for the cluster run in the [us-west-2](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html) region. + +### Fetch your Pulsar connection URL + +When you apply the Terraform configuration by entering the command `terraform apply`, Terraform outputs a value for the `pulsar_service_url`.
The value should look something like this: + +``` + +pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650 + +``` + +You can fetch that value at any time by entering the command `terraform output pulsar_service_url` or parsing the `terraform.tfstate` file (which is JSON, even though the filename does not reflect that): + +```bash + +$ cat terraform.tfstate | jq .modules[0].outputs.pulsar_service_url.value + +``` + +### Destroy your cluster + +At any point, you can destroy all AWS resources associated with your cluster using Terraform's `destroy` command: + +```bash + +$ terraform destroy + +``` + +## Setup Disks + +Before you run the Pulsar playbook, you need to mount the disks to the correct directories on those bookie nodes. Since different types of machines have different disk layouts, you need to update the task defined in `setup-disk.yaml` file after changing the `instance_types` in your Terraform config. + +To set up disks on bookie nodes, enter this command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + setup-disk.yaml + +``` + +After that, the disks are mounted under `/mnt/journal` as journal disk, and `/mnt/storage` as ledger disk. +Remember to enter this command only once. If you attempt to enter this command again after you have run Pulsar playbook, your disks might potentially be erased again, causing the bookies to fail to start up. + +## Run the Pulsar playbook + +Once you have created the necessary AWS resources using Terraform, you can install and run Pulsar on the Terraform-created EC2 instances using Ansible. + +(Optional) If you want to use any [built-in IO connectors](io-connectors.md), edit the `Download Pulsar IO packages` task in the `deploy-pulsar.yaml` file and uncomment the connectors you want to use.
+ +To run the playbook, enter this command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + ../deploy-pulsar.yaml + +``` + +If you have created a private SSH key at a location different from `~/.ssh/id_rsa`, you can specify the different location using the `--private-key` flag in the following command: + +```bash + +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + --private-key="~/.ssh/some-non-default-key" \ + ../deploy-pulsar.yaml + +``` + +## Access the cluster + +You can now access your running Pulsar using the unique Pulsar connection URL for your cluster, which you can obtain following the instructions [above](#fetch-your-pulsar-connection-url). + +For a quick demonstration of accessing the cluster, we can use the Python client for Pulsar and the Python shell. First, install the Pulsar Python module using pip: + +```bash + +$ pip install pulsar-client + +``` + +Now, open up the Python shell using the `python` command: + +```bash + +$ python + +``` + +Once you are in the shell, enter the following command: + +```python + +>>> import pulsar +>>> client = pulsar.Client('pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650') +# Make sure to use your connection URL +>>> producer = client.create_producer('persistent://public/default/test-topic') +>>> producer.send('Hello world') +>>> client.close() + +``` + +If all of these commands are successful, Pulsar clients can now use your cluster!
diff --git a/site2/website/versioned_docs/version-2.9.x/deploy-bare-metal-multi-cluster.md b/site2/website/versioned_docs/version-2.9.x/deploy-bare-metal-multi-cluster.md new file mode 100644 index 0000000000000..f25b11041c5e3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/deploy-bare-metal-multi-cluster.md @@ -0,0 +1,452 @@ +--- +id: deploy-bare-metal-multi-cluster +title: Deploying a multi-cluster on bare metal +sidebar_label: "Bare metal multi-cluster" +original_id: deploy-bare-metal-multi-cluster +--- + +:::tip + +1. You can use single-cluster Pulsar installation in most use cases, such as experimenting with Pulsar or using Pulsar in a startup or in a single team. If you need to run a multi-cluster Pulsar instance, see the [guide](deploy-bare-metal-multi-cluster.md). +2. If you want to use all built-in [Pulsar IO](io-overview.md) connectors, you need to download `apache-pulsar-io-connectors` package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you have run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +3. If you want to use [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download `apache-pulsar-offloaders` package and install `apache-pulsar-offloaders` under `offloaders` directory in the Pulsar directory on every broker node. For more details of how to configure this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +::: + +A Pulsar instance consists of multiple Pulsar clusters working in unison. You can distribute clusters across data centers or geographical regions and replicate the clusters amongst themselves using [geo-replication](administration-geo.md). Deploying a multi-cluster Pulsar instance consists of the following steps: + +1.
Deploying two separate ZooKeeper quorums: a local quorum for each cluster in the instance and a configuration store quorum for instance-wide tasks +2. Initializing cluster metadata for each cluster +3. Deploying a BookKeeper cluster of bookies in each Pulsar cluster +4. Deploying brokers in each Pulsar cluster + + +> #### Run Pulsar locally or on Kubernetes? +> This guide shows you how to deploy Pulsar in production in a non-Kubernetes environment. If you want to run a standalone Pulsar cluster on a single machine for development purposes, see the [Setting up a local cluster](getting-started-standalone.md) guide. If you want to run Pulsar on [Kubernetes](https://kubernetes.io), see the [Pulsar on Kubernetes](deploy-kubernetes.md) guide, which includes sections on running Pulsar on Kubernetes, on Google Kubernetes Engine and on Amazon Web Services. + +## System requirement + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. You need to install 64-bit JRE/JDK 8 or later versions, JRE/JDK 11 is recommended. + +:::note + +Broker is only supported on 64-bit JVM. 
+ +::: + +## Install Pulsar + +To get started running Pulsar, download a binary tarball release in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar @pulsar:version@ binary release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget 'https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=pulsar/pulsar-@pulsar:version@/apache-pulsar-@pulsar:version@-bin.tar.gz' -O apache-pulsar-@pulsar:version@-bin.tar.gz + + ``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | [Command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`examples` | A Java JAR file containing example [Pulsar Functions](functions-overview.md) +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`licenses` | License files, in `.txt` form, for various components of the Pulsar codebase + +The following directories are created once you begin running Pulsar: + +Directory | Contains +:---------|:-------- +`data` | The data storage directory that ZooKeeper and BookKeeper use +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md) +`logs` | Logs that the installation creates + + +## Deploy ZooKeeper + +Each Pulsar instance 
relies on two separate ZooKeeper quorums. + +* Local ZooKeeper operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs a dedicated ZooKeeper cluster. +* Configuration Store operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +You can use an independent cluster of machines or the same machines used by local ZooKeeper to provide the configuration store quorum. + + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +You need to stand up one local ZooKeeper cluster per Pulsar cluster for deploying a Pulsar instance. + +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +On each host, you need to specify the ID of the node in the `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +:::tip + +See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. 
+ +::: + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this: + +```shell + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com` the command looks like `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start zookeeper + +``` + +### Deploy the configuration store + +The ZooKeeper cluster configured and started up in the section above is a local ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. + +If you deploy a single-cluster instance, you do not need a separate cluster for the configuration store. If, however, you deploy a multi-cluster instance, you should stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but run on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorum uses. You need to use the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method as for [local ZooKeeper](#deploy-local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper).
The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 + +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions, and other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can share the same hosts used for the local ZooKeeper quorum. + +For example, assume a Pulsar instance with the following clusters `us-west`, `us-east`, `us-central`, `eu-central`, `ap-south`. Also assume, each cluster has its own local ZK servers named such as the following: + +``` + +zk[1-3].${CLUSTER}.example.com + +``` + +In this scenario, you want to pick the quorum participants from a few clusters and let all the others be ZK observers. For example, to form a 7-server quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This method guarantees that writes to the configuration store are possible even if one of these regions is unreachable.
+ +The ZK configuration in all the servers looks like: + +```properties + +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer + +``` + +Additionally, ZK observers need to have the following parameters: + +```properties + +peerType=observer + +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) + +```shell + +$ bin/pulsar-daemon start configuration-store + +``` + +## Cluster metadata initialization + +Once you set up the cluster-specific ZooKeeper and configuration store quorums for your instance, you need to write some metadata to ZooKeeper for each cluster in your instance. **you only need to write these metadata once**. + +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. 
The following is an example: + +```shell + +$ bin/pulsar initialize-cluster-metadata \ + --cluster us-west \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2184 \ + --web-service-url http://pulsar.us-west.example.com:8080/ \ + --web-service-url-tls https://pulsar.us-west.example.com:8443/ \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/ + +``` + +As you can see from the example above, you need to specify the following: + +* The name of the cluster +* The local ZooKeeper connection string for the cluster +* The configuration store connection string for the entire instance +* The web service URL for the cluster +* A broker service URL enabling interaction with the [brokers](reference-terminology.md#broker) in the cluster + +If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. + +Make sure to run `initialize-cluster-metadata` for each cluster in your instance. + +## Deploy BookKeeper + +BookKeeper provides [persistent message storage](concepts-architecture-overview.md#persistent-storage) for Pulsar. + +Each Pulsar broker needs its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster. + +### Configure bookies + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important aspect of configuring each bookie is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for the local ZooKeeper of Pulsar cluster. + +### Start bookies + +You can start a bookie in two ways: in the foreground or as a background daemon. 
+ +To start a bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +You can verify that the bookie works properly using the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell): + +```bash + +$ bin/bookkeeper shell bookiesanity + +``` + +This command creates a new ledger on the local bookie, writes a few entries, reads them back and finally deletes the ledger. + +After you have started all bookies, you can use the `simpletest` command for [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to verify that all bookies in the cluster are running. + +```bash + +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> + +``` + +Bookie hosts are responsible for storing message data on disk. In order for bookies to provide optimal performance, having a suitable hardware configuration is essential for the bookies. The following are key dimensions for bookie hardware capacity. + +* Disk I/O capacity read/write +* Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker. To ensure low write latency, BookKeeper is +designed to use multiple devices: + +* A **journal** to ensure durability. For sequential writes, having fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts is critical. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms. +* A **ledger storage device** is where data is stored until all consumers acknowledge the message. Writes happen in the background, so write I/O is not a big concern.
Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + + + +## Deploy brokers + +Once you set up ZooKeeper, initialize cluster metadata, and spin up BookKeeper bookies, you can deploy brokers. + +### Broker configuration + +You can configure brokers using the [`conf/broker.conf`](reference-configuration.md#broker) configuration file. + +The most important element of broker configuration is ensuring that each broker is aware of its local ZooKeeper quorum as well as the configuration store quorum. Make sure that you set the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) parameter to reflect the local quorum and the [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameter to reflect the configuration store quorum (although you need to specify only those ZooKeeper servers located in the same cluster). + +You also need to specify the name of the [cluster](reference-terminology.md#cluster) to which the broker belongs using the [`clusterName`](reference-configuration.md#broker-clusterName) parameter. In addition, you need to match the broker and web service ports provided when you initialize the metadata (especially when you use a different port from default) of the cluster. + +The following is an example configuration: + +```properties + +# Local ZooKeeper servers +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +# Configuration store quorum connection string. 
+configurationStoreServers=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184 + +clusterName=us-west + +# Broker data port +brokerServicePort=6650 + +# Broker data port for TLS +brokerServicePortTls=6651 + +# Port to use to serve HTTP requests +webServicePort=8080 + +# Port to use to serve HTTPS requests +webServicePortTls=8443 + +``` + +### Broker hardware + +Pulsar brokers do not require any special hardware since they do not use the local disk. It is best to choose fast CPUs and a 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) so that the software can take full advantage of the hardware. + +### Start the broker service + +You can start a broker in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell + +$ bin/pulsar-daemon start broker + +``` + +You can also start brokers in the foreground by using [`pulsar broker`](reference-cli-tools.md#broker): + +```shell + +$ bin/pulsar broker + +``` + +## Service discovery + +[Clients](getting-started-clients.md) connecting to Pulsar brokers need to communicate with an entire Pulsar instance using a single URL. + +You can use your own service discovery system. If you use your own system, you need to satisfy only one requirement: when a client performs an HTTP request to an [endpoint](reference-configuration.md) for a Pulsar cluster, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to some active brokers in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means. + +> **Service discovery already provided by many scheduling systems** +> Many large-scale deployment systems, such as [Kubernetes](deploy-kubernetes.md), have service discovery systems built in. If you run Pulsar on such a system, you may not need to provide your own service discovery mechanism.
+ +## Admin client and verification + +At this point your Pulsar instance should be ready to use. You can now configure client machines that can serve as [administrative clients](admin-api-overview.md) for each cluster. You can use the [`conf/client.conf`](reference-configuration.md#client) configuration file to configure admin clients. + +The most important thing is that you point the [`serviceUrl`](reference-configuration.md#client-serviceUrl) parameter to the correct service URL for the cluster: + +```properties + +serviceUrl=http://pulsar.us-west.example.com:8080/ + +``` + +## Provision new tenants + +Pulsar is built as a fundamentally multi-tenant system. + + +If a new tenant wants to use the system, you need to create a new one. You can create a new tenant by using the [`pulsar-admin`](reference-pulsar-admin.md#tenants) CLI tool: + +```shell + +$ bin/pulsar-admin tenants create test-tenant \ + --allowed-clusters us-west \ + --admin-roles test-admin-role + +``` + +In this command, users who identify with `test-admin-role` role can administer the configuration for the `test-tenant` tenant. The `test-tenant` tenant can only use the `us-west` cluster. From now on, this tenant can manage its resources. + +Once you create a tenant, you need to create [namespaces](reference-terminology.md#namespace) for topics within that tenant. + + +The first step is to create a namespace. A namespace is an administrative unit that can contain many topics. A common practice is to create a namespace for each different use case from a single tenant. + +```shell + +$ bin/pulsar-admin namespaces create test-tenant/ns1 + +``` + +##### Test producer and consumer + + +Everything is now ready to send and receive messages. The quickest way to test the system is through the [`pulsar-perf`](reference-cli-tools.md#pulsar-perf) client tool. + + +You can use a topic in the namespace that you have just created. 
Topics are automatically created the first time when a producer or a consumer tries to use them. + +The topic name in this case could be: + +```http + +persistent://test-tenant/ns1/my-topic + +``` + +Start a consumer that creates a subscription on the topic and waits for messages: + +```shell + +$ bin/pulsar-perf consume persistent://test-tenant/ns1/my-topic + +``` + +Start a producer that publishes messages at a fixed rate and reports stats every 10 seconds: + +```shell + +$ bin/pulsar-perf produce persistent://test-tenant/ns1/my-topic + +``` + +To report the topic stats: + +```shell + +$ bin/pulsar-admin topics stats persistent://test-tenant/ns1/my-topic + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/deploy-bare-metal.md b/site2/website/versioned_docs/version-2.9.x/deploy-bare-metal.md new file mode 100644 index 0000000000000..9bb4235cece5d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/deploy-bare-metal.md @@ -0,0 +1,559 @@ +--- +id: deploy-bare-metal +title: Deploy a cluster on bare metal +sidebar_label: "Bare metal" +original_id: deploy-bare-metal +--- + +:::tip + +1. You can use single-cluster Pulsar installation in most use cases, such as experimenting with Pulsar or using Pulsar in a startup or in a single team. If you need to run a multi-cluster Pulsar instance, see the [guide](deploy-bare-metal-multi-cluster.md). +2. If you want to use all built-in [Pulsar IO](io-overview.md) connectors, you need to download `apache-pulsar-io-connectors`package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you have run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +3. 
If you want to use the [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download the `apache-pulsar-offloaders` package and install `apache-pulsar-offloaders` under the `offloaders` directory in the Pulsar directory on every broker node. For more details of how to configure this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +::: + +Deploying a Pulsar cluster consists of the following steps: + +1. Deploy a [ZooKeeper](#deploy-a-zookeeper-cluster) cluster (optional) +2. Initialize [cluster metadata](#initialize-cluster-metadata) +3. Deploy a [BookKeeper](#deploy-a-bookkeeper-cluster) cluster +4. Deploy one or more Pulsar [brokers](#deploy-pulsar-brokers) + +## Preparation + +### Requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later; JRE/JDK 11 is recommended. + +:::tip + +You can reuse existing ZooKeeper clusters. + +::: + +To run Pulsar on bare metal, the following configuration is recommended: + +* At least 6 Linux machines or VMs + * 3 for running [ZooKeeper](https://zookeeper.apache.org) + * 3 for running a Pulsar broker, and a [BookKeeper](https://bookkeeper.apache.org) bookie +* A single [DNS](https://en.wikipedia.org/wiki/Domain_Name_System) name covering all of the Pulsar broker hosts + +:::note + +* Broker is only supported on 64-bit JVM. +* If you do not have enough machines, or you want to test Pulsar in cluster mode (and expand the cluster later), you can fully deploy Pulsar on a node on which ZooKeeper, bookie and broker run. +* If you do not have a DNS server, you can use the multi-host format in the service URL instead. + +::: + +Each machine in your cluster needs to have [Java 8](https://adoptium.net/?variant=openjdk8) or [Java 11](https://adoptium.net/?variant=openjdk11) installed.
+ +The following is a diagram showing the basic setup: + +![alt-text](/assets/pulsar-basic-setup.png) + +In this diagram, connecting clients need to communicate with the Pulsar cluster using a single URL. In this case, `pulsar-cluster.acme.com` abstracts over all of the message-handling brokers. Pulsar message brokers run on machines alongside BookKeeper bookies; brokers and bookies, in turn, rely on ZooKeeper. + +### Hardware considerations + +If you deploy a Pulsar cluster, keep in mind the following basic better choices when you do the capacity planning. + +#### ZooKeeper + +For machines running ZooKeeper, it is recommended to use less powerful machines or VMs. Pulsar uses ZooKeeper only for periodic coordination-related and configuration-related tasks, not for basic operations. If you run Pulsar on [Amazon Web Services](https://aws.amazon.com/) (AWS), for example, a [t2.small](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/t2-instances.html) instance might likely suffice. + +#### Bookies and Brokers + +For machines running a bookie and a Pulsar broker, more powerful machines are required. For an AWS deployment, for example, [i3.4xlarge](https://aws.amazon.com/blogs/aws/now-available-i3-instances-for-demanding-io-intensive-applications/) instances may be appropriate. On those machines you can use the following: + +* Fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) (for Pulsar brokers) +* Small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache (for BookKeeper bookies) + +To start a Pulsar instance, below are the minimum and the recommended hardware settings. + +1. 
The minimum hardware settings (250 Pulsar topics) + - Broker + - CPU: 0.2 + - Memory: 256MB + - Bookie + - CPU: 0.2 + - Memory: 256MB + - Storage: + - Journal: 8GB, PD-SSD + - Ledger: 16GB, PD-STANDARD + +2. The recommended hardware settings (1000 Pulsar topics) + + - Broker + - CPU: 8 + - Memory: 8GB + - Bookie + - CPU: 4 + - Memory: 8GB + - Storage: + - Journal: 256GB, PD-SSD + - Ledger: 2TB, PD-STANDARD + +## Install the Pulsar binary package + +> You need to install the Pulsar binary package on each machine in the cluster, including machines running ZooKeeper and BookKeeper. + +To get started deploying a Pulsar cluster on bare metal, you need to download a binary tarball release in one of the following ways: + +* By clicking on the link below directly, which automatically triggers a download: + * Pulsar @pulsar:version@ binary release +* From the Pulsar [downloads page](pulsar:download_page_url) +* From the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) on GitHub +* Using [wget](https://www.gnu.org/software/wget): + +```bash + +$ wget pulsar:binary_release_url + +``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash + +$ tar xvzf apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +The extracted directory contains the following subdirectories: + +Directory | Contains +:---------|:-------- +`bin` |[command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`data` | The data storage directory that ZooKeeper and BookKeeper use +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`logs` | Logs that the installation creates + 
+## [Install Builtin Connectors (optional)](https://pulsar.apache.org/docs/en/next/standalone/#install-builtin-connectors-optional) + +> Since Pulsar release `2.1.0-incubating`, Pulsar provides a separate binary distribution, containing all the `builtin` connectors. +> To enable the `builtin` connectors (optional), you can follow the instructions below. + +To use `builtin` connectors, you need to download the connectors tarball release on every broker node in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar IO Connectors @pulsar:version@ release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +Once you download the .nar file, copy the file to the `connectors` directory in the Pulsar directory. +For example, if you download the connector file `pulsar-io-aerospike-@pulsar:version@.nar`: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +## [Install Tiered Storage Offloaders (optional)](https://pulsar.apache.org/docs/en/next/standalone/#install-tiered-storage-offloaders-optional) + +> Since Pulsar release `2.2.0`, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +> If you want to enable the tiered storage feature, you can follow the instructions below; otherwise you can +> skip this section for now.
+ +To use tiered storage offloaders, you need to download the offloaders tarball release on every broker node in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +Once you download the tarball, in the Pulsar directory, untar the offloaders package and copy the offloaders as `offloaders` in the Pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you can find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more details of how to configure tiered storage feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md) + + +## Deploy a ZooKeeper cluster + +> If you already have an existing zookeeper cluster and want to use it, you can skip this section. + +[ZooKeeper](https://zookeeper.apache.org) manages a variety of essential coordination-related and configuration-related tasks for Pulsar. To deploy a Pulsar cluster, you need to deploy ZooKeeper first. A 3-node ZooKeeper cluster is the recommended configuration. Pulsar does not make heavy use of ZooKeeper, so the lightweight machines or VMs should suffice for running ZooKeeper. + +To begin, add all ZooKeeper servers to the configuration specified in [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) (in the Pulsar directory that you create [above](#install-the-pulsar-binary-package)). 
The following is an example: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +> If you only have one machine on which to deploy Pulsar, you only need to add one server entry in the configuration file. + +On each host, you need to specify the ID of the node in the `myid` file, which is in the `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + +For example, on a ZooKeeper server like `zk1.us-west.example.com`, you can set the `myid` value as follows: + +```bash + +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid + +``` + +On `zk2.us-west.example.com`, the command is `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and have the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start zookeeper + +``` + +> If you plan to deploy Zookeeper with the Bookie on the same node, you need to start zookeeper by using different stats +> port by configuring the `metricsProvider.httpPort` in zookeeper.conf. + +## Initialize cluster metadata + +Once you deploy ZooKeeper for your cluster, you need to write some metadata to ZooKeeper. You only need to write this data **once**. + +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. 
This command can be run on any machine in your Pulsar cluster, so the metadata can be initialized from a ZooKeeper, broker, or bookie machine. The following is an example: + +```shell + +$ bin/pulsar initialize-cluster-metadata \ + --cluster pulsar-cluster-1 \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2181 \ + --web-service-url http://pulsar.us-west.example.com:8080 \ + --web-service-url-tls https://pulsar.us-west.example.com:8443 \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650 \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651 + +``` + +As you can see from the example above, you will need to specify the following: + +Flag | Description +:----|:----------- +`--cluster` | A name for the cluster +`--zookeeper` | A "local" ZooKeeper connection string for the cluster. This connection string only needs to include *one* machine in the ZooKeeper cluster. +`--configuration-store` | The configuration store connection string for the entire instance. As with the `--zookeeper` flag, this connection string only needs to include *one* machine in the ZooKeeper cluster. +`--web-service-url` | The web service URL for the cluster, plus a port. This URL should be a standard DNS name. The default port is 8080 (using a different port is not recommended). +`--web-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster. The default port is 8443 (using a different port is not recommended). +`--broker-service-url` | A broker service URL enabling interaction with the brokers in the cluster. This URL should use the same DNS name as the web service URL but should use the `pulsar` scheme instead. The default port is 6650 (using a different port is not recommended).
+`--broker-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. The default port is 6651 (you had better not use a different port). + + +> If you do not have a DNS server, you can use multi-host format in the service URL with the following settings: +> + +> ```shell +> +> --web-service-url http://host1:8080,host2:8080,host3:8080 \ +> --web-service-url-tls https://host1:8443,host2:8443,host3:8443 \ +> --broker-service-url pulsar://host1:6650,host2:6650,host3:6650 \ +> --broker-service-url-tls pulsar+ssl://host1:6651,host2:6651,host3:6651 +> +> +> ``` + +> +> If you want to use an existing BookKeeper cluster, you can add the `--existing-bk-metadata-service-uri` flag as follows: +> + +> ```shell +> +> --existing-bk-metadata-service-uri "zk+null://zk1:2181;zk2:2181/ledgers" \ +> --web-service-url http://host1:8080,host2:8080,host3:8080 \ +> --web-service-url-tls https://host1:8443,host2:8443,host3:8443 \ +> --broker-service-url pulsar://host1:6650,host2:6650,host3:6650 \ +> --broker-service-url-tls pulsar+ssl://host1:6651,host2:6651,host3:6651 +> +> +> ``` + +> You can obtain the metadata service URI of the existing BookKeeper cluster by using the `bin/bookkeeper shell whatisinstanceid` command. You must enclose the value in double quotes since the multiple metadata service URIs are separated with semicolons. + +## Deploy a BookKeeper cluster + +[BookKeeper](https://bookkeeper.apache.org) handles all persistent data storage in Pulsar. You need to deploy a cluster of BookKeeper bookies to use Pulsar. You can choose to run a **3-bookie BookKeeper cluster**. + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. 
The most important step in configuring bookies for our purposes here is ensuring that [`zkServers`](reference-configuration.md#bookkeeper-zkServers) is set to the connection string for the ZooKeeper cluster. The following is an example: + +```properties + +zkServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +``` + +Once you appropriately modify the `zkServers` parameter, you can make any other configuration changes that you require. You can find a full listing of the available BookKeeper configuration parameters [here](reference-configuration.md#bookkeeper). However, consulting the [BookKeeper documentation](http://bookkeeper.apache.org/docs/latest/reference/config/) for a more in-depth guide might be a better choice. + +Once you apply the desired configuration in `conf/bookkeeper.conf`, you can start up a bookie on each of your BookKeeper hosts. You can start up each bookie either in the background, using [nohup](https://en.wikipedia.org/wiki/Nohup), or in the foreground. + +To start the bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start bookie + +``` + +To start the bookie in the foreground: + +```bash + +$ bin/pulsar bookie + +``` + +You can verify that a bookie works properly by running the `bookiesanity` command on the [BookKeeper shell](reference-cli-tools.md#shell): + +```bash + +$ bin/bookkeeper shell bookiesanity + +``` + +This command creates an ephemeral BookKeeper ledger on the local bookie, writes a few entries, reads them back, and finally deletes the ledger. + +After you start all the bookies, you can use `simpletest` command for [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to verify all the bookies in the cluster are up running. 
+ +```bash + +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> + +``` + +This command creates a `num-bookies` sized ledger on the cluster, writes a few entries, and finally deletes the ledger. + + +## Deploy Pulsar brokers + +Pulsar brokers are the last thing you need to deploy in your Pulsar cluster. Brokers handle Pulsar messages and provide the administrative interface of Pulsar. A good choice is to run **3 brokers**, one for each machine that already runs a BookKeeper bookie. + +### Configure Brokers + +The most important element of broker configuration is ensuring that each broker is aware of the ZooKeeper cluster that you have deployed. Ensure that the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) and [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameters are correct. In this case, since you only have 1 cluster and no configuration store setup, `configurationStoreServers` points to the same servers as `zookeeperServers`. + +```properties + +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +configurationStoreServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +``` + +You also need to specify the cluster name (matching the name that you provided when you [initialize the metadata of the cluster](#initialize-cluster-metadata)): + +```properties + +clusterName=pulsar-cluster-1 + +``` + +In addition, you need to match the broker and web service ports provided when you initialize the metadata of the cluster (especially when you use a different port than the default): + +```properties + +brokerServicePort=6650 +brokerServicePortTls=6651 +webServicePort=8080 +webServicePortTls=8443 + +``` + +> If you deploy Pulsar in a one-node cluster, you should update the replication settings in `conf/broker.conf` to `1`.
+> + +> ```properties +> +> # Number of bookies to use when creating a ledger +> managedLedgerDefaultEnsembleSize=1 +> +> # Number of copies to store for each message +> managedLedgerDefaultWriteQuorum=1 +> +> # Number of guaranteed copies (acks to wait before write is complete) +> managedLedgerDefaultAckQuorum=1 +> +> +> ``` + + +### Enable Pulsar Functions (optional) + +If you want to enable [Pulsar Functions](functions-overview.md), you can follow the instructions as below: + +1. Edit `conf/broker.conf` to enable functions worker, by setting `functionsWorkerEnabled` to `true`. + + ```conf + + functionsWorkerEnabled=true + + ``` + +2. Edit `conf/functions_worker.yml` and set `pulsarFunctionsCluster` to the cluster name that you provide when you [initialize the metadata of the cluster](#initialize-cluster-metadata). + + ```conf + + pulsarFunctionsCluster: pulsar-cluster-1 + + ``` + +If you want to learn more options about deploying the functions worker, check out [Deploy and manage functions worker](functions-worker.md). + +### Start Brokers + +You can then provide any other configuration changes that you want in the [`conf/broker.conf`](reference-configuration.md#broker) file. Once you decide on a configuration, you can start up the brokers for your Pulsar cluster. Like ZooKeeper and BookKeeper, you can start brokers either in the foreground or in the background, using nohup. + +You can start a broker in the foreground using the [`pulsar broker`](reference-cli-tools.md#pulsar-broker) command: + +```bash + +$ bin/pulsar broker + +``` + +You can start a broker in the background using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +$ bin/pulsar-daemon start broker + +``` + +Once you successfully start up all the brokers that you intend to use, your Pulsar cluster should be ready to go! + +## Connect to the running cluster + +Once your Pulsar cluster is up and running, you should be able to connect with it using Pulsar clients. 
One such client is the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool, which is included with the Pulsar binary package. The `pulsar-client` tool can publish messages to and consume messages from Pulsar topics and thus provides a simple way to make sure that your cluster runs properly. + +To use the `pulsar-client` tool, first modify the client configuration file in [`conf/client.conf`](reference-configuration.md#client) in your binary package. You need to change the values for `webServiceUrl` and `brokerServiceUrl`, substituting `localhost` (which is the default), with the DNS name that you assign to your broker/bookie hosts. The following is an example: + +```properties + +webServiceUrl=http://us-west.example.com:8080 +brokerServiceUrl=pulsar://us-west.example.com:6650 + +``` + +> If you do not have a DNS server, you can specify multiple hosts in the service URL as follows: +> + +> ```properties +> +> webServiceUrl=http://host1:8080,host2:8080,host3:8080 +> brokerServiceUrl=pulsar://host1:6650,host2:6650,host3:6650 +> +> +> ``` + + +Once that is complete, you can publish a message to the Pulsar topic: + +```bash + +$ bin/pulsar-client produce \ + persistent://public/default/test \ + -n 1 \ + -m "Hello Pulsar" + +``` + +> You may need to use a different cluster name in the topic if you specify a cluster name other than `pulsar-cluster-1`. + +This command publishes a single message to the Pulsar topic. In addition, you can subscribe to the Pulsar topic in a different terminal before publishing messages as below: + +```bash + +$ bin/pulsar-client consume \ + persistent://public/default/test \ + -n 100 \ + -s "consumer-test" \ + -t "Exclusive" + +``` + +Once you successfully publish the above message to the topic, you should see it in the standard output: + +```bash + +----- got message ----- +Hello Pulsar + +``` + +## Run Functions + +> If you have [enabled](#enable-pulsar-functions-optional) Pulsar Functions, you can try out the Pulsar Functions now.
+ +Create an ExclamationFunction `exclamation`. + +```bash + +bin/pulsar-admin functions create \ + --jar examples/api-examples.jar \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --tenant public \ + --namespace default \ + --name exclamation + +``` + +Check whether the function runs as expected by [triggering](functions-deploying.md#triggering-pulsar-functions) the function. + +```bash + +bin/pulsar-admin functions trigger --name exclamation --trigger-value "hello world" + +``` + +You should see the following output: + +```shell + +hello world! + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/deploy-dcos.md b/site2/website/versioned_docs/version-2.9.x/deploy-dcos.md new file mode 100644 index 0000000000000..35a0a83d716ad --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/deploy-dcos.md @@ -0,0 +1,200 @@ +--- +id: deploy-dcos +title: Deploy Pulsar on DC/OS +sidebar_label: "DC/OS" +original_id: deploy-dcos +--- + +:::tip + +To enable all built-in [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, we recommend you use `apachepulsar/pulsar-all` image instead of `apachepulsar/pulsar` image; the former has already bundled [all built-in connectors](io-overview.md#working-with-connectors). + +::: + +[DC/OS](https://dcos.io/) (the DataCenter Operating System) is a distributed operating system for deploying and managing applications and systems on [Apache Mesos](http://mesos.apache.org/). DC/OS is an open-source tool created and maintained by [Mesosphere](https://mesosphere.com/). + +Apache Pulsar is available as a [Marathon Application Group](https://mesosphere.github.io/marathon/docs/application-groups.html), which runs multiple applications as manageable sets. + +## Prerequisites + +You need to prepare your environment before running Pulsar on DC/OS. 
+ +* DC/OS version [1.9](https://docs.mesosphere.com/1.9/) or higher +* A [DC/OS cluster](https://docs.mesosphere.com/1.9/installing/) with at least three agent nodes +* The [DC/OS CLI tool](https://docs.mesosphere.com/1.9/cli/install/) installed +* The [`PulsarGroups.json`](https://github.com/apache/pulsar/blob/master/deployment/dcos/PulsarGroups.json) configuration file from the Pulsar GitHub repo. + + ```bash + + $ curl -O https://raw.githubusercontent.com/apache/pulsar/master/deployment/dcos/PulsarGroups.json + + ``` + +Each node in the DC/OS-managed Mesos cluster must have at least: + +* 4 CPU +* 4 GB of memory +* 60 GB of total persistent disk + +Alternatively, you can change the configuration in `PulsarGroups.json` accordingly to match your resources of the DC/OS cluster. + +## Deploy Pulsar using the DC/OS command interface + +You can deploy Pulsar on DC/OS using this command: + +```bash + +$ dcos marathon group add PulsarGroups.json + +``` + +This command deploys Docker container instances in three groups, which together comprise a Pulsar cluster: + +* 3 bookies (1 [bookie](reference-terminology.md#bookie) on each agent node and 1 [bookie recovery](http://bookkeeper.apache.org/docs/latest/admin/autorecovery/) instance) +* 3 Pulsar [brokers](reference-terminology.md#broker) (1 broker on each node and 1 admin instance) +* 1 [Prometheus](http://prometheus.io/) instance and 1 [Grafana](https://grafana.com/) instance + + +> When you run DC/OS, a ZooKeeper cluster will be running at `master.mesos:2181`, thus you do not have to install or start up ZooKeeper separately. + +After executing the `dcos` command above, click the **Services** tab in the DC/OS [GUI interface](https://docs.mesosphere.com/latest/gui/), which you can access at [http://m1.dcos](http://m1.dcos) in this example. You should see several applications during the deployment. 
+ +![DC/OS command executed](/assets/dcos_command_execute.png) + +![DC/OS command executed2](/assets/dcos_command_execute2.png) + +## The BookKeeper group + +To monitor the status of the BookKeeper cluster deployment, click the **bookkeeper** group in the parent **pulsar** group. + +![DC/OS bookkeeper status](/assets/dcos_bookkeeper_status.png) + +At this point, the status of the 3 [bookies](reference-terminology.md#bookie) is green, which means that the bookies have been deployed successfully and are running. + +![DC/OS bookkeeper running](/assets/dcos_bookkeeper_run.png) + +You can also click each bookie instance to get more detailed information, such as the bookie running log. + +![DC/OS bookie log](/assets/dcos_bookie_log.png) + +To display information about the BookKeeper in ZooKeeper, you can visit [http://m1.dcos/exhibitor](http://m1.dcos/exhibitor). In this example, 3 bookies are under the `available` directory. + +![DC/OS bookkeeper in zk](/assets/dcos_bookkeeper_in_zookeeper.png) + +## The Pulsar broker group + +Similar to the BookKeeper group above, click **brokers** to check the status of the Pulsar brokers. + +![DC/OS broker status](/assets/dcos_broker_status.png) + +![DC/OS broker running](/assets/dcos_broker_run.png) + +You can also click each broker instance to get more detailed information, such as the broker running log. + +![DC/OS broker log](/assets/dcos_broker_log.png) + +Broker cluster information in ZooKeeper is also available through the web UI. In this example, you can see that the `loadbalance` and `managed-ledgers` directories have been created. + +![DC/OS broker in zk](/assets/dcos_broker_in_zookeeper.png) + +## Monitor group + +The **monitor** group consists of Prometheus and Grafana. + +![DC/OS monitor status](/assets/dcos_monitor_status.png) + +### Prometheus + +Click the instance of `prom` to get the endpoint of Prometheus, which is `192.168.65.121:9090` in this example. 
+ +![DC/OS prom endpoint](/assets/dcos_prom_endpoint.png) + +If you click that endpoint, you can see the Prometheus dashboard. All the bookies and brokers are listed on [http://192.168.65.121:9090/targets](http://192.168.65.121:9090/targets). + +![DC/OS prom targets](/assets/dcos_prom_targets.png) + +### Grafana + +Click `grafana` to get the endpoint for Grafana, which is `192.168.65.121:3000` in this example. + +![DC/OS grafana endpoint](/assets/dcos_grafana_endpoint.png) + +If you click that endpoint, you can access the Grafana dashboard. + +![DC/OS grafana targets](/assets/dcos_grafana_dashboard.png) + +## Run a simple Pulsar consumer and producer on DC/OS + +Now that you have a fully deployed Pulsar cluster, you can run a simple consumer and producer to show Pulsar on DC/OS in action. + +### Download and prepare the Pulsar Java tutorial + +You can clone a [Pulsar Java tutorial](https://github.com/streamlio/pulsar-java-tutorial) repo. This repo contains a simple Pulsar consumer and producer (you can find more information in the `README` file in this repo). + +```bash + +$ git clone https://github.com/streamlio/pulsar-java-tutorial + +``` + +Change the `SERVICE_URL` from `pulsar://localhost:6650` to `pulsar://a1.dcos:6650` in both [`ConsumerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ConsumerTutorial.java) file and [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java) file. + +The `pulsar://a1.dcos:6650` endpoint is for the broker service. You can fetch the endpoint details for each broker instance from the DC/OS GUI. `a1.dcos` is a DC/OS client agent that runs a broker, and you can replace it with the client agent IP address. 
+ +Now, you can change the message number from 10 to 10000000 in the main method in [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java) file to produce more messages. + +Then, you can compile the project code using the command below: + +```bash + +$ mvn clean package + +``` + +### Run the consumer and producer + +Execute this command to run the consumer: + +```bash + +$ mvn exec:java -Dexec.mainClass="tutorial.ConsumerTutorial" + +``` + +Execute this command to run the producer: + +```bash + +$ mvn exec:java -Dexec.mainClass="tutorial.ProducerTutorial" + +``` + +You see that the producer is producing messages and the consumer is consuming messages through the DC/OS GUI. + +![DC/OS pulsar producer](/assets/dcos_producer.png) + +![DC/OS pulsar consumer](/assets/dcos_consumer.png) + +### View Grafana metric output + +While the producer and consumer are running, you can access the running metrics from Grafana. + +![DC/OS pulsar dashboard](/assets/dcos_metrics.png) + + +## Uninstall Pulsar + +You can shut down and uninstall the `pulsar` application from DC/OS at any time in one of the following two ways: + +1. Click the three dots at the right end of Pulsar group and choose **Delete** on the DC/OS GUI. + + ![DC/OS pulsar uninstall](/assets/dcos_uninstall.png) + +2. Use the command below. + + ```bash + + $ dcos marathon group remove /pulsar + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/deploy-docker.md b/site2/website/versioned_docs/version-2.9.x/deploy-docker.md new file mode 100644 index 0000000000000..8348d78deb237 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/deploy-docker.md @@ -0,0 +1,60 @@ +--- +id: deploy-docker +title: Deploy a cluster on Docker +sidebar_label: "Docker" +original_id: deploy-docker +--- + +To deploy a Pulsar cluster on Docker, complete the following steps: +1. Deploy a ZooKeeper cluster (optional) +2. 
Initialize cluster metadata +3. Deploy a BookKeeper cluster +4. Deploy one or more Pulsar brokers + +## Prepare + +To run Pulsar on Docker, you need to create a container for each Pulsar component: ZooKeeper, BookKeeper and broker. You can pull the images of ZooKeeper and BookKeeper separately on [Docker Hub](https://hub.docker.com/), and pull a [Pulsar image](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) for the broker. You can also pull only one [Pulsar image](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) and create three containers with this image. This tutorial takes the second option as an example. + +### Pull a Pulsar image +You can pull a Pulsar image from [Docker Hub](https://hub.docker.com/r/apachepulsar/pulsar-all/tags) with the following command. + +``` + +docker pull apachepulsar/pulsar-all:latest + +``` + +### Create three containers +Create containers for ZooKeeper, BookKeeper and broker. In this example, they are named as `zookeeper`, `bookkeeper` and `broker` respectively. You can name them as you want with the `--name` flag. By default, the container names are created randomly. + +``` + +docker run -it --name bookkeeper apachepulsar/pulsar-all:latest /bin/bash +docker run -it --name zookeeper apachepulsar/pulsar-all:latest /bin/bash +docker run -it --name broker apachepulsar/pulsar-all:latest /bin/bash + +``` + +### Create a network +To deploy a Pulsar cluster on Docker, you need to create a `network` and connect the containers of ZooKeeper, BookKeeper and broker to this network. The following command creates the network `pulsar`: + +``` + +docker network create pulsar + +``` + +### Connect containers to network +Connect the containers of ZooKeeper, BookKeeper and broker to the `pulsar` network with the following commands. 
+ +``` + +docker network connect pulsar zookeeper +docker network connect pulsar bookkeeper +docker network connect pulsar broker + +``` + +To check whether the containers are successfully connected to the network, enter the `docker network inspect pulsar` command. + +For detailed information about how to deploy a ZooKeeper cluster, a BookKeeper cluster, and brokers, see [deploy a cluster on bare metal](deploy-bare-metal.md). diff --git a/site2/website/versioned_docs/version-2.9.x/deploy-kubernetes.md b/site2/website/versioned_docs/version-2.9.x/deploy-kubernetes.md new file mode 100644 index 0000000000000..b9d2d39e8f4ae --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/deploy-kubernetes.md @@ -0,0 +1,11 @@ +--- +id: deploy-kubernetes +title: Deploy Pulsar on Kubernetes +sidebar_label: "Kubernetes" +original_id: deploy-kubernetes +--- + +To get up and running with these charts as fast as possible, in a **non-production** use case, we provide +a [quick start guide](getting-started-helm.md) for Proof of Concept (PoC) deployments. + +To configure and install a Pulsar cluster on Kubernetes for production usage, follow the complete [Installation Guide](helm-install.md). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/deploy-monitoring.md b/site2/website/versioned_docs/version-2.9.x/deploy-monitoring.md new file mode 100644 index 0000000000000..2b5c19344dc8c --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/deploy-monitoring.md @@ -0,0 +1,148 @@ +--- +id: deploy-monitoring +title: Monitor +sidebar_label: "Monitor" +original_id: deploy-monitoring +--- + +You can use different ways to monitor a Pulsar cluster, exposing both metrics related to the usage of topics and the overall health of the individual components of the cluster. + +## Collect metrics + +You can collect broker stats, ZooKeeper stats, and BookKeeper stats. 
+ +### Broker stats + +You can collect Pulsar broker metrics from brokers and export the metrics in JSON format. The Pulsar broker metrics mainly have two types: + +* *Destination dumps*, which contain stats for each individual topic. You can fetch the destination dumps using the command below: + + ```shell + + bin/pulsar-admin broker-stats destinations + + ``` + +* Broker metrics, which contain the broker information and topics stats aggregated at namespace level. You can fetch the broker metrics by using the following command: + + ```shell + + bin/pulsar-admin broker-stats monitoring-metrics + + ``` + +All the message rates are updated every minute. + +The aggregated broker metrics are also exposed in the [Prometheus](https://prometheus.io) format at: + +```shell + +http://$BROKER_ADDRESS:8080/metrics/ + +``` + +### ZooKeeper stats + +The local ZooKeeper, configuration store server and clients that are shipped with Pulsar can expose detailed stats through Prometheus. + +```shell + +http://$LOCAL_ZK_SERVER:8000/metrics +http://$GLOBAL_ZK_SERVER:8001/metrics + +``` + +The default port of local ZooKeeper is `8000` and the default port of the configuration store is `8001`. You can use a different stats port by configuring `metricsProvider.httpPort` in the `conf/zookeeper.conf` file. + +### BookKeeper stats + +You can configure the stats frameworks for BookKeeper by modifying the `statsProviderClass` in the `conf/bookkeeper.conf` file. + +The default BookKeeper configuration enables the Prometheus exporter. The configuration is included with Pulsar distribution. + +```shell + +http://$BOOKIE_ADDRESS:8000/metrics + +``` + +The default port for bookie is `8000`. You can change the port by configuring `prometheusStatsHttpPort` in the `conf/bookkeeper.conf` file. + +### Managed cursor acknowledgment state +The acknowledgment state is persisted to the ledger first. When the acknowledgment state fails to be persisted to the ledger, it is persisted to ZooKeeper. 
To track the stats of acknowledgement, you can configure the metrics for the managed cursor. + +``` + +brk_ml_cursor_persistLedgerSucceed(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistLedgerErrors(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistZookeeperSucceed(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_persistZookeeperErrors(namespace="", ledger_name="", cursor_name:"") +brk_ml_cursor_nonContiguousDeletedMessagesRange(namespace="", ledger_name="", cursor_name:"") + +``` + +Those metrics are added in the Prometheus interface. You can monitor and check the metrics stats in Grafana. + +### Function and connector stats + +You can collect functions worker stats from `functions-worker` and export the metrics in JSON formats, which contain functions worker JVM metrics. + +``` + +pulsar-admin functions-worker monitoring-metrics + +``` + +You can collect functions and connectors metrics from `functions-worker` and export the metrics in JSON formats. + +``` + +pulsar-admin functions-worker function-stats + +``` + +The aggregated functions and connectors metrics can be exposed in Prometheus formats as below. You can get [`FUNCTIONS_WORKER_ADDRESS`](http://pulsar.apache.org/docs/en/next/functions-worker/) and `WORKER_PORT` from the `functions_worker.yml` file. + +``` + +http://$FUNCTIONS_WORKER_ADDRESS:$WORKER_PORT/metrics: + +``` + +## Configure Prometheus + +You can use Prometheus to collect all the metrics exposed for Pulsar components and set up [Grafana](https://grafana.com/) dashboards to display the metrics and monitor your Pulsar cluster. For details, refer to [Prometheus guide](https://prometheus.io/docs/introduction/getting_started/). + +When you run Pulsar on bare metal, you can provide the list of nodes to be probed. When you deploy Pulsar in a Kubernetes cluster, the monitoring is set up automatically. For details, refer to [Kubernetes instructions](helm-deploy.md). 
+ +## Dashboards + +When you collect time series statistics, the major problem is to make sure the number of dimensions attached to the data does not explode. Thus you only need to collect time series of metrics aggregated at the namespace level. + +### Pulsar per-topic dashboard + +The per-topic dashboard instructions are available at [Pulsar manager](administration-pulsar-manager.md). + +### Grafana + +You can use grafana to create dashboard driven by the data that is stored in Prometheus. + +When you deploy Pulsar on Kubernetes, a `pulsar-grafana` Docker image is enabled by default. You can use the docker image with the principal dashboards. + +Enter the command below to use the dashboard manually: + +```shell + +docker run -p3000:3000 \ + -e PROMETHEUS_URL=http://$PROMETHEUS_HOST:9090/ \ + apachepulsar/pulsar-grafana:latest + +``` + +The following are some Grafana dashboards examples: + +- [pulsar-grafana](http://pulsar.apache.org/docs/en/deploy-monitoring/#grafana): a Grafana dashboard that displays metrics collected in Prometheus for Pulsar clusters running on Kubernetes. +- [apache-pulsar-grafana-dashboard](https://github.com/streamnative/apache-pulsar-grafana-dashboard): a collection of Grafana dashboard templates for different Pulsar components running on both Kubernetes and on-premise machines. + + ## Alerting rules + You can set alerting rules according to your Pulsar environment. To configure alerting rules for Apache Pulsar, refer to [alerting rules](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/develop-binary-protocol.md b/site2/website/versioned_docs/version-2.9.x/develop-binary-protocol.md new file mode 100644 index 0000000000000..dfddb344ed96b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/develop-binary-protocol.md @@ -0,0 +1,581 @@ +--- +id: develop-binary-protocol +title: Pulsar binary protocol specification +sidebar_label: "Binary protocol" +original_id: develop-binary-protocol +--- + +Pulsar uses a custom binary protocol for communications between producers/consumers and brokers. This protocol is designed to support required features, such as acknowledgements and flow control, while ensuring maximum transport and implementation efficiency. + +Clients and brokers exchange *commands* with each other. Commands are formatted as binary [protocol buffer](https://developers.google.com/protocol-buffers/) (aka *protobuf*) messages. The format of protobuf commands is specified in the [`PulsarApi.proto`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto) file and also documented in the [Protobuf interface](#protobuf-interface) section below. + +> ### Connection sharing +> Commands for different producers and consumers can be interleaved and sent through the same connection without restriction. + +All commands associated with Pulsar's protocol are contained in a [`BaseCommand`](#pulsar.proto.BaseCommand) protobuf message that includes a [`Type`](#pulsar.proto.Type) [enum](https://developers.google.com/protocol-buffers/docs/proto#enum) with all possible subcommands as optional fields. `BaseCommand` messages can specify only one subcommand. + +## Framing + +Since protobuf doesn't provide any sort of message frame, all messages in the Pulsar protocol are prepended with a 4-byte field that specifies the size of the frame. The maximum allowable size of a single frame is 5 MB. 
+ +The Pulsar protocol allows for two types of commands: + +1. **Simple commands** that do not carry a message payload. +2. **Payload commands** that bear a payload that is used when publishing or delivering messages. In payload commands, the protobuf command data is followed by protobuf [metadata](#message-metadata) and then the payload, which is passed in raw format outside of protobuf. All sizes are passed as 4-byte unsigned big endian integers. + +> Message payloads are passed in raw format rather than protobuf format for efficiency reasons. + +### Simple commands + +Simple (payload-free) commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:------------|:----------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | + +### Payload commands + +Payload commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:-------------|:--------------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | +| magicNumber | A 2-byte byte array (`0x0e01`) identifying the current format | 2 | +| checksum | A [CRC32-C checksum](http://www.evanjones.ca/crc32c.html) of everything that comes after it | 4 | +| metadataSize | The size of the message [metadata](#message-metadata) | 4 | +| metadata | The message [metadata](#message-metadata) stored as a binary protobuf message | | +| payload | Anything left in the 
frame is considered the payload and can include any sequence of bytes | | + +## Message metadata + +Message metadata is stored alongside the application-specified payload as a serialized protobuf message. Metadata is created by the producer and passed on unchanged to the consumer. + +| Field | Description | +|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `producer_name` | The name of the producer that published the message | +| `sequence_id` | The sequence ID of the message, assigned by producer | +| `publish_time` | The publish timestamp in Unix time (i.e. as the number of milliseconds since January 1st, 1970 in UTC) | +| `properties` | A sequence of key/value pairs (using the [`KeyValue`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto#L32) message). These are application-defined keys and values with no special meaning to Pulsar. | +| `replicated_from` *(optional)* | Indicates that the message has been replicated and specifies the name of the [cluster](reference-terminology.md#cluster) where the message was originally published | +| `partition_key` *(optional)* | While publishing on a partition topic, if the key is present, the hash of the key is used to determine which partition to choose. Partition key is used as the message key. 
| +| `compression` *(optional)* | Signals that payload has been compressed and with which compression library | +| `uncompressed_size` *(optional)* | If compression is used, the producer must fill the uncompressed size field with the original payload size | +| `num_messages_in_batch` *(optional)* | If this message is really a [batch](#batch-messages) of multiple entries, this field must be set to the number of messages in the batch | + +### Batch messages + +When using batch messages, the payload contains a list of entries, +each of them with its individual metadata, defined by the `SingleMessageMetadata` +object. + + +For a single batch, the payload format will look like this: + + +| Field | Description | +|:--------------|:------------------------------------------------------------| +| metadataSizeN | The size of the single message metadata serialized Protobuf | +| metadataN | Single message metadata | +| payloadN | Message payload passed by application | + +Each metadata field looks like this: + +| Field | Description | +|:---------------------------|:--------------------------------------------------------| +| properties | Application-defined properties | +| partition key *(optional)* | Key to indicate the hashing to a particular partition | +| payload_size | Size of the payload for the single message in the batch | + +When compression is enabled, the whole batch will be compressed at once. + +## Interactions + +### Connection establishment + +After opening a TCP connection to a broker, typically on port 6650, the client +is responsible for initiating the session. + +![Connect interaction](/assets/binary-protocol-connect.png) + +After receiving a `Connected` response from the broker, the client can +consider the connection ready to use. Alternatively, if the broker doesn't +validate the client authentication, it will reply with an `Error` command and +close the TCP connection. 
+ +Example: + +```protobuf + +message CommandConnect { + "client_version" : "Pulsar-Client-Java-v1.15.2", + "auth_method_name" : "my-authentication-plugin", + "auth_data" : "my-auth-data", + "protocol_version" : 6 +} + +``` + +Fields: + * `client_version` → String based identifier. Format is not enforced + * `auth_method_name` → *(optional)* Name of the authentication plugin if auth + enabled + * `auth_data` → *(optional)* Plugin specific authentication data + * `protocol_version` → Indicates the protocol version supported by the + client. Broker will not send commands introduced in newer revisions of the + protocol. Broker might be enforcing a minimum version + +```protobuf + +message CommandConnected { + "server_version" : "Pulsar-Broker-v1.15.2", + "protocol_version" : 6 +} + +``` + +Fields: + * `server_version` → String identifier of broker version + * `protocol_version` → Protocol version supported by the broker. Client + must not attempt to send commands introduced in newer revisions of the + protocol + +### Keep Alive + +To identify prolonged network partitions between clients and brokers or cases +in which a machine crashes without interrupting the TCP connection on the remote +end (eg: power outage, kernel panic, hard reboot...), we have introduced a +mechanism to probe for the availability status of the remote peer. + +Both clients and brokers are sending `Ping` commands periodically and they will +close the socket if a `Pong` response is not received within a timeout (default +used by broker is 60s). + +A valid implementation of a Pulsar client is not required to send the `Ping` +probe, though it is required to promptly reply after receiving one from the +broker in order to prevent the remote side from forcibly closing the TCP connection. + + +### Producer + +In order to send messages, a client needs to establish a producer. When creating +a producer, the broker will first verify that this particular client is +authorized to publish on the topic. 
+ +Once the client gets confirmation of the producer creation, it can publish +messages to the broker, referring to the producer id negotiated before. + +![Producer interaction](/assets/binary-protocol-producer.png) + +##### Command Producer + +```protobuf + +message CommandProducer { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "producer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete topic name to where you want to create the producer on + * `producer_id` → Client generated producer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `producer_name` → *(optional)* If a producer name is specified, the name will + be used, otherwise the broker will generate a unique name. Generated + producer name is guaranteed to be globally unique. Implementations are + expected to let the broker generate a new producer name when the producer + is initially created, then reuse it when recreating the producer after + reconnections. + +The broker will reply with either `ProducerSuccess` or `Error` commands. + +##### Command ProducerSuccess + +```protobuf + +message CommandProducerSuccess { + "request_id" : 1, + "producer_name" : "generated-unique-producer-name" +} + +``` + +Parameters: + * `request_id` → Original id of the `CreateProducer` request + * `producer_name` → Generated globally unique producer name or the name + specified by the client, if any. + +##### Command Send + +Command `Send` is used to publish a new message within the context of an +already existing producer. This command is used in a frame that includes command +as well as message payload, for which the complete format is specified in the [payload commands](#payload-commands) section. 
+ +```protobuf + +message CommandSend { + "producer_id" : 1, + "sequence_id" : 0, + "num_messages" : 1 +} + +``` + +Parameters: + * `producer_id` → id of an existing producer + * `sequence_id` → each message has an associated sequence id which is expected + to be implemented with a counter starting at 0. The `SendReceipt` that + acknowledges the effective publishing of a message will refer to it by + its sequence id. + * `num_messages` → *(optional)* Used when publishing a batch of messages at + once. + +##### Command SendReceipt + +After a message has been persisted on the configured number of replicas, the +broker will send the acknowledgment receipt to the producer. + +```protobuf + +message CommandSendReceipt { + "producer_id" : 1, + "sequence_id" : 0, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `producer_id` → id of producer originating the send request + * `sequence_id` → sequence id of the published message + * `message_id` → message id assigned by the system to the published message. + Unique within a single cluster. Message id is composed of 2 longs, `ledgerId` + and `entryId`, that reflect that this unique id is assigned when appending + to a BookKeeper ledger + + +##### Command CloseProducer + +**Note**: *This command can be sent by either producer or broker*. + +When receiving a `CloseProducer` command, the broker will stop accepting any +more messages for the producer, wait until all pending messages are persisted +and then reply `Success` to the client. + +The broker can send a `CloseProducer` command to client when it's performing +a graceful failover (eg: broker is being restarted, or the topic is being unloaded +by load balancer to be transferred to a different broker). + +When receiving the `CloseProducer`, the client is expected to go through the +service discovery lookup again and recreate the producer again. The TCP +connection is not affected. 
+ +### Consumer + +A consumer is used to attach to a subscription and consume messages from it. +After every reconnection, a client needs to subscribe to the topic. If a +subscription is not already there, a new one will be created. + +![Consumer](/assets/binary-protocol-consumer.png) + +#### Flow control + +After the consumer is ready, the client needs to *give permission* to the +broker to push messages. This is done with the `Flow` command. + +A `Flow` command gives additional *permits* to send messages to the consumer. +A typical consumer implementation will use a queue to accumulate these messages +before the application is ready to consume them. + +After the application has dequeued half of the messages in the queue, the consumer +sends permits to the broker to ask for more messages (equals to half of the messages in the queue). + +For example, if the queue size is 1000 and the consumer consumes 500 messages in the queue. +Then the consumer sends permits to the broker to ask for 500 messages. + +##### Command Subscribe + +```protobuf + +message CommandSubscribe { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "subscription" : "my-subscription-name", + "subType" : "Exclusive", + "consumer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete topic name to where you want to create the consumer on + * `subscription` → Subscription name + * `subType` → Subscription type: Exclusive, Shared, Failover, Key_Shared + * `consumer_id` → Client generated consumer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `consumer_name` → *(optional)* Clients can specify a consumer name. This + name can be used to track a particular consumer in the stats. 
Also, in + Failover subscription type, the name is used to decide which consumer is + elected as *master* (the one receiving messages): consumers are sorted by + their consumer name and the first one is elected master. + +##### Command Flow + +```protobuf + +message CommandFlow { + "consumer_id" : 1, + "messagePermits" : 1000 +} + +``` + +Parameters: +* `consumer_id` → Id of an already established consumer +* `messagePermits` → Number of additional permits to grant to the broker for + pushing more messages + +##### Command Message + +Command `Message` is used by the broker to push messages to an existing consumer, +within the limits of the given permits. + + +This command is used in a frame that includes the message payload as well, for +which the complete format is specified in the [payload commands](#payload-commands) +section. + +```protobuf + +message CommandMessage { + "consumer_id" : 1, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +##### Command Ack + +An `Ack` is used to signal to the broker that a given message has been +successfully processed by the application and can be discarded by the broker. + +In addition, the broker will also maintain the consumer position based on the +acknowledged messages. + +```protobuf + +message CommandAck { + "consumer_id" : 1, + "ack_type" : "Individual", + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `consumer_id` → Id of an already established consumer + * `ack_type` → Type of acknowledgment: `Individual` or `Cumulative` + * `message_id` → Id of the message to acknowledge + * `validation_error` → *(optional)* Indicates that the consumer has discarded + the messages due to: `UncompressedSizeCorruption`, + `DecompressionError`, `ChecksumMismatch`, `BatchDeSerializeError` + +##### Command CloseConsumer + +**Note**: *This command can be sent by either consumer or broker*. 
+ +This command behaves the same as [`CloseProducer`](#command-closeproducer) + +##### Command RedeliverUnacknowledgedMessages + +A consumer can ask the broker to redeliver some or all of the pending messages +that were pushed to that particular consumer and not yet acknowledged. + +The protobuf object accepts a list of message ids that the consumer wants to +be redelivered. If the list is empty, the broker will redeliver all the +pending messages. + +On redelivery, messages can be sent to the same consumer or, in the case of a +shared subscription, spread across all available consumers. + + +##### Command ReachedEndOfTopic + +This is sent by a broker to a particular consumer, whenever the topic +has been "terminated" and all the messages on the subscription were +acknowledged. + +The client should use this command to notify the application that no more +messages are coming from the consumer. + +##### Command ConsumerStats + +This command is sent by the client to retrieve Subscriber and Consumer level +stats from the broker. +Parameters: + * `request_id` → Id of the request, used to correlate the request + and the response. + * `consumer_id` → Id of an already established consumer. + +##### Command ConsumerStatsResponse + +This is the broker's response to ConsumerStats request by the client. +It contains the Subscriber and Consumer level stats of the `consumer_id` sent in the request. +If the `error_code` or the `error_message` field is set it indicates that the request has failed. + +##### Command Unsubscribe + +This command is sent by the client to unsubscribe the `consumer_id` from the associated topic. +Parameters: + * `request_id` → Id of the request. + * `consumer_id` → Id of an already established consumer which needs to unsubscribe. + + +## Service discovery + +### Topic lookup + +Topic lookup needs to be performed each time a client needs to create or +reconnect a producer or a consumer. 
Lookup is used to discover which particular +broker is serving the topic we are about to use. + +Lookup can be done with a REST call as described in the [admin API](admin-api-topics.md#look-up-topics-owner-broker) +docs. + +Since Pulsar-1.16 it is also possible to perform the lookup within the binary +protocol. + +For the sake of example, let's assume we have a service discovery component +running at `pulsar://broker.example.com:6650` + +Individual brokers will be running at `pulsar://broker-1.example.com:6650`, +`pulsar://broker-2.example.com:6650`, ... + +A client can use a connection to the discovery service host to issue a +`LookupTopic` command. The response can either be a broker hostname to +connect to, or a broker hostname to which to retry the lookup. + +The `LookupTopic` command has to be used in a connection that has already +gone through the `Connect` / `Connected` initial handshake. + +![Topic lookup](/assets/binary-protocol-topic-lookup.png) + +```protobuf + +message CommandLookupTopic { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1, + "authoritative" : false +} + +``` + +Fields: + * `topic` → Topic name to look up + * `request_id` → Id of the request that will be passed with its response + * `authoritative` → Initial lookup request should use false. 
When following a + redirect response, the client should pass the same value contained in the + response + +##### LookupTopicResponse + +Example of response with successful lookup: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Connect", + "brokerServiceUrl" : "pulsar://broker-1.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-1.example.com:6651", + "authoritative" : true +} + +``` + +Example of lookup response with redirection: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Redirect", + "brokerServiceUrl" : "pulsar://broker-2.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-2.example.com:6651", + "authoritative" : true +} + +``` + +In this second case, we need to reissue the `LookupTopic` command request +to `broker-2.example.com` and this broker will be able to give a definitive +answer to the lookup request. + +### Partitioned topics discovery + +Partitioned topics metadata discovery is used to find out if a topic is a +"partitioned topic" and how many partitions were set up. + +If the topic is marked as "partitioned", the client is expected to create +multiple producers or consumers, one for each partition, using the `partition-X` +suffix. + +This information only needs to be retrieved the first time a producer or +consumer is created. There is no need to do this after reconnections. + +The discovery of partitioned topics metadata works very similarly to the topic +lookup. The client sends a request to the service discovery address and the +response will contain the actual metadata. 
+ +##### Command PartitionedTopicMetadata + +```protobuf + +message CommandPartitionedTopicMetadata { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1 +} + +``` + +Fields: + * `topic` → the topic for which to check the partitions metadata + * `request_id` → Id of the request that will be passed with its response + + +##### Command PartitionedTopicMetadataResponse + +Example of response with metadata: + +```protobuf + +message CommandPartitionedTopicMetadataResponse { + "request_id" : 1, + "response" : "Success", + "partitions" : 32 +} + +``` + +## Protobuf interface + +All Pulsar's Protobuf definitions can be found {@inject: github:here:/pulsar-common/src/main/proto/PulsarApi.proto}. diff --git a/site2/website/versioned_docs/version-2.9.x/develop-load-manager.md b/site2/website/versioned_docs/version-2.9.x/develop-load-manager.md new file mode 100644 index 0000000000000..509209b6a852d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/develop-load-manager.md @@ -0,0 +1,227 @@ +--- +id: develop-load-manager +title: Modular load manager +sidebar_label: "Modular load manager" +original_id: develop-load-manager +--- + +The *modular load manager*, implemented in [`ModularLoadManagerImpl`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/ModularLoadManagerImpl.java), is a flexible alternative to the previously implemented load manager, [`SimpleLoadManagerImpl`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/SimpleLoadManagerImpl.java), which attempts to simplify how load is managed while also providing abstractions so that complex load management strategies may be implemented. + +## Usage + +There are two ways that you can enable the modular load manager: + +1. 
Change the value of the `loadManagerClassName` parameter in `conf/broker.conf` from `org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl` to `org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl`. +2. Use the `pulsar-admin` tool. Here's an example: + + ```shell + + $ pulsar-admin update-dynamic-config \ + --config loadManagerClassName \ + --value org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl + + ``` + + You can use the same method to change back to the original value. In either case, any mistake in specifying the load manager will cause Pulsar to default to `SimpleLoadManagerImpl`. + +## Verification + +There are a few different ways to determine which load manager is being used: + +1. Use `pulsar-admin` to examine the `loadManagerClassName` element: + + ```shell + + $ bin/pulsar-admin brokers get-all-dynamic-config + { + "loadManagerClassName" : "org.apache.pulsar.broker.loadbalance.impl.ModularLoadManagerImpl" + } + + ``` + + If there is no `loadManagerClassName` element, then the default load manager is used. + +2. Consult a ZooKeeper load report. With the modular load manager, the load report in `/loadbalance/brokers/...` will have many differences. For example, the `systemResourceUsage` sub-elements (`bandwidthIn`, `bandwidthOut`, etc.) are now all at the top level. 
Here is an example load report from the module load manager: + + ```json + + { + "bandwidthIn": { + "limit": 10240000.0, + "usage": 4.256510416666667 + }, + "bandwidthOut": { + "limit": 10240000.0, + "usage": 5.287239583333333 + }, + "bundles": [], + "cpu": { + "limit": 2400.0, + "usage": 5.7353247655435915 + }, + "directMemory": { + "limit": 16384.0, + "usage": 1.0 + } + } + + ``` + + With the simple load manager, the load report in `/loadbalance/brokers/...` will look like this: + + ```json + + { + "systemResourceUsage": { + "bandwidthIn": { + "limit": 10240000.0, + "usage": 0.0 + }, + "bandwidthOut": { + "limit": 10240000.0, + "usage": 0.0 + }, + "cpu": { + "limit": 2400.0, + "usage": 0.0 + }, + "directMemory": { + "limit": 16384.0, + "usage": 1.0 + }, + "memory": { + "limit": 8192.0, + "usage": 3903.0 + } + } + } + + ``` + +3. The command-line [broker monitor](reference-cli-tools.md#monitor-brokers) will have a different output format depending on which load manager implementation is being used. 
+ + Here is an example from the modular load manager: + + ``` + + =================================================================================================================== + ||SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.00 |48.33 |0.01 |0.00 |0.00 |48.33 || + ||COUNT |TOPIC |BUNDLE |PRODUCER |CONSUMER |BUNDLE + |BUNDLE - || + || |4 |4 |0 |2 |4 |0 || + ||LATEST |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + ||SHORT |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + ||LONG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.00 |0.00 |0.00 || + =================================================================================================================== + + ``` + + Here is an example from the simple load manager: + + ``` + + =================================================================================================================== + ||COUNT |TOPIC |BUNDLE |PRODUCER |CONSUMER |BUNDLE + |BUNDLE - || + || |4 |4 |0 |2 |0 |0 || + ||RAW SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.25 |47.94 |0.01 |0.00 |0.00 |47.94 || + ||ALLOC SYSTEM |CPU % |MEMORY % |DIRECT % |BW IN % |BW OUT % |MAX % || + || |0.20 |1.89 | |1.27 |3.21 |3.21 || + ||RAW MSG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |0.00 |0.00 |0.00 |0.01 |0.01 |0.01 || + ||ALLOC MSG |MSG/S IN |MSG/S OUT |TOTAL |KB/S IN |KB/S OUT |TOTAL || + || |54.84 |134.48 |189.31 |126.54 |320.96 |447.50 || + =================================================================================================================== + + ``` + +It is important to note that the module load manager is _centralized_, meaning that all requests to assign a bundle---whether it's been seen before or whether this is the first time---only get handled by the _lead_ broker (which can change over time). 
To determine the current lead broker, examine the `/loadbalance/leader` node in ZooKeeper. + +## Implementation + +### Data + +The data monitored by the modular load manager is contained in the [`LoadData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/LoadData.java) class. +Here, the available data is subdivided into the bundle data and the broker data. + +#### Broker + +The broker data is contained in the [`BrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BrokerData.java) class. It is further subdivided into two parts, +one being the local data which every broker individually writes to ZooKeeper, and the other being the historical broker +data which is written to ZooKeeper by the leader broker. + +##### Local Broker Data +The local broker data is contained in the class [`LocalBrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/java/org/apache/pulsar/policies/data/loadbalancer/LocalBrokerData.java) and provides information about the following resources: + +* CPU usage +* JVM heap memory usage +* Direct memory usage +* Bandwidth in/out usage +* Most recent total message rate in/out across all bundles +* Total number of topics, bundles, producers, and consumers +* Names of all bundles assigned to this broker +* Most recent changes in bundle assignments for this broker + +The local broker data is updated periodically according to the service configuration +"loadBalancerReportUpdateMaxIntervalMinutes". 
After any broker updates their local broker data, the leader broker will +receive the update immediately via a ZooKeeper watch, where the local data is read from the ZooKeeper node +`/loadbalance/brokers/` + +##### Historical Broker Data + +The historical broker data is contained in the [`TimeAverageBrokerData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/TimeAverageBrokerData.java) class. + +In order to reconcile the need to make good decisions in a steady-state scenario and make reactive decisions in a critical scenario, the historical data is split into two parts: the short-term data for reactive decisions, and the long-term data for steady-state decisions. Both time frames maintain the following information: + +* Message rate in/out for the entire broker +* Message throughput in/out for the entire broker + +Unlike the bundle data, the broker data does not maintain samples for the global broker message rates and throughputs, which are not expected to remain steady as new bundles are removed or added. Instead, this data is aggregated over the short-term and long-term data for the bundles. See the section on bundle data to understand how that data is collected and maintained. + +The historical broker data is updated for each broker in memory by the leader broker whenever any broker writes their local data to ZooKeeper. Then, the historical data is written to ZooKeeper by the leader broker periodically according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`. + +##### Bundle Data + +The bundle data is contained in the [`BundleData`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/BundleData.java) class. Like the historical broker data, the bundle data is split into a short-term and a long-term time frame. 
The information maintained in each time frame: + +* Message rate in/out for this bundle +* Message Throughput In/Out for this bundle +* Current number of samples for this bundle + +The time frames are implemented by maintaining the average of these values over a set, limited number of samples, where +the samples are obtained through the message rate and throughput values in the local data. Thus, if the update interval +for the local data is 2 minutes, the number of short samples is 10 and the number of long samples is 1000, the +short-term data is maintained over a period of `10 samples * 2 minutes / sample = 20 minutes`, while the long-term +data is similarly over a period of 2000 minutes. Whenever there are not enough samples to satisfy a given time frame, +the average is taken only over the existing samples. When no samples are available, default values are assumed until +they are overwritten by the first sample. Currently, the default values are + +* Message rate in/out: 50 messages per second both ways +* Message throughput in/out: 50KB per second both ways + +The bundle data is updated in memory on the leader broker whenever any broker writes their local data to ZooKeeper. +Then, the bundle data is written to ZooKeeper by the leader broker periodically at the same time as the historical +broker data, according to the configuration `loadBalancerResourceQuotaUpdateIntervalMinutes`. + +### Traffic Distribution + +The modular load manager uses the abstraction provided by [`ModularLoadManagerStrategy`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/ModularLoadManagerStrategy.java) to make decisions about bundle assignment. The strategy makes a decision by considering the service configuration, the entire load data, and the bundle data for the bundle to be assigned. 
Currently, the only supported strategy is [`LeastLongTermMessageRate`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LeastLongTermMessageRate.java), though soon users will have the ability to inject their own strategies if desired. + +#### Least Long Term Message Rate Strategy + +As its name suggests, the least long term message rate strategy attempts to distribute bundles across brokers so that +the message rate in the long-term time window for each broker is roughly the same. However, simply balancing load based +on message rate does not handle the issue of asymmetric resource burden per message on each broker. Thus, the system +resource usages, which are CPU, memory, direct memory, bandwidth in, and bandwidth out, are also considered in the +assignment process. This is done by weighting the final message rate according to +`1 / (overload_threshold - max_usage)`, where `overload_threshold` corresponds to the configuration +`loadBalancerBrokerOverloadedThresholdPercentage` and `max_usage` is the maximum proportion among the system resources +that is being utilized by the candidate broker. This multiplier ensures that machines that are being more heavily taxed +by the same message rates will receive less load. In particular, it tries to ensure that if one machine is overloaded, +then all machines are approximately overloaded. In the case in which a broker's max usage exceeds the overload +threshold, that broker is not considered for bundle assignment. If all brokers are overloaded, the bundle is randomly +assigned. 
+ diff --git a/site2/website/versioned_docs/version-2.9.x/develop-schema.md b/site2/website/versioned_docs/version-2.9.x/develop-schema.md new file mode 100644 index 0000000000000..2d4461a5ea2b5 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/develop-schema.md @@ -0,0 +1,62 @@ +--- +id: develop-schema +title: Custom schema storage +sidebar_label: "Custom schema storage" +original_id: develop-schema +--- + +By default, Pulsar stores data type [schemas](concepts-schema-registry.md) in [Apache BookKeeper](https://bookkeeper.apache.org) (which is deployed alongside Pulsar). You can, however, use another storage system if you wish. This doc walks you through creating your own schema storage implementation. + +In order to use a non-default (i.e. non-BookKeeper) storage system for Pulsar schemas, you need to implement two Java interfaces: [`SchemaStorage`](#schemastorage-interface) and [`SchemaStorageFactory`](#schemastoragefactory-interface). + +## SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java + +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} + +``` + +> For a full-fledged example schema storage implementation, see the [`BookKeeperSchemaStorage`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. 
+ +## SchemaStorageFactory interface + +```java + +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} + +``` + +> For a full-fledged example schema storage factory implementation, see the [`BookKeeperSchemaStorageFactory`](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +## Deployment + +In order to use your custom schema storage implementation, you'll need to: + +1. Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. +1. Add that jar to the `lib` folder in your Pulsar [binary or source distribution](getting-started-standalone.md#installing-pulsar). +1. Change the `schemaRegistryStorageClassName` configuration in [`broker.conf`](reference-configuration.md#broker) to your custom factory class (i.e. the `SchemaStorageFactory` implementation, not the `SchemaStorage` implementation). +1. Start up Pulsar. diff --git a/site2/website/versioned_docs/version-2.9.x/develop-tools.md b/site2/website/versioned_docs/version-2.9.x/develop-tools.md new file mode 100644 index 0000000000000..bc7c29e836e6a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/develop-tools.md @@ -0,0 +1,112 @@ +--- +id: develop-tools +title: Simulation tools +sidebar_label: "Simulation tools" +original_id: develop-tools +--- + +It is sometimes necessary to create a test environment and incur artificial load to observe how well load managers +handle the load. The load simulation controller, the load simulation client, and the broker monitor were created as an +effort to make creating this load and observing the effects on the managers easier. + +## Simulation Client +The simulation client is a machine which will create and subscribe to topics with configurable message rates and sizes. 
+Because it is sometimes necessary in simulating large load to use multiple client machines, the user does not interact +with the simulation client directly, but instead delegates their requests to the simulation controller, which will then +send signals to clients to start incurring load. The client implementation is in the class +`org.apache.pulsar.testclient.LoadSimulationClient`. + +### Usage +To start a simulation client, use the `pulsar-perf` script with the command `simulation-client` as follows: + +``` + +pulsar-perf simulation-client --port --service-url + +``` + +The client will then be ready to receive controller commands. +## Simulation Controller +The simulation controller sends signals to the simulation clients, requesting them to create new topics, stop old +topics, change the load incurred by topics, as well as several other tasks. It is implemented in the class +`org.apache.pulsar.testclient.LoadSimulationController` and presents a shell to the user as an interface for sending +commands. + +### Usage +To start a simulation controller, use the `pulsar-perf` script with the command `simulation-controller` as follows: + +``` + +pulsar-perf simulation-controller --cluster --client-port +--clients + +``` + +The clients should already be started before the controller is started. You will then be presented with a simple prompt, +where you can issue commands to simulation clients. Arguments often refer to tenant names, namespace names, and topic +names. In all cases, the BASE names of the tenants, namespaces, and topics are used. For example, for the topic +`persistent://my_tenant/my_cluster/my_namespace/my_topic`, the tenant name is `my_tenant`, the namespace name is +`my_namespace`, and the topic name is `my_topic`. 
The controller can perform the following actions: + +* Create a topic with a producer and a consumer + * `trade [--rate ] + [--rand-rate ,] + [--size ]` +* Create a group of topics with a producer and a consumer + * `trade_group [--rate ] + [--rand-rate ,] + [--separation ] [--size ] + [--topics-per-namespace ]` +* Change the configuration of an existing topic + * `change [--rate ] + [--rand-rate ,] + [--size ]` +* Change the configuration of a group of topics + * `change_group [--rate ] [--rand-rate ,] + [--size ] [--topics-per-namespace ]` +* Shutdown a previously created topic + * `stop ` +* Shutdown a previously created group of topics + * `stop_group ` +* Copy the historical data from one ZooKeeper to another and simulate based on the message rates and sizes in that +history + * `copy [--rate-multiplier value]` +* Simulate the load of the historical data on the current ZooKeeper (should be same ZooKeeper being simulated on) + * `simulate [--rate-multiplier value]` +* Stream the latest data from the given active ZooKeeper to simulate the real-time load of that ZooKeeper. + * `stream [--rate-multiplier value]` + +The "group" arguments in these commands allow the user to create or affect multiple topics at once. Groups are created +when calling the `trade_group` command, and all topics from these groups may be subsequently modified or stopped +with the `change_group` and `stop_group` commands respectively. All ZooKeeper arguments are of the form +`zookeeper_host:port`. + +### Difference Between Copy, Simulate, and Stream +The commands `copy`, `simulate`, and `stream` are very similar but have significant differences. `copy` is used when +you want to simulate the load of a static, external ZooKeeper on the ZooKeeper you are simulating on. 
Thus, +`source zookeeper` should be the ZooKeeper you want to copy and `target zookeeper` should be the ZooKeeper you are +simulating on, and then it will get the full benefit of the historical data of the source in both load manager +implementations. `simulate` on the other hand takes in only one ZooKeeper, the one you are simulating on. It assumes +that you are simulating on a ZooKeeper that has historical data for `SimpleLoadManagerImpl` and creates equivalent +historical data for `ModularLoadManagerImpl`. Then, the load according to the historical data is simulated by the +clients. Finally, `stream` takes in an active ZooKeeper different than the ZooKeeper being simulated on and streams +load data from it and simulates the real-time load. In all cases, the optional `rate-multiplier` argument allows the +user to simulate some proportion of the load. For instance, using `--rate-multiplier 0.05` will cause messages to +be sent at only `5%` of the rate of the load that is being simulated. + +## Broker Monitor +To observe the behavior of the load manager in these simulations, one may utilize the broker monitor, which is +implemented in `org.apache.pulsar.testclient.BrokerMonitor`. The broker monitor will print tabular load data to the +console as it is updated using watchers. + +### Usage +To start a broker monitor, use the `monitor-brokers` command in the `pulsar-perf` script: + +``` + +pulsar-perf monitor-brokers --connect-string + +``` + +The console will then continuously print load data until it is interrupted. 
+ diff --git a/site2/website/versioned_docs/version-2.9.x/developing-binary-protocol.md b/site2/website/versioned_docs/version-2.9.x/developing-binary-protocol.md new file mode 100644 index 0000000000000..a18a8b8d56172 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/developing-binary-protocol.md @@ -0,0 +1,606 @@ +--- +id: developing-binary-protocol +title: Pulsar binary protocol specification +sidebar_label: "Binary protocol" +original_id: developing-binary-protocol +--- + +Pulsar uses a custom binary protocol for communications between producers/consumers and brokers. This protocol is designed to support required features, such as acknowledgements and flow control, while ensuring maximum transport and implementation efficiency. + +Clients and brokers exchange *commands* with each other. Commands are formatted as binary [protocol buffer](https://developers.google.com/protocol-buffers/) (aka *protobuf*) messages. The format of protobuf commands is specified in the [`PulsarApi.proto`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto) file and also documented in the [Protobuf interface](#protobuf-interface) section below. + +> ### Connection sharing +> Commands for different producers and consumers can be interleaved and sent through the same connection without restriction. + +All commands associated with Pulsar's protocol are contained in a [`BaseCommand`](#pulsar.proto.BaseCommand) protobuf message that includes a [`Type`](#pulsar.proto.Type) [enum](https://developers.google.com/protocol-buffers/docs/proto#enum) with all possible subcommands as optional fields. `BaseCommand` messages can specify only one subcommand. + +## Framing + +Since protobuf doesn't provide any sort of message frame, all messages in the Pulsar protocol are prepended with a 4-byte field that specifies the size of the frame. The maximum allowable size of a single frame is 5 MB. + +The Pulsar protocol allows for two types of commands: + +1. 
**Simple commands** that do not carry a message payload. +2. **Payload commands** that bear a payload that is used when publishing or delivering messages. In payload commands, the protobuf command data is followed by protobuf [metadata](#message-metadata) and then the payload, which is passed in raw format outside of protobuf. All sizes are passed as 4-byte unsigned big endian integers. + +> Message payloads are passed in raw format rather than protobuf format for efficiency reasons. + +### Simple commands + +Simple (payload-free) commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:------------|:----------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | + +### Payload commands + +Payload commands have this basic structure: + +| Component | Description | Size (in bytes) | +|:-------------|:--------------------------------------------------------------------------------------------|:----------------| +| totalSize | The size of the frame, counting everything that comes after it (in bytes) | 4 | +| commandSize | The size of the protobuf-serialized command | 4 | +| message | The protobuf message serialized in a raw binary format (rather than in protobuf format) | | +| magicNumber | A 2-byte byte array (`0x0e01`) identifying the current format | 2 | +| checksum | A [CRC32-C checksum](http://www.evanjones.ca/crc32c.html) of everything that comes after it | 4 | +| metadataSize | The size of the message [metadata](#message-metadata) | 4 | +| metadata | The message [metadata](#message-metadata) stored as a binary protobuf message | | +| payload | Anything left in the frame is considered the payload and can include any sequence of 
bytes | | + +## Message metadata + +Message metadata is stored alongside the application-specified payload as a serialized protobuf message. Metadata is created by the producer and passed on unchanged to the consumer. + +| Field | Description | +|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `producer_name` | The name of the producer that published the message | +| `sequence_id` | The sequence ID of the message, assigned by producer | +| `publish_time` | The publish timestamp in Unix time (i.e. as the number of milliseconds since January 1st, 1970 in UTC) | +| `properties` | A sequence of key/value pairs (using the [`KeyValue`](https://github.com/apache/pulsar/blob/master/pulsar-common/src/main/proto/PulsarApi.proto#L32) message). These are application-defined keys and values with no special meaning to Pulsar. | +| `replicated_from` *(optional)* | Indicates that the message has been replicated and specifies the name of the [cluster](reference-terminology.md#cluster) where the message was originally published | +| `partition_key` *(optional)* | While publishing on a partition topic, if the key is present, the hash of the key is used to determine which partition to choose. Partition key is used as the message key. 
| +| `compression` *(optional)* | Signals that payload has been compressed and with which compression library | +| `uncompressed_size` *(optional)* | If compression is used, the producer must fill the uncompressed size field with the original payload size | +| `num_messages_in_batch` *(optional)* | If this message is really a [batch](#batch-messages) of multiple entries, this field must be set to the number of messages in the batch | + +### Batch messages + +When using batch messages, the payload will be containing a list of entries, +each of them with its individual metadata, defined by the `SingleMessageMetadata` +object. + + +For a single batch, the payload format will look like this: + + +| Field | Description | +|:--------------|:------------------------------------------------------------| +| metadataSizeN | The size of the single message metadata serialized Protobuf | +| metadataN | Single message metadata | +| payloadN | Message payload passed by application | + +Each metadata field looks like this; + +| Field | Description | +|:---------------------------|:--------------------------------------------------------| +| properties | Application-defined properties | +| partition key *(optional)* | Key to indicate the hashing to a particular partition | +| payload_size | Size of the payload for the single message in the batch | + +When compression is enabled, the whole batch will be compressed at once. + +## Interactions + +### Connection establishment + +After opening a TCP connection to a broker, typically on port 6650, the client +is responsible to initiate the session. + +![Connect interaction](/assets/binary-protocol-connect.png) + +After receiving a `Connected` response from the broker, the client can +consider the connection ready to use. Alternatively, if the broker doesn't +validate the client authentication, it will reply with an `Error` command and +close the TCP connection. 
+ +Example: + +```protobuf + +message CommandConnect { + "client_version" : "Pulsar-Client-Java-v1.15.2", + "auth_method_name" : "my-authentication-plugin", + "auth_data" : "my-auth-data", + "protocol_version" : 6 +} + +``` + +Fields: + * `client_version` → String based identifier. Format is not enforced + * `auth_method_name` → *(optional)* Name of the authentication plugin if auth + enabled + * `auth_data` → *(optional)* Plugin specific authentication data + * `protocol_version` → Indicates the protocol version supported by the + client. Broker will not send commands introduced in newer revisions of the + protocol. Broker might be enforcing a minimum version + +```protobuf + +message CommandConnected { + "server_version" : "Pulsar-Broker-v1.15.2", + "protocol_version" : 6 +} + +``` + +Fields: + * `server_version` → String identifier of broker version + * `protocol_version` → Protocol version supported by the broker. Client + must not attempt to send commands introduced in newer revisions of the + protocol + +### Keep Alive + +To identify prolonged network partitions between clients and brokers or cases +in which a machine crashes without interrupting the TCP connection on the remote +end (eg: power outage, kernel panic, hard reboot...), we have introduced a +mechanism to probe for the availability status of the remote peer. + +Both clients and brokers are sending `Ping` commands periodically and they will +close the socket if a `Pong` response is not received within a timeout (default +used by broker is 60s). + +A valid implementation of a Pulsar client is not required to send the `Ping` +probe, though it is required to promptly reply after receiving one from the +broker in order to prevent the remote side from forcibly closing the TCP connection. + + +### Producer + +In order to send messages, a client needs to establish a producer. When creating +a producer, the broker will first verify that this particular client is +authorized to publish on the topic. 
+ +Once the client gets confirmation of the producer creation, it can publish +messages to the broker, referring to the producer id negotiated before. + +![Producer interaction](/assets/binary-protocol-producer.png) + +##### Command Producer + +```protobuf + +message CommandProducer { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "producer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete topic name to where you want to create the producer on + * `producer_id` → Client generated producer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `producer_name` → *(optional)* If a producer name is specified, the name will + be used, otherwise the broker will generate a unique name. Generated + producer name is guaranteed to be globally unique. Implementations are + expected to let the broker generate a new producer name when the producer + is initially created, then reuse it when recreating the producer after + reconnections. + +The broker will reply with either `ProducerSuccess` or `Error` commands. + +##### Command ProducerSuccess + +```protobuf + +message CommandProducerSuccess { + "request_id" : 1, + "producer_name" : "generated-unique-producer-name" +} + +``` + +Parameters: + * `request_id` → Original id of the `CreateProducer` request + * `producer_name` → Generated globally unique producer name or the name + specified by the client, if any. + +##### Command Send + +Command `Send` is used to publish a new message within the context of an +already existing producer. This command is used in a frame that includes command +as well as message payload, for which the complete format is specified in the [payload commands](#payload-commands) section. 
+ +```protobuf + +message CommandSend { + "producer_id" : 1, + "sequence_id" : 0, + "num_messages" : 1 +} + +``` + +Parameters: + * `producer_id` → id of an existing producer + * `sequence_id` → each message has an associated sequence id which is expected + to be implemented with a counter starting at 0. The `SendReceipt` that + acknowledges the effective publishing of messages will refer to it by + its sequence id. + * `num_messages` → *(optional)* Used when publishing a batch of messages at + once. + +##### Command SendReceipt + +After a message has been persisted on the configured number of replicas, the +broker will send the acknowledgment receipt to the producer. + +```protobuf + +message CommandSendReceipt { + "producer_id" : 1, + "sequence_id" : 0, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `producer_id` → id of producer originating the send request + * `sequence_id` → sequence id of the published message + * `message_id` → message id assigned by the system to the published message + Unique within a single cluster. Message id is composed of 2 longs, `ledgerId` + and `entryId`, that reflect that this unique id is assigned when appending + to a BookKeeper ledger + + +##### Command CloseProducer + +**Note**: *This command can be sent by either producer or broker*. + +When receiving a `CloseProducer` command, the broker will stop accepting any +more messages for the producer, wait until all pending messages are persisted +and then reply `Success` to the client. + +The broker can send a `CloseProducer` command to client when it's performing +a graceful failover (eg: broker is being restarted, or the topic is being unloaded +by load balancer to be transferred to a different broker). + +When receiving the `CloseProducer`, the client is expected to go through the +service discovery lookup again and recreate the producer again. The TCP +connection is not affected. 
+ +### Consumer + +A consumer is used to attach to a subscription and consume messages from it. +After every reconnection, a client needs to subscribe to the topic. If a +subscription is not already there, a new one will be created. + +![Consumer](/assets/binary-protocol-consumer.png) + +#### Flow control + +After the consumer is ready, the client needs to *give permission* to the +broker to push messages. This is done with the `Flow` command. + +A `Flow` command gives additional *permits* to send messages to the consumer. +A typical consumer implementation will use a queue to accumulate these messages +before the application is ready to consume them. + +After the application has dequeued half of the messages in the queue, the consumer +sends permits to the broker to ask for more messages (equal to half of the messages in the queue). + +For example, if the queue size is 1000 and the consumer consumes 500 messages in the queue, +then the consumer sends permits to the broker to ask for 500 messages. + +##### Command Subscribe + +```protobuf + +message CommandSubscribe { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "subscription" : "my-subscription-name", + "subType" : "Exclusive", + "consumer_id" : 1, + "request_id" : 1 +} + +``` + +Parameters: + * `topic` → Complete name of the topic on which you want to create the consumer + * `subscription` → Subscription name + * `subType` → Subscription type: Exclusive, Shared, Failover, Key_Shared + * `consumer_id` → Client generated consumer identifier. Needs to be unique + within the same connection + * `request_id` → Identifier for this request. Used to match the response with + the originating request. Needs to be unique within the same connection + * `consumer_name` → *(optional)* Clients can specify a consumer name. This + name can be used to track a particular consumer in the stats. 
Also, in + Failover subscription type, the name is used to decide which consumer is + elected as *master* (the one receiving messages): consumers are sorted by + their consumer name and the first one is elected master. + +##### Command Flow + +```protobuf + +message CommandFlow { + "consumer_id" : 1, + "messagePermits" : 1000 +} + +``` + +Parameters: +* `consumer_id` → Id of an already established consumer +* `messagePermits` → Number of additional permits to grant to the broker for + pushing more messages + +##### Command Message + +Command `Message` is used by the broker to push messages to an existing consumer, +within the limits of the given permits. + + +This command is used in a frame that includes the message payload as well, for +which the complete format is specified in the [payload commands](#payload-commands) +section. + +```protobuf + +message CommandMessage { + "consumer_id" : 1, + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +##### Command Ack + +An `Ack` is used to signal to the broker that a given message has been +successfully processed by the application and can be discarded by the broker. + +In addition, the broker will also maintain the consumer position based on the +acknowledged messages. 
+ +```protobuf + +message CommandAck { + "consumer_id" : 1, + "ack_type" : "Individual", + "message_id" : { + "ledgerId" : 123, + "entryId" : 456 + } +} + +``` + +Parameters: + * `consumer_id` → Id of an already established consumer + * `ack_type` → Type of acknowledgment: `Individual` or `Cumulative` + * `message_id` → Id of the message to acknowledge + * `validation_error` → *(optional)* Indicates that the consumer has discarded + the messages due to: `UncompressedSizeCorruption`, + `DecompressionError`, `ChecksumMismatch`, `BatchDeSerializeError` + * `properties` → *(optional)* Reserved configuration items + * `txnid_most_bits` → *(optional)* Same as Transaction Coordinator ID, `txnid_most_bits` and `txnid_least_bits` + uniquely identify a transaction. + * `txnid_least_bits` → *(optional)* The ID of the transaction opened in a transaction coordinator, + `txnid_most_bits` and `txnid_least_bits` uniquely identify a transaction. + * `request_id` → *(optional)* ID for handling response and timeout. + + +##### Command AckResponse + +An `AckResponse` is the broker’s response to an acknowledge request sent by the client. It contains the `consumer_id` sent in the request. +If a transaction is used, it contains both the Transaction ID and the Request ID that are sent in the request. The client finishes the specific request according to the Request ID. If the `error` field is set, it indicates that the request has failed. + +An example of `AckResponse` with a transaction: + +```protobuf + +message CommandAckResponse { + "consumer_id" : 1, + "txnid_least_bits" : 0, + "txnid_most_bits" : 1, + "request_id" : 5 +} + +``` + +##### Command CloseConsumer + +**Note**: *This command can be sent by either consumer or broker*. 
+ +This command behaves the same as [`CloseProducer`](#command-closeproducer) + +##### Command RedeliverUnacknowledgedMessages + +A consumer can ask the broker to redeliver some or all of the pending messages +that were pushed to that particular consumer and not yet acknowledged. + +The protobuf object accepts a list of message ids that the consumer wants to +be redelivered. If the list is empty, the broker will redeliver all the +pending messages. + +On redelivery, messages can be sent to the same consumer or, in the case of a +shared subscription, spread across all available consumers. + + +##### Command ReachedEndOfTopic + +This is sent by a broker to a particular consumer, whenever the topic +has been "terminated" and all the messages on the subscription were +acknowledged. + +The client should use this command to notify the application that no more +messages are coming from the consumer. + +##### Command ConsumerStats + +This command is sent by the client to retrieve Subscriber and Consumer level +stats from the broker. +Parameters: + * `request_id` → Id of the request, used to correlate the request + and the response. + * `consumer_id` → Id of an already established consumer. + +##### Command ConsumerStatsResponse + +This is the broker's response to ConsumerStats request by the client. +It contains the Subscriber and Consumer level stats of the `consumer_id` sent in the request. +If the `error_code` or the `error_message` field is set it indicates that the request has failed. + +##### Command Unsubscribe + +This command is sent by the client to unsubscribe the `consumer_id` from the associated topic. +Parameters: + * `request_id` → Id of the request. + * `consumer_id` → Id of an already established consumer which needs to unsubscribe. + + +## Service discovery + +### Topic lookup + +Topic lookup needs to be performed each time a client needs to create or +reconnect a producer or a consumer. 
Lookup is used to discover which particular +broker is serving the topic we are about to use. + +Lookup can be done with a REST call as described in the [admin API](admin-api-topics.md#lookup-of-topic) +docs. + +Since Pulsar-1.16 it is also possible to perform the lookup within the binary +protocol. + +For the sake of example, let's assume we have a service discovery component +running at `pulsar://broker.example.com:6650`. + +Individual brokers will be running at `pulsar://broker-1.example.com:6650`, +`pulsar://broker-2.example.com:6650`, ... + +A client can use a connection to the discovery service host to issue a +`LookupTopic` command. The response can either be a broker hostname to +connect to, or a broker hostname against which to retry the lookup. + +The `LookupTopic` command has to be used in a connection that has already +gone through the `Connect` / `Connected` initial handshake. + +![Topic lookup](/assets/binary-protocol-topic-lookup.png) + +```protobuf + +message CommandLookupTopic { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1, + "authoritative" : false +} + +``` + +Fields: + * `topic` → Name of the topic to look up + * `request_id` → Id of the request that will be passed with its response + * `authoritative` → Initial lookup request should use false. 
When following a + redirect response, client should pass the same value contained in the + response + +##### LookupTopicResponse + +Example of response with successful lookup: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Connect", + "brokerServiceUrl" : "pulsar://broker-1.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-1.example.com:6651", + "authoritative" : true +} + +``` + +Example of lookup response with redirection: + +```protobuf + +message CommandLookupTopicResponse { + "request_id" : 1, + "response" : "Redirect", + "brokerServiceUrl" : "pulsar://broker-2.example.com:6650", + "brokerServiceUrlTls" : "pulsar+ssl://broker-2.example.com:6651", + "authoritative" : true +} + +``` + +In this second case, we need to reissue the `LookupTopic` command request +to `broker-2.example.com` and this broker will be able to give a definitive +answer to the lookup request. + +### Partitioned topics discovery + +Partitioned topics metadata discovery is used to find out if a topic is a +"partitioned topic" and how many partitions were set up. + +If the topic is marked as "partitioned", the client is expected to create +multiple producers or consumers, one for each partition, using the `partition-X` +suffix. + +This information only needs to be retrieved the first time a producer or +consumer is created. There is no need to do this after reconnections. + +The discovery of partitioned topics metadata works very similarly to the topic +lookup. The client sends a request to the service discovery address and the +response will contain actual metadata. 
+ +##### Command PartitionedTopicMetadata + +```protobuf + +message CommandPartitionedTopicMetadata { + "topic" : "persistent://my-property/my-cluster/my-namespace/my-topic", + "request_id" : 1 +} + +``` + +Fields: + * `topic` → the topic for which to check the partitions metadata + * `request_id` → Id of the request that will be passed with its response + + +##### Command PartitionedTopicMetadataResponse + +Example of response with metadata: + +```protobuf + +message CommandPartitionedTopicMetadataResponse { + "request_id" : 1, + "response" : "Success", + "partitions" : 32 +} + +``` + +## Protobuf interface + +All Pulsar's Protobuf definitions can be found {@inject: github:here:/pulsar-common/src/main/proto/PulsarApi.proto}. diff --git a/site2/website/versioned_docs/version-2.9.x/functions-cli.md b/site2/website/versioned_docs/version-2.9.x/functions-cli.md new file mode 100644 index 0000000000000..c9fcfa201525f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-cli.md @@ -0,0 +1,198 @@ +--- +id: functions-cli +title: Pulsar Functions command line tool +sidebar_label: "Reference: CLI" +original_id: functions-cli +--- + +The following tables list Pulsar Functions command-line tools. You can learn Pulsar Functions modes, commands, and parameters. + +## localrun + +Run Pulsar Functions locally, rather than deploying it to the Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +broker-service-url | The URL for the Pulsar broker. | | +classname | The class name of a Pulsar Function.| | +client-auth-params | Client authentication parameter. | | +client-auth-plugin | Client authentication plugin using which function-process can connect to broker. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). 
| | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +hostname-verification-enabled | Enable hostname verification. | false +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +instance-id-offset | Start the instanceIds from this offset. | 0 +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). 
| | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +tls-allow-insecure | Allow insecure tls connection. | false +tls-trust-cert-path | tls trust cert file path. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +use-tls | Use tls connection. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + + +## create + +Create and deploy a Pulsar Function in cluster mode. 
+ +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. 
| | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. 
| | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## delete + +Delete a Pulsar Function that is running on a Pulsar cluster. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## update + +Update a Pulsar Function that has been deployed to a Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | true | +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime). | | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. This parameter is not supported in Python Functions. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. 
| | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package. | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. 
| | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +update-auth-data | Whether or not to update the auth data. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## get + +Fetch information about a Pulsar Function. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## restart + +Restart function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (restart all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## stop + +Stops function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (stop all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## start + +Starts a stopped function instance.
+ +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (start all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | diff --git a/site2/website/versioned_docs/version-2.9.x/functions-debug.md b/site2/website/versioned_docs/version-2.9.x/functions-debug.md new file mode 100644 index 0000000000000..c1f19abda6465 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-debug.md @@ -0,0 +1,538 @@ +--- +id: functions-debug +title: Debug Pulsar Functions +sidebar_label: "How-to: Debug" +original_id: functions-debug +--- + +You can use the following methods to debug Pulsar Functions: + +* [Captured stderr](functions-debug.md#captured-stderr) +* [Use unit test](functions-debug.md#use-unit-test) +* [Debug with localrun mode](functions-debug.md#debug-with-localrun-mode) +* [Use log topic](functions-debug.md#use-log-topic) +* [Use Functions CLI](functions-debug.md#use-functions-cli) + +## Captured stderr + +Function startup information and captured stderr output is written to `logs/functions/<tenant>/<namespace>/<function>/<function>-<instance>.log` + +This is useful for debugging why a function fails to start. + +## Use unit test + +A Pulsar Function is a function with inputs and outputs, you can test a Pulsar Function in a similar way as you test any function. + +For example, if you have the following Pulsar Function: + +```java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +You can write a simple unit test to test Pulsar Function. + +:::tip + +Pulsar uses testng for testing.
+ +::: + +```java + +@Test +public void testJavaNativeExclamationFunction() { + JavaNativeExclamationFunction exclamation = new JavaNativeExclamationFunction(); + String output = exclamation.apply("foo"); + Assert.assertEquals(output, "foo!"); +} + +``` + +The following Pulsar Function implements the `org.apache.pulsar.functions.api.Function` interface. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +In this situation, you can write a unit test for this function as well. Remember to mock the `Context` parameter. The following is an example. + +:::tip + +Pulsar uses testng for testing. + +::: + +```java + +@Test +public void testExclamationFunction() { + ExclamationFunction exclamation = new ExclamationFunction(); + String output = exclamation.process("foo", mock(Context.class)); + Assert.assertEquals(output, "foo!"); +} + +``` + +## Debug with localrun mode +When you run a Pulsar Function in localrun mode, it launches an instance of the Function on your local machine as a thread. + +In this mode, a Pulsar Function consumes and produces actual data to a Pulsar cluster, and mirrors how the function actually runs in a Pulsar cluster. + +:::note + +Currently, debugging with localrun mode is only supported by Pulsar Functions written in Java. You need Pulsar version 2.4.0 or later to do the following. Even though localrun is available in versions earlier than Pulsar 2.4.0, you cannot debug with localrun mode programmatically or run Functions as threads. + +::: + +You can launch your function in the following manner. 
+ +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setName(functionName); +functionConfig.setInputs(Collections.singleton(sourceTopic)); +functionConfig.setClassName(ExclamationFunction.class.getName()); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setOutput(sinkTopic); + +LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); +localRunner.start(true); + +``` + +So you can debug functions using an IDE easily. Set breakpoints and manually step through a function to debug with real data. + +The following example illustrates how to programmatically launch a function in localrun mode. + +```java + +public class ExclamationFunction implements Function { + + @Override + public String process(String s, Context context) throws Exception { + return s + "!"; + } + +public static void main(String[] args) throws Exception { + FunctionConfig functionConfig = new FunctionConfig(); + functionConfig.setName("exclamation"); + functionConfig.setInputs(Collections.singleton("input")); + functionConfig.setClassName(ExclamationFunction.class.getName()); + functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); + functionConfig.setOutput("output"); + + LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); + localRunner.start(false); +} + +``` + +To use localrun mode programmatically, add the following dependency. + +```xml + + + org.apache.pulsar + pulsar-functions-local-runner + ${pulsar.version} + + +``` + +For complete code samples, see [here](https://github.com/jerrypeng/pulsar-functions-demos/tree/master/debugging). + +:::note + +Debugging with localrun mode for Pulsar Functions written in other languages will be supported soon. + +::: + +## Use log topic + +In Pulsar Functions, you can generate log information defined in functions to a specified log topic. 
You can configure consumers to consume messages from a specified log topic to check the log information. + +![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) + +**Example** + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +As shown in the example above, you can get the logger via `context.getLogger()` and assign the logger to the `LOG` variable of `slf4j`, so you can define your desired log information in a function using the `LOG` variable. Meanwhile, you need to specify the topic to which the log information is produced. + +**Example** + +```bash + +$ bin/pulsar-admin functions create \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +The message published to log topic contains several properties for better reasoning: +- `loglevel` -- the level of the log message. +- `fqn` -- the fully qualified name of the function that pushes this log message. +- `instance` -- the ID of the function instance that pushes this log message. + +## Use Functions CLI + +With [Pulsar Functions CLI](reference-pulsar-admin.md#functions), you can debug Pulsar Functions with the following subcommands: + +* `get` +* `status` +* `stats` +* `list` +* `trigger` + +:::tip + +For complete commands of **Pulsar Functions CLI**, see [here](reference-pulsar-admin.md#functions). + +::: + +### `get` + +Get information about a Pulsar Function.
+ +**Usage** + +```bash + +$ pulsar-admin functions get options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +:::tip + +`--fqfn` consists of `--name`, `--namespace` and `--tenant`, so you can specify either `--fqfn` or `--name`, `--namespace` and `--tenant`. + +::: + +**Example** + +You can specify `--fqfn` to get information about a Pulsar Function. + +```bash + +$ ./bin/pulsar-admin functions get public/default/ExclamationFunctio6 + +``` + +Optionally, you can specify `--name`, `--namespace` and `--tenant` to get information about a Pulsar Function. + +```bash + +$ ./bin/pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 + +``` + +As shown below, the `get` command shows input, output, runtime, and other information about the _ExclamationFunctio6_ function. + +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "ExclamationFunctio6", + "className": "org.example.test.ExclamationFunction", + "inputSpecs": { + "persistent://public/default/my-topic-1": { + "isRegexPattern": false + } + }, + "output": "persistent://public/default/test-1", + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "userConfig": {}, + "runtime": "JAVA", + "autoAck": true, + "parallelism": 1 +} + +``` + +### `status` + +Check the current status of a Pulsar Function. + +**Usage** + +```bash + +$ pulsar-admin functions status options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function
    If the `--instance-id` is not specified, it gets the IDs of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + +``` + +As shown below, the `status` command shows the number of instances, running instances, the instance running under the _ExclamationFunctio6_ function, received messages, successfully processed messages, system exceptions, the average latency and so on. + +```json + +{ + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 1, + "numSuccessfullyProcessed" : 1, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.8385, + "lastInvocationTime" : 1557734137987, + "workerId" : "c-standalone-fw-23ccc88ef29b-8080" + } + } ] +} + +``` + +### `stats` + +Get the current stats of a Pulsar Function. + +**Usage** + +```bash + +$ pulsar-admin functions stats options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function.
    If the `--instance-id` is not specified, it gets the IDs of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + +``` + +The output is shown as follows: + +```json + +{ + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "instances" : [ { + "instanceId" : 0, + "metrics" : { + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "userMetrics" : { } + } + } ] +} + +``` + +### `list` + +List all Pulsar Functions running under a specific tenant and namespace. + +**Usage** + +```bash + +$ pulsar-admin functions list options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions list \ + --tenant public \ + --namespace default + +``` + +As shown below, the `list` command returns three functions running under the _public_ tenant and the _default_ namespace. + +```text + +ExclamationFunctio1 +ExclamationFunctio2 +ExclamationFunctio3 + +``` + +### `trigger` + +Trigger a specified Pulsar Function with a supplied value. This command simulates the execution process of a Pulsar Function and verifies it. 
+ +**Usage** + +```bash + +$ pulsar-admin functions trigger options + +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. +|`--topic`|The topic name that a Pulsar Function consumes from. +|`--trigger-file`|The path to a file that contains the data to trigger a Pulsar Function. +|`--trigger-value`|The value to trigger a Pulsar Function. + +**Example** + +```bash + +$ ./bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + --topic persistent://public/default/my-topic-1 \ + --trigger-value "hello pulsar functions" + +``` + +As shown below, the `trigger` command returns the following result: + +```text + +This is my function! + +``` + +:::note + +You must specify the [entire topic name](getting-started-pulsar.md#topic-names) when using the `--topic` option. Otherwise, the following error occurs. + +```text + +Function in trigger function has unidentified topic +Reason: Function in trigger function has unidentified topic + +``` + +::: + diff --git a/site2/website/versioned_docs/version-2.9.x/functions-deploy.md b/site2/website/versioned_docs/version-2.9.x/functions-deploy.md new file mode 100644 index 0000000000000..2a0d68d6c623c --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-deploy.md @@ -0,0 +1,262 @@ +--- +id: functions-deploy +title: Deploy Pulsar Functions +sidebar_label: "How-to: Deploy" +original_id: functions-deploy +--- + +## Requirements + +To deploy and manage Pulsar Functions, you need to have a Pulsar cluster running. There are several options for this: + +* You can run a [standalone cluster](getting-started-standalone.md) locally on your own machine. 
+* You can deploy a Pulsar cluster on [Kubernetes](deploy-kubernetes.md), [Amazon Web Services](deploy-aws.md), [bare metal](deploy-bare-metal.md), [DC/OS](https://dcos.io/), and more. + +If you run a non-[standalone](reference-terminology.md#standalone) cluster, you need to obtain the service URL for the cluster. How you obtain the service URL depends on how you deploy your Pulsar cluster. + +If you want to deploy and trigger Python user-defined functions, you need to install [the pulsar python client](http://pulsar.apache.org/docs/en/client-libraries-python/) on all the machines running [functions workers](functions-worker.md). + +## Command-line interface + +Pulsar Functions are deployed and managed using the [`pulsar-admin functions`](reference-pulsar-admin.md#functions) interface, which contains commands such as [`create`](reference-pulsar-admin.md#functions-create) for deploying functions in [cluster mode](#cluster-mode), [`trigger`](reference-pulsar-admin.md#trigger) for [triggering](#triggering-pulsar-functions) functions, [`list`](reference-pulsar-admin.md#list-2) for listing deployed functions. + +To learn more commands, refer to [`pulsar-admin functions`](reference-pulsar-admin.md#functions). + +### Default arguments + +When managing Pulsar Functions, you need to specify a variety of information about functions, including tenant, namespace, input and output topics, and so on. However, some parameters have default values if you do not specify values for them. The following table lists the default values. + +Parameter | Default +:---------|:------- +Function name | You can specify any value for the class name (except org, library, or similar class names). For example, when you specify the flag `--classname org.example.MyFunction`, the function name is `MyFunction`. +Tenant | Derived from names of the input topics. 
If the input topics are under the `marketing` tenant, which means the topic names have the form `persistent://marketing/{namespace}/{topicName}`, the tenant is `marketing`. +Namespace | Derived from names of the input topics. If the input topics are under the `asia` namespace under the `marketing` tenant, which means the topic names have the form `persistent://marketing/asia/{topicName}`, then the namespace is `asia`. +Output topic | `{input topic}-{function name}-output`. For example, if an input topic name of a function is `incoming`, and the function name is `exclamation`, then the name of the output topic is `incoming-exclamation-output`. +Subscription type | For `at-least-once` and `at-most-once` [processing guarantees](functions-overview.md#processing-guarantees), the [`SHARED`](concepts-messaging.md#shared) mode is applied by default; for `effectively-once` guarantees, the [`FAILOVER`](concepts-messaging.md#failover) mode is applied. +Processing guarantees | [`ATLEAST_ONCE`](functions-overview.md#processing-guarantees) +Pulsar service URL | `pulsar://localhost:6650` + +### Example of default arguments + +Take the `create` command as an example. + +```bash + +$ bin/pulsar-admin functions create \ + --jar my-pulsar-functions.jar \ + --classname org.example.MyFunction \ + --inputs my-function-input-topic1,my-function-input-topic2 + +``` + +The function has default values for the function name (`MyFunction`), tenant (`public`), namespace (`default`), subscription type (`SHARED`), processing guarantees (`ATLEAST_ONCE`), and Pulsar service URL (`pulsar://localhost:6650`). + +## Local run mode + +If you run a Pulsar Function in **local run** mode, it runs on the machine from which you enter the commands (on your laptop, an [AWS EC2](https://aws.amazon.com/ec2/) instance, and so on). The following is a [`localrun`](reference-pulsar-admin.md#localrun) command example. 
+ +```bash + +$ bin/pulsar-admin functions localrun \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +By default, the function connects to a Pulsar cluster running on the same machine, via a local [broker](reference-terminology.md#broker) service URL of `pulsar://localhost:6650`. If you use local run mode to run a function but connect it to a non-local Pulsar cluster, you can specify a different broker URL using the `--broker-service-url` flag. The following is an example. + +```bash + +$ bin/pulsar-admin functions localrun \ + --broker-service-url pulsar://my-cluster-host:6650 \ + # Other function parameters + +``` + +## Cluster mode + +When you run a Pulsar Function in **cluster** mode, the function code is uploaded to a Pulsar broker and runs *alongside the broker* rather than in your [local environment](#local-run-mode). You can run a function in cluster mode using the [`create`](reference-pulsar-admin.md#create-1) command. + +```bash + +$ bin/pulsar-admin functions create \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +### Update functions in cluster mode + +You can use the [`update`](reference-pulsar-admin.md#update-1) command to update a Pulsar Function running in cluster mode. The following command updates the function created in the [cluster mode](#cluster-mode) section. + +```bash + +$ bin/pulsar-admin functions update \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/new-input-topic \ + --output persistent://public/default/new-output-topic + +``` + +### Parallelism + +Pulsar Functions run as processes or threads, which are called **instances**. When you run a Pulsar Function, it runs as a single instance by default.
With one localrun command, you can only run a single instance of a function. If you want to run multiple instances, you can use localrun command multiple times. + +When you create a function, you can specify the *parallelism* of a function (the number of instances to run). You can set the parallelism factor using the `--parallelism` flag of the [`create`](reference-pulsar-admin.md#functions-create) command. + +```bash + +$ bin/pulsar-admin functions create \ + --parallelism 3 \ + # Other function info + +``` + +You can adjust the parallelism of an already created function using the [`update`](reference-pulsar-admin.md#update-1) interface. + +```bash + +$ bin/pulsar-admin functions update \ + --parallelism 5 \ + # Other function + +``` + +If you specify a function configuration via YAML, use the `parallelism` parameter. The following is a config file example. + +```yaml + +# function-config.yaml +parallelism: 3 +inputs: +- persistent://public/default/input-1 +output: persistent://public/default/output-1 +# other parameters + +``` + +The following is corresponding update command. + +```bash + +$ bin/pulsar-admin functions update \ + --function-config-file function-config.yaml + +``` + +### Function instance resources + +When you run Pulsar Functions in [cluster mode](#cluster-mode), you can specify the resources that are assigned to each function [instance](#parallelism). + +Resource | Specified as | Runtimes +:--------|:----------------|:-------- +CPU | The number of cores | Kubernetes +RAM | The number of bytes | Process, Docker +Disk space | The number of bytes | Docker + +The following function creation command allocates 8 cores, 8 GB of RAM, and 10 GB of disk space to a function. 
+ +```bash + +$ bin/pulsar-admin functions create \ + --jar target/my-functions.jar \ + --classname org.example.functions.MyFunction \ + --cpu 8 \ + --ram 8589934592 \ + --disk 10737418240 + +``` + +> #### Resources are *per instance* +> The resources that you apply to a given Pulsar Function are applied to each instance of the function. For example, if you apply 8 GB of RAM to a function with a parallelism of 5, you are applying 40 GB of RAM for the function in total. Make sure that you take the parallelism (the number of instances) factor into your resource calculations. + +### Use Package management service + +Package management enables version management and simplifies the upgrade and rollback processes for Functions, Sinks, and Sources. When you use the same function, sink and source in different namespaces, you can upload them to a common package management system. + +To use [Package management service](admin-api-packages.md), ensure that the package management service has been enabled in your cluster by setting the following properties in `broker.conf`. + +> Note: Package management service is not enabled by default. + +```yaml + +enablePackagesManagement=true +packagesManagementStorageProvider=org.apache.pulsar.packages.management.storage.bookkeeper.BookKeeperPackagesStorageProvider +packagesReplicas=1 +packagesManagementLedgerRootPath=/ledgers + +``` + +With Package management service enabled, you can upload your function packages by [upload a package](admin-api-packages.md#upload-a-package) to the service and get the [package URL](admin-api-packages.md#package-url). + +When you have a ready to use package URL, you can create the function with package URL by setting `--jar`, `--py`, or `--go` to the package URL with `pulsar-admin functions create`. + +## Trigger Pulsar Functions + +If a Pulsar Function is running in [cluster mode](#cluster-mode), you can **trigger** it at any time using the command line. 
Triggering a function means that you send a message with a specific value to the function and get the function output (if any) via the command line. + +> Triggering a function is to invoke a function by producing a message on one of the input topics. With the [`pulsar-admin functions trigger`](reference-pulsar-admin.md#trigger) command, you can send messages to functions without using the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library. + +To learn how to trigger a function, you can start with a Python function that returns a simple string based on the input. + +```python + +# myfunc.py +def process(input): + return "This function has been triggered with a value of {0}".format(input) + +``` + +You can run the function in [cluster mode](functions-deploy.md#cluster-mode). + +```bash + +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name myfunc \ + --py myfunc.py \ + --classname myfunc \ + --inputs persistent://public/default/in \ + --output persistent://public/default/out + +``` + +Then assign a consumer to listen on the output topic for messages from the `myfunc` function with the [`pulsar-client consume`](reference-cli-tools.md#consume) command. + +```bash + +$ bin/pulsar-client consume persistent://public/default/out \ + --subscription-name my-subscription \ + --num-messages 0 # Listen indefinitely + +``` + +And then you can trigger the function. + +```bash + +$ bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name myfunc \ + --trigger-value "hello world" + +``` + +The consumer listening on the output topic produces something as follows in the log. + +``` + +----- got message ----- +This function has been triggered with a value of hello world + +``` + +> #### Topic info is not required +> In the `trigger` command, you only need to specify basic information about the function (tenant, namespace, and name).
To trigger the function, you do not need to know the function input topics. diff --git a/site2/website/versioned_docs/version-2.9.x/functions-develop.md b/site2/website/versioned_docs/version-2.9.x/functions-develop.md new file mode 100644 index 0000000000000..2e29aa1c47400 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-develop.md @@ -0,0 +1,1600 @@ +--- +id: functions-develop +title: Develop Pulsar Functions +sidebar_label: "How-to: Develop" +original_id: functions-develop +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +You learn how to develop Pulsar Functions with different APIs for Java, Python and Go. + +## Available APIs +In Java and Python, you have two options to write Pulsar Functions. In Go, you can use Pulsar Functions SDK for Go. + +Interface | Description | Use cases +:---------|:------------|:--------- +Language-native interface | No Pulsar-specific libraries or special dependencies required (only core libraries from Java/Python). | Functions that do not require access to the function [context](#context). +Pulsar Function SDK for Java/Python/Go | Pulsar-specific libraries that provide a range of functionality not provided by "native" interfaces. | Functions that require access to the function [context](#context). + +The language-native function, which adds an exclamation point to all incoming strings and publishes the resulting string to a topic, has no external dependencies. The following example is language-native function. + +````mdx-code-block + + + +```Java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclamationFunction.java). 
+ + + + +```python + +def process(input): + return "{}!".format(input) + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py). + +:::note + +You can write Pulsar Functions in python2 or python3. However, Pulsar only looks for `python` as the interpreter. +If you're running Pulsar Functions on an Ubuntu system that only supports python3, you might fail to +start the functions. In this case, you can create a symlink. Your system will fail if +you subsequently install any other package that depends on Python 2.x. A solution is under development in [Issue 5518](https://github.com/apache/pulsar/issues/5518). + +```bash + +sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 + +``` + +::: + + + + +```` + +The following example uses Pulsar Functions SDK. +````mdx-code-block + + + +```Java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java). + + + + +```python + +from pulsar import Function + +class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py). 
+ + + + +```Go + +package main + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func HandleRequest(ctx context.Context, in []byte) error{ + fmt.Println(string(in) + "!") + return nil +} + +func main() { + pf.Start(HandleRequest) +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/inputFunc/inputFunc.go#L20-L36). + + + + +```` + +## Schema registry +Pulsar has a built-in schema registry and is bundled with popular schema types, such as Avro, JSON and Protobuf. Pulsar Functions can leverage the existing schema information from input topics and derive the input type. The schema registry applies for output topic as well. + +## SerDe +SerDe stands for **Ser**ialization and **De**serialization. Pulsar Functions uses SerDe when publishing data to and consuming data from Pulsar topics. How SerDe works by default depends on the language you use for a particular function. + +````mdx-code-block + + + +When you write Pulsar Functions in Java, the following basic Java types are built in and supported by default: `String`, `Double`, `Integer`, `Float`, `Long`, `Short`, and `Byte`. + +To customize Java types, you need to implement the following interface. + +```java + +public interface SerDe { + T deserialize(byte[] input); + byte[] serialize(T input); +} + +``` + +SerDe works in the following ways in Java Functions. +- If the input and output topics have schema, Pulsar Functions use schema for SerDe. +- If the input or output topics do not exist, Pulsar Functions adopt the following rules to determine SerDe: + - If the schema type is specified, Pulsar Functions use the specified schema type. + - If SerDe is specified, Pulsar Functions use the specified SerDe, and the schema type for input and output topics is `Byte`. + - If neither the schema type nor SerDe is specified, Pulsar Functions use the built-in SerDe. 
For non-primitive schema types, the built-in SerDe serializes and deserializes objects in the `JSON` format. + + + + +In Python, the default SerDe is identity, meaning that the type is serialized as whatever type the producer function returns. + +You can specify the SerDe when [creating](functions-deploy.md#cluster-mode) or [running](functions-deploy.md#local-run-mode) functions. + +```bash + +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name my_function \ + --py my_function.py \ + --classname my_function.MyFunction \ + --custom-serde-inputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \ + --output-serde-classname Serde3 \ + --output output-topic-1 + +``` + +This case contains two input topics: `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the map must be specified as a JSON string). The output topic, `output-topic-1`, uses the `Serde3` class for SerDe. At the moment, all Pulsar Functions logic, including processing functions and SerDe classes, must be contained within a single Python file. + +When using Pulsar Functions for Python, you have three SerDe options: + +1. You can use the [`IdentitySerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. The `IdentitySerDe` is the **default**. Creating or running a function without explicitly specifying SerDe means that this option is used. +2. You can use the [`PickleSerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe. +3.
You can create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods: [`serialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes, and [`deserialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type. + +The table below shows when you should use each SerDe. + +SerDe option | When to use +:------------|:----------- +`IdentitySerDe` | When you work with simple types like strings, Booleans, and integers. +`PickleSerDe` | When you work with complex, application-specific types and are comfortable with the "best effort" approach of `pickle`. +Custom SerDe | When you require explicit control over SerDe, potentially for performance or data compatibility purposes. + + + + +Currently, the feature is not available in Go. + + + + +```` + +### Example +If you are writing Pulsar Functions that process tweet objects, you can refer to the following example of the `Tweet` class. + +````mdx-code-block + + + +```java + +public class Tweet { + private String username; + private String tweetContent; + + public Tweet(String username, String tweetContent) { + this.username = username; + this.tweetContent = tweetContent; + } + + // Standard setters and getters +} + +``` + +To pass `Tweet` objects directly between Pulsar Functions, you need to provide a custom SerDe class. In the example below, `Tweet` objects are basically strings in which the username and tweet content are separated by a `|`.
+ +```java + +package com.example.serde; + +import org.apache.pulsar.functions.api.SerDe; + +import java.util.regex.Pattern; + +public class TweetSerde implements SerDe { + public Tweet deserialize(byte[] input) { + String s = new String(input); + String[] fields = s.split(Pattern.quote("|")); + return new Tweet(fields[0], fields[1]); + } + + public byte[] serialize(Tweet input) { + return String.format("%s|%s", input.getUsername(), input.getTweetContent()).getBytes(); + } +} + +``` + +To apply this customized SerDe to a particular Pulsar Function, you need to: + +* Package the `Tweet` and `TweetSerde` classes into a JAR. +* Specify a path to the JAR and SerDe class name when deploying the function. + +The following is an example of the [`create`](reference-pulsar-admin.md#create-1) operation. + +```bash + +$ bin/pulsar-admin functions create \ + --jar /path/to/your.jar \ + --output-serde-classname com.example.serde.TweetSerde \ + # Other function attributes + +``` + +> #### Custom SerDe classes must be packaged with your function JARs +> Pulsar does not store your custom SerDe classes separately from your Pulsar Functions. So you need to include your SerDe classes in your function JARs. If not, Pulsar returns an error. + + + + +```python + +class Tweet(object): + def __init__(self, username, tweet_content): + self.username = username + self.tweet_content = tweet_content + +``` + +In order to use this class in Pulsar Functions, you have two options: + +1. You can specify `PickleSerDe`, which applies the [`pickle`](https://docs.python.org/3/library/pickle.html) library SerDe. +2. You can create your own SerDe class. The following is an example.
+ + ```python + + from pulsar import SerDe + + class TweetSerDe(SerDe): + + def serialize(self, input): + return bytes("{0}|{1}".format(input.username, input.tweet_content)) + + def deserialize(self, input_bytes): + tweet_components = str(input_bytes).split('|') + return Tweet(tweet_components[0], tweet_components[1]) + + ``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/custom_object_function.py). + + + + +```` + +In both languages, you can write custom SerDe logic for more complex, application-specific types. + +## Context +Java, Python and Go SDKs provide access to a **context object** that can be used by a function. This context object provides a wide variety of information and functionality to the function. + +* The name and ID of a Pulsar Function. +* The message ID of each message. Each Pulsar message is automatically assigned an ID. +* The key, event time, properties and partition key of each message. +* The name of the topic to which the message is sent. +* The names of all input topics as well as the output topic associated with the function. +* The name of the class used for [SerDe](#serde). +* The [tenant](reference-terminology.md#tenant) and namespace associated with the function. +* The ID of the Pulsar Functions instance running the function. +* The version of the function. +* The [logger object](functions-develop.md#logger) used by the function, which can be used to create function log messages. +* Access to arbitrary [user configuration](#user-config) values supplied via the CLI. +* An interface for recording [metrics](#metrics). +* An interface for storing and retrieving state in [state storage](#state-storage). +* A function to publish new messages onto arbitrary topics. +* A function to ack the message being processed (if auto-ack is disabled). +* (Java only) Access to the Pulsar admin client.
+ +````mdx-code-block + + + +The [Context](https://github.com/apache/pulsar/blob/master/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Context.java) interface provides a number of methods that you can use to access the function [context](#context). The various method signatures for the `Context` interface are listed as follows. + +```java + +public interface Context { + Record getCurrentRecord(); + Collection getInputTopics(); + String getOutputTopic(); + String getOutputSchemaType(); + String getTenant(); + String getNamespace(); + String getFunctionName(); + String getFunctionId(); + String getInstanceId(); + String getFunctionVersion(); + Logger getLogger(); + void incrCounter(String key, long amount); + void incrCounterAsync(String key, long amount); + long getCounter(String key); + long getCounterAsync(String key); + void putState(String key, ByteBuffer value); + void putStateAsync(String key, ByteBuffer value); + void deleteState(String key); + ByteBuffer getState(String key); + ByteBuffer getStateAsync(String key); + Map getUserConfigMap(); + Optional getUserConfigValue(String key); + Object getUserConfigValueOrDefault(String key, Object defaultValue); + void recordMetric(String metricName, double value); + CompletableFuture publish(String topicName, O object, String schemaOrSerdeClassName); + CompletableFuture publish(String topicName, O object); + TypedMessageBuilder newOutputMessage(String topicName, Schema schema) throws PulsarClientException; + ConsumerBuilder newConsumerBuilder(Schema schema) throws PulsarClientException; + PulsarAdmin getPulsarAdmin(); + PulsarAdmin getPulsarAdmin(String clusterName); +} + +``` + +The following example uses several methods available via the `Context` object. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.stream.Collectors; + +public class ContextFunction implements Function { + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String inputTopics = context.getInputTopics().stream().collect(Collectors.joining(", ")); + String functionName = context.getFunctionName(); + + String logMessage = String.format("A message with a value of \"%s\" has arrived on one of the following topics: %s\n", + input, + inputTopics); + + LOG.info(logMessage); + + String metricName = String.format("function-%s-messages-received", functionName); + context.recordMetric(metricName, 1); + + return null; + } +} + +``` + + + + +``` + +class ContextImpl(pulsar.Context): + def get_message_id(self): + ... + def get_message_key(self): + ... + def get_message_eventtime(self): + ... + def get_message_properties(self): + ... + def get_current_message_topic_name(self): + ... + def get_partition_key(self): + ... + def get_function_name(self): + ... + def get_function_tenant(self): + ... + def get_function_namespace(self): + ... + def get_function_id(self): + ... + def get_instance_id(self): + ... + def get_function_version(self): + ... + def get_logger(self): + ... + def get_user_config_value(self, key): + ... + def get_user_config_map(self): + ... + def record_metric(self, metric_name, metric_value): + ... + def get_input_topics(self): + ... + def get_output_topic(self): + ... + def get_output_serde_class_name(self): + ... + def publish(self, topic_name, message, serde_class_name="serde.IdentitySerDe", + properties=None, compression_type=None, callback=None, message_conf=None): + ... + def ack(self, msgid, topic): + ... + def get_and_reset_metrics(self): + ... + def reset_metrics(self): + ... + def get_metrics(self): + ... + def incr_counter(self, key, amount): + ... + def get_counter(self, key): + ... 
+ def del_counter(self, key): + ... + def put_state(self, key, value): + ... + def get_state(self, key): + ... + +``` + + + + +``` + +func (c *FunctionContext) GetInstanceID() int { + return c.instanceConf.instanceID +} + +func (c *FunctionContext) GetInputTopics() []string { + return c.inputTopics +} + +func (c *FunctionContext) GetOutputTopic() string { + return c.instanceConf.funcDetails.GetSink().Topic +} + +func (c *FunctionContext) GetFuncTenant() string { + return c.instanceConf.funcDetails.Tenant +} + +func (c *FunctionContext) GetFuncName() string { + return c.instanceConf.funcDetails.Name +} + +func (c *FunctionContext) GetFuncNamespace() string { + return c.instanceConf.funcDetails.Namespace +} + +func (c *FunctionContext) GetFuncID() string { + return c.instanceConf.funcID +} + +func (c *FunctionContext) GetFuncVersion() string { + return c.instanceConf.funcVersion +} + +func (c *FunctionContext) GetUserConfValue(key string) interface{} { + return c.userConfigs[key] +} + +func (c *FunctionContext) GetUserConfMap() map[string]interface{} { + return c.userConfigs +} + +func (c *FunctionContext) SetCurrentRecord(record pulsar.Message) { + c.record = record +} + +func (c *FunctionContext) GetCurrentRecord() pulsar.Message { + return c.record +} + +func (c *FunctionContext) NewOutputMessage(topic string) pulsar.Producer { + return c.outputMessage(topic) +} + +``` + +The following example uses several methods available via the `Context` object. + +``` + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func contextFunc(ctx context.Context) { + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } +} + +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/contextFunc/contextFunc.go#L29-L34). 
+ + + + +```` + +### User config +When you run or update Pulsar Functions created using the SDK, you can pass arbitrary key/values to them on the command line with the `--user-config` flag. Key/values must be specified as JSON. The following function creation command passes a user-configured key/value to a function. + +```bash + +$ bin/pulsar-admin functions create \ + --name word-filter \ + # Other function configs + --user-config '{"forbidden-word":"rosebud"}' + +``` + +````mdx-code-block + + + +The Java SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash + +$ bin/pulsar-admin functions create \ + # Other function configs + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Java function: + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigFunction implements Function { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + Optional wotd = context.getUserConfigValue("word-of-the-day"); + if (wotd.isPresent()) { + LOG.info("The word of the day is {}", wotd.get()); + } else { + LOG.warn("No word of the day provided"); + } + return null; + } +} + +``` + +The `UserConfigFunction` function will log the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The `word-of-the-day` user config will be changed only when the function is updated with a new config value via the command line.
+ +You can also access the entire user config map or set a default value in case no value is present: + +```java + +// Get the whole config map +Map allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = (String) context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); + +``` + +> For all key/value pairs passed to Java functions, both the key *and* the value are `String`. To set the value to be a different type, you need to deserialize from the `String` type. + + + + +In a Python function, you can access the configuration value like this. + +```python + +from pulsar import Function + +class WordFilter(Function): + def process(self, input, context): + forbidden_word = context.get_user_config_map()["forbidden-word"] + + # Don't publish the message if it contains the user-supplied + # forbidden word + if forbidden_word in input: + pass + # Otherwise publish the message + else: + return input + +``` + +The Python SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash + +$ bin/pulsar-admin functions create \ + # Other function configs \ + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Python function: + +```python + +from pulsar import Function + +class UserConfigFunction(Function): + def process(self, input, context): + logger = context.get_logger() + wotd = context.get_user_config_value('word-of-the-day') + if wotd is None: + logger.warn('No word of the day provided') + else: + logger.info("The word of the day is {0}".format(wotd)) + +``` + + + + +The Go SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair.
+ +```bash + +$ bin/pulsar-admin functions create \ + --go path/to/go/binary + --user-config '{"word-of-the-day":"lackadaisical"}' + +``` + +To access that value in a Go function: + +```go + +func contextFunc(ctx context.Context) { + fc, ok := pf.FromContext(ctx) + if !ok { + logutil.Fatal("Function context is not defined") + } + + wotd := fc.GetUserConfValue("word-of-the-day") + + if wotd == nil { + logutil.Warn("The word of the day is empty") + } else { + logutil.Infof("The word of the day is %s", wotd.(string)) + } +} + +``` + + + + +```` + +### Logger + +````mdx-code-block + + + +Pulsar Functions that use the Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/apache/log4j/Logger.html) object that can be used to produce logs at the chosen log level. The following example logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +If you want your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example. 
+ +```bash + +$ bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +All logs produced by `LoggingFunction` above can be accessed via the `persistent://public/default/logging-function-logs` topic. + +#### Customize Function log level +Additionally, you can use the XML file, `functions_log4j2.xml`, to customize the function log level. +To customize the function log level, create or update `functions_log4j2.xml` in your Pulsar conf directory (for example, `/etc/pulsar/` on bare-metal, or `/pulsar/conf` on Kubernetes) to contain contents such as: + +```xml + + + pulsar-functions-instance + 30 + + + pulsar.log.appender + RollingFile + + + pulsar.log.level + debug + + + bk.log.level + debug + + + + + Console + SYSTEM_OUT + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + RollingFile + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.log + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}-%d{MM-dd-yyyy}-%i.log.gz + true + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + 1 + true + + + 1 GB + + + 0 0 0 * * ? + + + + + ${sys:pulsar.function.log.dir} + 2 + + */${sys:pulsar.function.log.file}*log.gz + + + 30d + + + + + + BkRollingFile + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk + ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk-%d{MM-dd-yyyy}-%i.log.gz + true + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + 1 + true + + + 1 GB + + + 0 0 0 * * ? 
+ + + + + ${sys:pulsar.function.log.dir} + 2 + + */${sys:pulsar.function.log.file}.bk*log.gz + + + 30d + + + + + + + + org.apache.pulsar.functions.runtime.shaded.org.apache.bookkeeper + ${sys:bk.log.level} + false + + BkRollingFile + + + + ${sys:pulsar.log.level} + + ${sys:pulsar.log.appender} + ${sys:pulsar.log.level} + + + + + +``` + +The properties set like: + +```xml + + + pulsar.log.level + debug + + +``` + +propagate to places where they are referenced, such as: + +```xml + + + ${sys:pulsar.log.level} + + ${sys:pulsar.log.appender} + ${sys:pulsar.log.level} + + + +``` + +In the above example, debug level logging would be applied to ALL function logs. +This may be more verbose than you desire. To be more selective, you can apply different log levels to different classes or modules. For example: + +```xml + + + com.example.module + info + false + + ${sys:pulsar.log.appender} + + + +``` + +You can be more specific as well, such as applying a more verbose log level to a class in the module, such as: + +```xml + + + com.example.module.className + debug + false + + Console + + + +``` + +Each `` entry allows you to output the log to a target specified in the definition of the Appender. + +Additivity pertains to whether log messages will be duplicated if multiple Logger entries overlap. +To disable additivity, specify + +```xml + +false + +``` + +as shown in examples above. Disabling additivity prevents duplication of log messages when one or more `` entries contain classes or modules that overlap. + +The `` is defined in the `` section, such as: + +```xml + + + Console + SYSTEM_OUT + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + +``` + + + + +Pulsar Functions that use the Python SDK have access to a logging object that can be used to produce logs at the chosen log level. The following example function that logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. 
+ +```python + +from pulsar import Function + +class LoggingFunction(Function): + def process(self, input, context): + logger = context.get_logger() + msg_id = context.get_message_id() + if 'danger' in input: + logger.warn("A warning was received in message {0}".format(context.get_message_id())) + else: + logger.info("Message {0} received\nContent: {1}".format(msg_id, input)) + +``` + +If you want your function to produce logs on a Pulsar topic, you need to specify a **log topic** when creating or running the function. The following is an example. + +```bash + +$ bin/pulsar-admin functions create \ + --py logging_function.py \ + --classname logging_function.LoggingFunction \ + --log-topic logging-function-logs \ + # Other function configs + +``` + +All logs produced by `LoggingFunction` above can be accessed via the `logging-function-logs` topic. +Additionally, you can specify the function log level through the broker XML file as described in [Customize Function log level](#customize-function-log-level). + + + + +The following Go Function example shows different log levels based on the function input. + +``` + +import ( + "context" + + "github.com/apache/pulsar/pulsar-function-go/pf" + + log "github.com/apache/pulsar/pulsar-function-go/logutil" +) + +func loggerFunc(ctx context.Context, input []byte) { + if len(input) <= 100 { + log.Infof("This input has a length of: %d", len(input)) + } else { + log.Warnf("This input is getting too long! It has {%d} characters", len(input)) + } +} + +func main() { + pf.Start(loggerFunc) +} + +``` + +When you use `logTopic` related functionalities in Go Function, import `github.com/apache/pulsar/pulsar-function-go/logutil`, and you do not have to use the `getLogger()` context object. 
+ +Additionally, you can specify the function log level through the broker XML file, as described here: [Customize Function log level](#customize-function-log-level). + + + + +```` + +### Pulsar admin + +Pulsar Functions that use the Java SDK have access to the Pulsar admin client, which allows them to make admin API calls to the current Pulsar cluster or to external clusters (if `external-pulsars` is provided). + +````mdx-code-block + + + +Below is an example of how to use the Pulsar admin client exposed from the Function `context`. + +``` + +import org.apache.pulsar.client.admin.PulsarAdmin; +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +/** + * In this particular example, for every input message, + * the function resets the cursor of the current function's subscription to a + * specified timestamp. + */ +public class CursorManagementFunction implements Function { + + @Override + public String process(String input, Context context) throws Exception { + PulsarAdmin adminClient = context.getPulsarAdmin(); + if (adminClient != null) { + String topic = context.getCurrentRecord().getTopicName().isPresent() ? + context.getCurrentRecord().getTopicName().get() : null; + String subName = context.getTenant() + "/" + context.getNamespace() + "/" + context.getFunctionName(); + if (topic != null) { + // 1578188166 below is an arbitrarily chosen timestamp + adminClient.topics().resetCursor(topic, subName, 1578188166); + return "reset cursor successfully"; + } + } + return null; + } +} + +``` + +If you want your function to get access to the Pulsar admin client, you need to enable this feature by setting `exposeAdminClientEnabled=true` in the `functions_worker.yml` file. You can test whether this feature is enabled or not using the command `pulsar-admin functions localrun` with the flag `--web-service-url`.
+ +``` + +$ bin/pulsar-admin functions localrun \ + --jar my-functions.jar \ + --classname my.package.CursorManagementFunction \ + --web-service-url http://pulsar-web-service:8080 \ + # Other function configs + +``` + + + + +```` + +## Metrics + +Pulsar Functions allows you to deploy and manage processing functions that consume messages from and publish messages to Pulsar topics easily. It is important to ensure that the running functions are healthy at any time. Pulsar Functions can publish arbitrary metrics to the metrics interface which can be queried. + +:::note + +If a Pulsar Function uses the language-native interface for Java or Python, that function is not able to publish metrics and stats to Pulsar. + +::: + +You can monitor Pulsar Functions that have been deployed with the following methods: + +- Check the metrics provided by Pulsar. + + Pulsar Functions expose the metrics that can be collected and used for monitoring the health of **Java, Python, and Go** functions. You can check the metrics by following the [monitoring](deploy-monitoring.md) guide. + + For the complete list of the function metrics, see [here](reference-metrics.md#pulsar-functions). + +- Set and check your customized metrics. + + In addition to the metrics provided by Pulsar, Pulsar allows you to customize metrics for **Java and Python** functions. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. + +Here are examples of how to customize metrics for Java and Python functions. + +````mdx-code-block + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class MetricRecorderFunction implements Function { + @Override + public Void process(Integer input, Context context) { + // Records the metric 1 every time a message arrives + context.recordMetric("hit-count", 1); + + // Records the metric only if the arriving number equals 11 + if (input == 11) { + context.recordMetric("elevens-count", 1); + } + + return null; + } +} + +``` + + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `hit-count` key and a different metric for the `elevens-count` key every time the function processes a message. The following is an example. + +```python + +from pulsar import Function + +class MetricRecorderFunction(Function): + def process(self, input, context): + context.record_metric('hit-count', 1) + + if input == 11: + context.record_metric('elevens-count', 1) + +``` + + + + +Currently, the feature is not available in Go. + + + + +```` + +## Security + +If you want to enable security on Pulsar Functions, first you should enable security on [Functions Workers](functions-worker.md). For more details, refer to [Security settings](functions-worker.md#security-settings). + +Pulsar Functions can support the following providers: + +- ClearTextSecretsProvider +- EnvironmentBasedSecretsProvider + +> Pulsar Functions support ClearTextSecretsProvider by default. + +At the same time, Pulsar Functions provide two interfaces, **SecretsProvider** and **SecretsProviderConfigurator**, allowing users to customize the secret provider. + +````mdx-code-block + + + +You can get the secret provider using the [`Context`](#context) object.
The following is an example: + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class GetSecretProviderFunction implements Function { + + @Override + public Void process(String input, Context context) throws Exception { + Logger LOG = context.getLogger(); + String secretProvider = context.getSecret(input); + + if (!secretProvider.isEmpty()) { + LOG.info("The secret provider is {}", secretProvider); + } else { + LOG.warn("No secret provider"); + } + + return null; + } +} + +``` + + + + +You can get secret provider using the [`Context`](#context) object. The following is an example: + +```python + +from pulsar import Function + +class GetSecretProviderFunction(Function): + def process(self, input, context): + logger = context.get_logger() + secret_provider = context.get_secret(input) + if secret_provider is None: + logger.warn('No secret provider') + else: + logger.info("The secret provider is {0}".format(secret_provider)) + +``` + + + + +Currently, the feature is not available in Go. + + + + +```` + +## State storage +Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Pulsar installation, including the local standalone installation, includes deployment of BookKeeper bookies. + +Since Pulsar 2.1.0 release, Pulsar integrates with Apache BookKeeper [table service](https://docs.google.com/document/d/155xAwWv5IdOitHh1NVMEwCMGgB28M3FyMiQSxEpjE-Y/edit#heading=h.56rbh52koe3f) to store the `State` for functions. For example, a `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions State API. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data - counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function, and shared between instances of that function. 
+ +You can access states within Pulsar Java Functions using the `putState`, `putStateAsync`, `getState`, `getStateAsync`, `incrCounter`, `incrCounterAsync`, `getCounter`, `getCounterAsync` and `deleteState` calls on the context object. You can access states within Pulsar Python Functions using the `putState`, `getState`, `incrCounter`, `getCounter` and `deleteState` calls on the context object. You can also manage states using the [querystate](#query-state) and [putstate](#putstate) options to `pulsar-admin functions`. + +:::note + +State storage is not available in Go. + +::: + +### API + +````mdx-code-block + + + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](functions-develop.md#context) object when you are using Java SDK functions. + +#### incrCounter + +```java + + /** + * Increment the builtin distributed counter referred by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); + +``` + +The application can use `incrCounter` to change the counter of a given `key` by the given `amount`. + +#### incrCounterAsync + +```java + + /** + * Increment the builtin distributed counter referred by key + * but dont wait for the completion of the increment operation + * + * @param key The name of the key + * @param amount The amount to be incremented + */ + CompletableFuture incrCounterAsync(String key, long amount); + +``` + +The application can use `incrCounterAsync` to asynchronously change the counter of a given `key` by the given `amount`. + +#### getCounter + +```java + + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); + +``` + +The application can use `getCounter` to retrieve the counter of a given `key` mutated by `incrCounter`. 
+ +Except the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### getCounterAsync + +```java + + /** + * Retrieve the counter value for the key, but don't wait + * for the operation to be completed + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + CompletableFuture getCounterAsync(String key); + +``` + +The application can use `getCounterAsync` to asynchronously retrieve the counter of a given `key` mutated by `incrCounterAsync`. + +#### putState + +```java + + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); + +``` + +#### putStateAsync + +```java + + /** + * Update the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @param value state value of the key + */ + CompletableFuture putStateAsync(String key, ByteBuffer value); + +``` + +The application can use `putStateAsync` to asynchronously update the state of a given `key`. + +#### getState + +```java + + /** + * Retrieve the state value for the key. + * + * @param key name of the key + * @return the state value for the key. + */ + ByteBuffer getState(String key); + +``` + +#### getStateAsync + +```java + + /** + * Retrieve the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @return the state value for the key. + */ + CompletableFuture getStateAsync(String key); + +``` + +The application can use `getStateAsync` to asynchronously retrieve the state of a given `key`. + +#### deleteState + +```java + + /** + * Delete the state value for the key. + * + * @param key name of the key + */ + +``` + +Counters and binary values share the same keyspace, so this deletes either type. 
+ + + + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](#context) object when you are using Python SDK functions. + +#### incr_counter + +```python + + def incr_counter(self, key, amount): + """incr the counter of a given key in the managed state""" + +``` + +The application can use `incr_counter` to change the counter of a given `key` by the given `amount`. +If the `key` does not exist, a new key is created. + +#### get_counter + +```python + + def get_counter(self, key): + """get the counter of a given key in the managed state""" + +``` + +The application can use `get_counter` to retrieve the counter of a given `key` mutated by `incr_counter`. + +In addition to the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### put_state + +```python + + def put_state(self, key, value): + """update the value of a given key in the managed state""" + +``` + +The key is a string, and the value is arbitrary binary data. + +#### get_state + +```python + + def get_state(self, key): + """get the value of a given key in the managed state""" + +``` + +#### del_counter + +```python + + def del_counter(self, key): + """delete the counter of a given key in the managed state""" + +``` + +Counters and binary values share the same keyspace, so this deletes either type. + + + + +```` + +### Query State + +A Pulsar Function can use the [State API](#api) for storing state into Pulsar's state storage +and retrieving state back from Pulsar's state storage. Additionally Pulsar also provides +CLI commands for querying its state. + +```shell + +$ bin/pulsar-admin functions querystate \ + --tenant \ + --namespace \ + --name \ + --state-storage-url \ + --key \ + [--watch] + +``` + +If `--watch` is specified, the CLI will watch the value of the provided `state-key`.
+ +### Example + +````mdx-code-block + + + +{@inject: github:WordCountFunction:/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/WordCountFunction.java} is a very good example +demonstrating on how Application can easily store `state` in Pulsar Functions. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function { + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split("\\.")).forEach(word -> context.incrCounter(word, 1)); + return null; + } +} + +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received `String` into multiple words using regex `\\.`. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incrCounter(key, amount)`). + + + + +```python + +from pulsar import Function + +class WordCount(Function): + def process(self, item, context): + for word in item.split(): + context.incr_counter(word, 1) + +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received string into multiple words on space. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incr_counter(key, amount)`). 
+ + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/functions-metrics.md b/site2/website/versioned_docs/version-2.9.x/functions-metrics.md new file mode 100644 index 0000000000000..8add669316092 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-metrics.md @@ -0,0 +1,7 @@ +--- +id: functions-metrics +title: Metrics for Pulsar Functions +sidebar_label: "Metrics" +original_id: functions-metrics +--- + diff --git a/site2/website/versioned_docs/version-2.9.x/functions-overview.md b/site2/website/versioned_docs/version-2.9.x/functions-overview.md new file mode 100644 index 0000000000000..816d301e0fd0e --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-overview.md @@ -0,0 +1,209 @@ +--- +id: functions-overview +title: Pulsar Functions overview +sidebar_label: "Overview" +original_id: functions-overview +--- + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics, +* apply a user-supplied processing logic to each message, +* publish the results of the computation to another topic. + + +## Goals +With Pulsar Functions, you can create complex processing logic without deploying a separate neighboring system (such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://heron.incubator.apache.org/), [Apache Flink](https://flink.apache.org/)). Pulsar Functions are computing infrastructure of Pulsar messaging system. 
The core goal is tied to a series of other goals: + +* Developer productivity (language-native vs Pulsar Functions SDK functions) +* Easy troubleshooting +* Operational simplicity (no need for an external processing system) + +## Inspirations +Pulsar Functions are inspired by (and take cues from) several systems and paradigms: + +* Stream processing engines such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), and [Apache Flink](https://flink.apache.org) +* "Serverless" and "Function as a Service" (FaaS) cloud platforms like [Amazon Web Services Lambda](https://aws.amazon.com/lambda/), [Google Cloud Functions](https://cloud.google.com/functions/), and [Azure Cloud Functions](https://azure.microsoft.com/en-us/services/functions/) + +Pulsar Functions can be described as + +* [Lambda](https://aws.amazon.com/lambda/)-style functions that are +* specifically designed to use Pulsar as a message bus. + +## Programming model +Pulsar Functions provide a wide range of functionality, and the core programming model is simple. Functions receive messages from one or more **input [topics](reference-terminology.md#topic)**. Each time a message is received, the function will complete the following tasks. + + * Apply some processing logic to the input and write output to: + * An **output topic** in Pulsar + * [Apache BookKeeper](functions-develop.md#state-storage) + * Write logs to a **log topic** (potentially for debugging purposes) + * Increment a [counter](#word-count-example) + +![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) + +You can use Pulsar Functions to set up the following processing chain: + +* A Python function listens for the `raw-sentences` topic and "sanitizes" incoming strings (removing extraneous whitespace and converting all characters to lowercase) and then publishes the results to a `sanitized-sentences` topic. 
+* A Java function listens for the `sanitized-sentences` topic, counts the number of times each word appears within a specified time window, and publishes the results to a `results` topic +* Finally, a Python function listens for the `results` topic and writes the results to a MySQL table. + + +### Word count example + +If you implement the classic word count example using Pulsar Functions, it looks something like this: + +![Pulsar Functions word count example](/assets/pulsar-functions-word-count.png) + +To write the function in Java with [Pulsar Functions SDK for Java](functions-develop.md#available-apis), you can write the function as follows. + +```java + +package org.example.functions; + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } +} + +``` + +Bundle and build the JAR file to be deployed, and then deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash + +$ bin/pulsar-admin functions create \ + --jar target/my-jar-with-dependencies.jar \ + --classname org.example.functions.WordCountFunction \ + --tenant public \ + --namespace default \ + --name word-count \ + --inputs persistent://public/default/sentences \ + --output persistent://public/default/count + +``` + +### Content-based routing example + +Pulsar Functions are used in many cases. The following is a sophisticated example that involves content-based routing. 
+ +For example, a function takes items (strings) as input and publishes them to either a `fruits` or `vegetables` topic, depending on the item. Or, if an item is neither fruit nor vegetable, a warning is logged to a [log topic](functions-develop.md#logger). The following is a visual representation. + +![Pulsar Functions routing example](/assets/pulsar-functions-routing-example.png) + +If you implement this routing functionality in Python, it looks something like this: + +```python + +from pulsar import Function + +class RoutingFunction(Function): + def __init__(self): + self.fruits_topic = "persistent://public/default/fruits" + self.vegetables_topic = "persistent://public/default/vegetables" + + @staticmethod + def is_fruit(item): + return item in [b"apple", b"orange", b"pear", b"other fruits..."] + + @staticmethod + def is_vegetable(item): + return item in [b"carrot", b"lettuce", b"radish", b"other vegetables..."] + + def process(self, item, context): + if self.is_fruit(item): + context.publish(self.fruits_topic, item) + elif self.is_vegetable(item): + context.publish(self.vegetables_topic, item) + else: + warning = "The item {0} is neither a fruit nor a vegetable".format(item) + context.get_logger().warn(warning) + +``` + +If this code is stored in `~/router.py`, then you can deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash + +$ bin/pulsar-admin functions create \ + --py ~/router.py \ + --classname router.RoutingFunction \ + --tenant public \ + --namespace default \ + --name route-fruit-veg \ + --inputs persistent://public/default/basket-items + +``` + +### Functions, messages and message types +Pulsar Functions take byte arrays as inputs and spit out byte arrays as output. However in languages that support typed interfaces(Java), you can write typed Functions, and bind messages to types in the following ways. 
+* [Schema Registry](functions-develop.md#schema-registry) +* [SerDe](functions-develop.md#serde) + + +## Fully Qualified Function Name (FQFN) +Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function tenant, namespace, and function name. FQFN looks like this: + +```http + +tenant/namespace/name + +``` + +FQFNs enable you to create multiple functions with the same name provided that they are in different namespaces. + +## Supported languages +Currently, you can write Pulsar Functions in Java, Python, and Go. For details, refer to [Develop Pulsar Functions](functions-develop.md). + +## Processing guarantees +Pulsar Functions provide three different messaging semantics that you can apply to any function. + +Delivery semantics | Description +:------------------|:------- +**At-most-once** delivery | Each message sent to the function is processed either once or not at all (hence "at most"). +**At-least-once** delivery | Each message sent to the function is processed one or more times (hence the "at least"). +**Effectively-once** delivery | Each message sent to the function will have one output associated with it. + + +### Apply processing guarantees to a function +You can set the processing guarantees for a Pulsar Function when you create the Function. The following [`pulsar-admin functions create`](reference-pulsar-admin.md#create-1) command creates a function with effectively-once guarantees applied. + +```bash + +$ bin/pulsar-admin functions create \ + --name my-effectively-once-function \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs + +``` + +The available options for `--processing-guarantees` are: + +* `ATMOST_ONCE` +* `ATLEAST_ONCE` +* `EFFECTIVELY_ONCE` + +> By default, Pulsar Functions provide at-least-once delivery guarantees. So if you create a function without supplying a value for the `--processing-guarantees` flag, the function provides at-least-once guarantees.
+ +### Update the processing guarantees of a function +You can change the processing guarantees applied to a function using the [`update`](reference-pulsar-admin.md#update-1) command. The following is an example. + +```bash + +$ bin/pulsar-admin functions update \ + --processing-guarantees ATMOST_ONCE \ + # Other function configs + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/functions-package.md b/site2/website/versioned_docs/version-2.9.x/functions-package.md new file mode 100644 index 0000000000000..db2c4e987dc7b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-package.md @@ -0,0 +1,493 @@ +--- +id: functions-package +title: Package Pulsar Functions +sidebar_label: "How-to: Package" +original_id: functions-package +--- + +You can package Pulsar functions in Java, Python, and Go. Packaging the window function in Java is the same as [packaging a function in Java](#java). + +:::note + +Currently, the window function is not available in Python and Go. + +::: + +## Prerequisite + +Before running a Pulsar function, you need to start Pulsar. You can [run a standalone Pulsar in Docker](getting-started-docker.md), or [run Pulsar in Kubernetes](getting-started-helm.md). + +To check whether the Docker image starts, you can use the `docker ps` command. + +## Java + +To package a function in Java, complete the following steps. + +1. Create a new maven project with a pom file. In the following code sample, the value of `mainClass` is your package name. + + ```Java + + + + 4.0.0 + + java-function + java-function + 1.0-SNAPSHOT + + + + org.apache.pulsar + pulsar-functions-api + 2.6.0 + + + + + + + maven-assembly-plugin + + false + + jar-with-dependencies + + + + org.example.test.ExclamationFunction + + + + + + make-assembly + package + + assembly + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + + + ``` + +2. Write a Java function. 
+ + ``` + + package org.example.test; + + import java.util.function.Function; + + public class ExclamationFunction implements Function { + @Override + public String apply(String s) { + return "This is my function!"; + } + } + + ``` + + For the imported package, you can use one of the following interfaces: + - Function interface provided by Java 8: `java.util.function.Function` + - Pulsar Function interface: `org.apache.pulsar.functions.api.Function` + + The main difference between the two interfaces is that the `org.apache.pulsar.functions.api.Function` interface provides the context interface. When you write a function and want to interact with it, you can use context to obtain a wide variety of information and functionality for Pulsar Functions. + + The following example uses `org.apache.pulsar.functions.api.Function` interface with context. + + ``` + + package org.example.functions; + import org.apache.pulsar.functions.api.Context; + import org.apache.pulsar.functions.api.Function; + + import java.util.Arrays; + public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } + } + + ``` + +3. Package the Java function. + + ```bash + + mvn package + + ``` + + After the Java function is packaged, a `target` directory is created automatically. Open the `target` directory to check if there is a JAR package similar to `java-function-1.0-SNAPSHOT.jar`. + + +4. Run the Java function. + + (1) Copy the packaged jar file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Java function using the following command. 
+ + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname org.example.test.ExclamationFunction \ + --jar java-function-1.0-SNAPSHOT.jar \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name JavaFunction + + ``` + + The following log indicates that the Java function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## Python + +Python Function supports the following three formats: + +- One python file +- ZIP file +- PIP + +### One python file + +To package a function with **one python file** in Python, complete the following steps. + +1. Write a Python function. + + ``` + + from pulsar import Function # import the Function class from Pulsar + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + + ``` + + In this example, when you write a Python function, you need to inherit the Function class and implement the `process()` method. + + `process()` mainly has two parameters: + + - `input` represents your input. + + - `context` represents an interface exposed by the Pulsar Function. You can get the attributes in the Python function based on the provided context object. + +2. Install a Python client. + + The implementation of a Python function depends on the Python client, so before deploying a Python function, you need to install the corresponding version of the Python client. + + ```bash + + pip install pulsar-client==2.6.0 + + ``` + +3. Run the Python Function. + + (1) Copy the Python function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Python function using the following command.
+ + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname org.example.test.ExclamationFunction \ + --py \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +### ZIP file + +To package a function with the **ZIP file** in Python, complete the following steps. + +1. Prepare the ZIP file. + + The following is required when packaging the ZIP file of the Python Function. + + ```text + + Assuming the zip file is named as `func.zip`, unzip the `func.zip` folder: + "func/src" + "func/requirements.txt" + "func/deps" + + ``` + + Take [exclamation.zip](https://github.com/apache/pulsar/tree/master/tests/docker-images/latest-version-image/python-examples) as an example. The internal structure of the example is as follows. + + ```text + + . + ├── deps + │   └── sh-1.12.14-py2.py3-none-any.whl + └── src + └── exclamation.py + + ``` + +2. Run the Python Function. + + (1) Copy the ZIP file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Python function using the following command. + + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname exclamation \ + --py \ + --inputs persistent://public/default/in-topic \ + --output persistent://public/default/out-topic \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +### PIP + +The PIP method is only supported in Kubernetes runtime. 
To package a function with **PIP** in Python, complete the following steps. + +1. Configure the `functions_worker.yml` file. + + ```text + + #### Kubernetes Runtime #### + installUserCodeDependencies: true + + ``` + +2. Write your Python Function. + + ``` + + from pulsar import Function + import js2xml + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + // add your logic + return input + '!' + + ``` + + You can introduce additional dependencies. When Python Function detects that the file currently used is `whl` and the `installUserCodeDependencies` parameter is specified, the system uses the `pip install` command to install the dependencies required in Python Function. + +3. Generate the `whl` file. + + ```shell script + + $ cd $PULSAR_HOME/pulsar-functions/scripts/python + $ chmod +x generate.sh + $ ./generate.sh + # e.g: ./generate.sh /path/to/python /path/to/python/output 1.0.0 + + ``` + + The output is written in `/path/to/python/output`: + + ```text + + -rw-r--r-- 1 root staff 1.8K 8 27 14:29 pulsarfunction-1.0.0-py2-none-any.whl + -rw-r--r-- 1 root staff 1.4K 8 27 14:29 pulsarfunction-1.0.0.tar.gz + -rw-r--r-- 1 root staff 0B 8 27 14:29 pulsarfunction.whl + + ``` + +## Go + +To package a function in Go, complete the following steps. + +1. Write a Go function. + + Currently, Go function can be **only** implemented using SDK and the interface of the function is exposed in the form of SDK. Before using the Go function, you need to import "github.com/apache/pulsar/pulsar-function-go/pf". + + ``` + + import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" + ) + + func HandleRequest(ctx context.Context, input []byte) error { + fmt.Println(string(input) + "!") + return nil + } + + func main() { + pf.Start(HandleRequest) + } + + ``` + + You can use context to connect to the Go function. 
+ + ``` + + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } + + ``` + + When writing a Go function, remember that: + - In `main()`, you **only** need to register the function name to `Start()`. **Only** one function name is received in `Start()`. + - Go function uses Go reflection, which is based on the received function name, to verify whether the parameter list and returned value list are correct. The parameter list and returned value list **must match** one of the following function signatures: + + ``` + + func () + func () error + func (input) error + func () (output, error) + func (input) (output, error) + func (context.Context) error + func (context.Context, input) error + func (context.Context) (output, error) + func (context.Context, input) (output, error) + + ``` + +2. Build the Go function. + + ``` + + go build [your go function filename].go + + ``` + +3. Run the Go Function. + + (1) Copy the Go function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + (2) Run the Go function with the following command. + + ``` + + ./bin/pulsar-admin functions localrun \ + --go [your go function path] \ + --inputs [input topics] \ + --output [output topic] \ + --tenant [default:public] \ + --namespace [default:default] \ + --name [custom unique go function name] + + ``` + + The following log indicates that the Go function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## Start Functions in cluster mode +If you want to start a function in cluster mode, replace `localrun` with `create` in the commands above. The following log indicates that your function starts successfully.
+ + ```text + + "Created successfully" + + ``` + +For information about parameters on `--classname`, `--jar`, `--py`, `--go`, `--inputs`, run the command `./bin/pulsar-admin functions` or see [here](reference-pulsar-admin.md#functions). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/functions-runtime.md b/site2/website/versioned_docs/version-2.9.x/functions-runtime.md new file mode 100644 index 0000000000000..7164bd13668af --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-runtime.md @@ -0,0 +1,403 @@ +--- +id: functions-runtime +title: Configure Functions runtime +sidebar_label: "Setup: Configure Functions runtime" +original_id: functions-runtime +--- + +You can use the following methods to run functions. + +- *Thread*: Invoke functions in threads in the functions worker. +- *Process*: Invoke functions in processes forked by the functions worker. +- *Kubernetes*: Submit functions as Kubernetes StatefulSets by the functions worker. + +:::note + +Pulsar supports adding labels to the Kubernetes StatefulSets and services while launching functions, which facilitates selecting the target Kubernetes objects. + +::: + +The differences between the thread and process modes are: +- Thread mode: when a function runs in thread mode, it runs on the same Java virtual machine (JVM) as the functions worker. +- Process mode: when a function runs in process mode, it runs on the same machine that the functions worker runs on. + +## Configure thread runtime +It is easy to configure *Thread* runtime. In most cases, you do not need to configure anything. You can customize the thread group name with the following settings: + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.thread.ThreadRuntimeFactory +functionRuntimeFactoryConfigs: + threadGroupName: "Your Function Container Group" + +``` + +*Thread* runtime is only supported in Java functions.
+ +## Configure process runtime +When you enable *Process* runtime, you do not need to configure anything. + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.process.ProcessRuntimeFactory +functionRuntimeFactoryConfigs: + # the directory for storing the function logs + logDirectory: + # change the jar location only when you put the java instance jar in a different location + javaInstanceJarLocation: + # change the python instance location only when you put the python instance jar in a different location + pythonInstanceLocation: + # change the extra dependencies location: + extraFunctionDependenciesDir: + +``` + +*Process* runtime is supported in Java, Python, and Go functions. + +## Configure Kubernetes runtime + +The Kubernetes runtime works by having the functions worker generate Kubernetes manifests and apply them. If you run the functions worker on Kubernetes, you can use the `serviceAccount` associated with the pod that the functions worker is running in. Otherwise, you can configure it to communicate with a Kubernetes cluster. + +The manifests, generated by the functions worker, include a `StatefulSet`, a `Service` (used to communicate with the pods), and a `Secret` for auth credentials (when applicable). The `StatefulSet` manifest (by default) has a single pod, with the number of replicas determined by the "parallelism" of the function. On pod boot, the pod downloads the function payload (via the functions worker REST API). The pod's container image is configurable, but must have the functions runtime. + +The Kubernetes runtime supports secrets, so you can create a Kubernetes secret and expose it as an environment variable in the pod. The Kubernetes runtime is extensible: you can implement classes to customize how Kubernetes manifests are generated, how auth data is passed to pods, and how secrets are integrated.
+ +:::tip + +For the rules of translating Pulsar object names into Kubernetes resource labels, see [here](admin-api-overview.md#how-to-define-pulsar-resource-names-when-running-pulsar-in-kubernetes). + +::: + +### Basic configuration + +It is easy to configure Kubernetes runtime. You can just uncomment the settings of `kubernetesContainerFactory` in the `functions_worker.yaml` file. The following is an example. + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.kubernetes.KubernetesRuntimeFactory +functionRuntimeFactoryConfigs: + # uri to kubernetes cluster, leave it to empty and it will use the kubernetes settings in function worker + k8Uri: + # the kubernetes namespace to run the function instances. it is `default`, if this setting is left to be empty + jobNamespace: + # The Kubernetes pod name to run the function instances. It is set to + # `pf----` if this setting is left to be empty + jobName: + # the docker image to run function instance. by default it is `apachepulsar/pulsar` + pulsarDockerImageName: + # the docker image to run function instance according to different configurations provided by users. + # By default it is `apachepulsar/pulsar`. + # e.g: + # functionDockerImages: + # JAVA: JAVA_IMAGE_NAME + # PYTHON: PYTHON_IMAGE_NAME + # GO: GO_IMAGE_NAME + functionDockerImages: + # "The image pull policy for image used to run function instance. By default it is `IfNotPresent` + imagePullPolicy: IfNotPresent + # the root directory of pulsar home directory in `pulsarDockerImageName`. by default it is `/pulsar`. + # if you are using your own built image in `pulsarDockerImageName`, you need to set this setting accordingly + pulsarRootDir: + # The config admin CLI allows users to customize the configuration of the admin cli tool, such as: + # `/bin/pulsar-admin and /bin/pulsarctl`. By default it is `/bin/pulsar-admin`. 
If you want to use `pulsarctl` + # you need to set this setting accordingly + configAdminCLI: + # this setting only takes effects if `k8Uri` is set to null. if your function worker is running as a k8 pod, + # setting this to true is let function worker to submit functions to the same k8s cluster as function worker + # is running. setting this to false if your function worker is not running as a k8 pod. + submittingInsidePod: false + # setting the pulsar service url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar service url configured in worker service + pulsarServiceUrl: + # setting the pulsar admin url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar admin url configured in worker service + pulsarAdminUrl: + # The flag indicates to install user code dependencies. (applied to python package) + installUserCodeDependencies: + # The repository that pulsar functions use to download python dependencies + pythonDependencyRepository: + # The repository that pulsar functions use to download extra python dependencies + pythonExtraDependencyRepository: + # the custom labels that function worker uses to select the nodes for pods + customLabels: + # The expected metrics collection interval, in seconds + expectedMetricsCollectionInterval: 30 + # Kubernetes Runtime will periodically checkback on + # this configMap if defined and if there are any changes + # to the kubernetes specific stuff, we apply those changes + changeConfigMap: + # The namespace for storing change config map + changeConfigMapNamespace: + # The ratio cpu request and cpu limit to be set for a function/source/sink. + # The formula for cpu request is cpuRequest = userRequestCpu / cpuOverCommitRatio + cpuOverCommitRatio: 1.0 + # The ratio memory request and memory limit to be set for a function/source/sink. 
+ # The formula for memory request is memoryRequest = userRequestMemory / memoryOverCommitRatio + memoryOverCommitRatio: 1.0 + # The port inside the function pod which is used by the worker to communicate with the pod + grpcPort: 9093 + # The port inside the function pod on which prometheus metrics are exposed + metricsPort: 9094 + # The directory inside the function pod where nar packages will be extracted + narExtractionDirectory: + # The classpath where function instance files are stored + functionInstanceClassPath: + # the directory for dropping extra function dependencies + # if it is not an absolute path, it is relative to `pulsarRootDir` + extraFunctionDependenciesDir: + # Additional memory padding added on top of the memory requested by the function on a per-instance basis + percentMemoryPadding: 10 + # The duration (in seconds) before the StatefulSet is deleted after a function stops or restarts. + # Value must be a non-negative integer. 0 indicates the StatefulSet is deleted immediately. + # Default is 5 seconds. + gracePeriodSeconds: 5 + +``` + +If you run functions worker embedded in a broker on Kubernetes, you can use the default settings. + +### Run standalone functions worker on Kubernetes + +If you run functions worker standalone (that is, not embedded) on Kubernetes, you need to configure `pulsarServiceUrl` to be the URL of the broker and `pulsarAdminUrl` as the URL to the functions worker. + +For example, both Pulsar brokers and Function Workers run in the `pulsar` K8S namespace. The brokers have a service called `broker` and the functions worker has a service called `func-worker`. 
The settings are as follows: + +```yaml + +pulsarServiceUrl: pulsar://broker.pulsar:6650 # or pulsar+ssl://broker.pulsar:6651 if using TLS +pulsarAdminUrl: http://func-worker.pulsar:8080 # or https://func-worker:8443 if using TLS + +``` + +### Run RBAC in Kubernetes clusters + +If you run RBAC in your Kubernetes cluster, make sure that the service account you use for running functions workers (or brokers, if functions workers run along with brokers) has permissions on the following Kubernetes APIs. + +- services +- configmaps +- pods +- apps.statefulsets + +The following is sufficient: + +```yaml + +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: functions-worker +rules: +- apiGroups: [""] + resources: + - services + - configmaps + - pods + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: functions-worker +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: functions-worker +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: functions-worker +subjects: +- kind: ServiceAccount + name: functions-worker + +``` + +If the service-account is not properly configured, an error message similar to this is displayed: + +```bash + +22:04:27.696 [Timer-0] ERROR org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory - Error while trying to fetch configmap example-pulsar-4qvmb5gur3c6fc9dih0x1xn8b-function-worker-config at namespace pulsar +io.kubernetes.client.ApiException: Forbidden + at io.kubernetes.client.ApiClient.handleResponse(ApiClient.java:882) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.ApiClient.execute(ApiClient.java:798) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMapWithHttpInfo(CoreV1Api.java:23673) ~[io.kubernetes-client-java-api-2.0.0.jar:?] 
+ at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMap(CoreV1Api.java:23655) ~[io.kubernetes-client-java-api-2.0.0.jar:?] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory.fetchConfigMap(KubernetesRuntimeFactory.java:284) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory$1.run(KubernetesRuntimeFactory.java:275) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at java.util.TimerThread.mainLoop(Timer.java:555) [?:1.8.0_212] + at java.util.TimerThread.run(Timer.java:505) [?:1.8.0_212] + +``` + +### Integrate Kubernetes secrets + +In order to safely distribute secrets, Pulsar Functions can reference Kubernetes secrets. To enable this, set the `secretsProviderConfiguratorClassName` to `org.apache.pulsar.functions.secretsproviderconfigurator.KubernetesSecretsProviderConfigurator`. + +You can create a secret in the namespace where your functions are deployed. For example, you deploy functions to the `pulsar-func` Kubernetes namespace, and you have a secret named `database-creds` with a field name `password`, which you want to mount in the pod as an environment variable called `DATABASE_PASSWORD`. The following functions configuration enables you to reference that secret and mount the value as an environment variable in the pod. + +```Yaml + +tenant: "mytenant" +namespace: "mynamespace" +name: "myfunction" +topicName: "persistent://mytenant/mynamespace/myfuncinput" +className: "com.company.pulsar.myfunction" + +secrets: + # the secret will be mounted from the `password` field in the `database-creds` secret as an env var called `DATABASE_PASSWORD` + DATABASE_PASSWORD: + path: "database-creds" + key: "password" + +``` + +### Enable token authentication + +When you enable authentication for your Pulsar cluster, you need a mechanism for the pod running your function to authenticate with the broker. 
+ +The `org.apache.pulsar.functions.auth.KubernetesFunctionAuthProvider` interface provides support for any authentication mechanism. The `functionAuthProviderClassName` in `function-worker.yml` is used to specify your path to this implementation. + +Pulsar includes an implementation of this interface for token authentication, and distributes the certificate authority via the same implementation. The configuration is similar as follows: + +```Yaml + +functionAuthProviderClassName: org.apache.pulsar.functions.auth.KubernetesSecretsTokenAuthProvider + +``` + +For token authentication, the functions worker captures the token that is used to deploy (or update) the function. The token is saved as a secret and mounted into the pod. + +For custom authentication or TLS, you need to implement this interface or use an alternative mechanism to provide authentication. If you use token authentication and TLS encryption to secure the communication with the cluster, Pulsar passes your certificate authority (CA) to the client, so the client obtains what it needs to authenticate the cluster, and trusts the cluster with your signed certificate. + +:::note + +If you use tokens that expire when deploying functions, these tokens will expire. + +::: + +### Run clusters with authentication + +When you run a functions worker in a standalone process (that is, not embedded in the broker) in a cluster with authentication, you must configure your functions worker to interact with the broker and authenticate incoming requests. So you need to configure properties that the broker requires for authentication or authorization. + +For example, if you use token authentication, you need to configure the following properties in the `function-worker.yml` file. 
+ +```Yaml + +clientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationToken +clientAuthenticationParameters: file:///etc/pulsar/token/admin-token.txt +configurationStoreServers: zookeeper-cluster:2181 # auth requires a connection to zookeeper +authenticationProviders: + - "org.apache.pulsar.broker.authentication.AuthenticationProviderToken" +authorizationEnabled: true +authenticationEnabled: true +superUserRoles: + - superuser + - proxy +properties: + tokenSecretKey: file:///etc/pulsar/jwt/secret # if using a secret token, key file must be DER-encoded + tokenPublicKey: file:///etc/pulsar/jwt/public.key # if using public/private key tokens, key file must be DER-encoded + +``` + +:::note + +You must configure both the Function Worker authorization or authentication for the server to authenticate requests and configure the client to be authenticated to communicate with the broker. + +::: + +### Customize Kubernetes runtime + +The Kubernetes integration enables you to implement a class and customize how to generate manifests. You can configure it by setting `runtimeCustomizerClassName` in the `functions-worker.yml` file and use the fully qualified class name. You must implement the `org.apache.pulsar.functions.runtime.kubernetes.KubernetesManifestCustomizer` interface. + +The functions (and sinks/sources) API provides a flag, `customRuntimeOptions`, which is passed to this interface. + +To initialize the `KubernetesManifestCustomizer`, you can provide `runtimeCustomizerConfig` in the `functions-worker.yml` file. `runtimeCustomizerConfig` is passed to the `public void initialize(Map config)` function of the interface. `runtimeCustomizerConfig`is different from the `customRuntimeOptions` as `runtimeCustomizerConfig` is the same across all functions. If you provide both `runtimeCustomizerConfig` and `customRuntimeOptions`, you need to decide how to manage these two configurations in your implementation of `KubernetesManifestCustomizer`. 
+ +Pulsar includes a built-in implementation. To use the basic implementation, set `runtimeCustomizerClassName` to `org.apache.pulsar.functions.runtime.kubernetes.BasicKubernetesManifestCustomizer`. The built-in implementation initialized with `runtimeCustomizerConfig` enables you to pass a JSON document as `customRuntimeOptions` with certain properties to augment, which decides how the manifests are generated. If both `runtimeCustomizerConfig` and `customRuntimeOptions` are provided, `BasicKubernetesManifestCustomizer` uses `customRuntimeOptions` to override the configuration if there are conflicts in these two configurations. + +Below is an example of `customRuntimeOptions`. + +```json + +{ + "jobName": "jobname", // the k8s pod name to run this function instance + "jobNamespace": "namespace", // the k8s namespace to run this function in + "extraLabels": { // extra labels to attach to the statefulSet, service, and pods + "extraLabel": "value" + }, + "extraAnnotations": { // extra annotations to attach to the statefulSet, service, and pods + "extraAnnotation": "value" + }, + "nodeSelectorLabels": { // node selector labels to add on to the pod spec + "customLabel": "value" + }, + "tolerations": [ // tolerations to add to the pod spec + { + "key": "custom-key", + "value": "value", + "effect": "NoSchedule" + } + ], + "resourceRequirements": { // values for cpu and memory should be defined as described here: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container + "requests": { + "cpu": 1, + "memory": "4G" + }, + "limits": { + "cpu": 2, + "memory": "8G" + } + } +} + +``` + +## Run clusters with geo-replication + +If you run multiple clusters tied together with geo-replication, it is important to use a different function namespace for each cluster. Otherwise, the functions share a namespace and can potentially be scheduled across clusters. 
+ +For example, if you have two clusters: `east-1` and `west-1`, you can configure the functions workers for `east-1` and `west-1` respectively as follows. + +```Yaml + +pulsarFunctionsCluster: east-1 +pulsarFunctionsNamespace: public/functions-east-1 + +``` + +```Yaml + +pulsarFunctionsCluster: west-1 +pulsarFunctionsNamespace: public/functions-west-1 + +``` + +This ensures the two different Functions Workers use distinct sets of topics for their internal coordination. + +## Configure standalone functions worker + +When configuring a standalone functions worker, you need to configure properties that the broker requires, especially if you use TLS, so that the functions worker can communicate with the broker. + +You need to configure the following required properties. + +```Yaml + +workerPort: 8080 +workerPortTls: 8443 # when using TLS +tlsCertificateFilePath: /etc/pulsar/tls/tls.crt # when using TLS +tlsKeyFilePath: /etc/pulsar/tls/tls.key # when using TLS +tlsTrustCertsFilePath: /etc/pulsar/tls/ca.crt # when using TLS +pulsarServiceUrl: pulsar://broker.pulsar:6650/ # or pulsar+ssl://pulsar-prod-broker.pulsar:6651/ when using TLS +pulsarWebServiceUrl: http://broker.pulsar:8080/ # or https://pulsar-prod-broker.pulsar:8443/ when using TLS +useTls: true # when using TLS, critical! + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/functions-worker.md b/site2/website/versioned_docs/version-2.9.x/functions-worker.md new file mode 100644 index 0000000000000..49fc76b30bdaa --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/functions-worker.md @@ -0,0 +1,386 @@ +--- +id: functions-worker +title: Deploy and manage functions worker +sidebar_label: "Setup: Pulsar Functions Worker" +original_id: functions-worker +--- +Before using Pulsar Functions, you need to learn how to set up Pulsar Functions worker and how to [configure Functions runtime](functions-runtime.md). 
+ +Pulsar `functions-worker` is a logic component to run Pulsar Functions in cluster mode. Two options are available, and you can select either based on your requirements. +- [run with brokers](#run-functions-worker-with-brokers) +- [run it separately](#run-functions-worker-separately) in a different broker + +:::note + +The `--- Service Urls---` lines in the following diagrams represent Pulsar service URLs that Pulsar client and admin use to connect to a Pulsar cluster. + +::: + +## Run Functions-worker with brokers + +The following diagram illustrates the deployment of functions-workers running along with brokers. + +![assets/functions-worker-corun.png](/assets/functions-worker-corun.png) + +To enable functions-worker running as part of a broker, you need to set `functionsWorkerEnabled` to `true` in the `broker.conf` file. + +```conf + +functionsWorkerEnabled=true + +``` + +If the `functionsWorkerEnabled` is set to `true`, the functions-worker is started as part of a broker. You need to configure the `conf/functions_worker.yml` file to customize your functions_worker. + +Before you run Functions-worker with broker, you have to configure Functions-worker, and then start it with brokers. + +### Configure Functions-Worker to run with brokers +In this mode, most of the settings are already inherited from your broker configuration (for example, configurationStore settings, authentication settings, and so on) since `functions-worker` is running as part of the broker. + +Pay attention to the following required settings when configuring functions-worker in this mode. + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`, which is good for standalone deployment. For production deployment, to ensure high availability, set it to be larger than `2`. +- `initializedDlogMetadata`: Whether to initialize distributed log metadata in runtime. 
If it is set to `true`, you must ensure that it has been initialized by `bin/pulsar initialize-cluster-metadata` command. + +If authentication is enabled on the BookKeeper cluster, configure the following BookKeeper authentication settings. + +- `bookkeeperClientAuthenticationPlugin`: the BookKeeper client authentication plugin name. +- `bookkeeperClientAuthenticationParametersName`: the BookKeeper client authentication plugin parameters name. +- `bookkeeperClientAuthenticationParameters`: the BookKeeper client authentication plugin parameters. + +### Configure Stateful-Functions to run with broker + +If you want to use Stateful-Functions related functions (for example, `putState()` and `queryState()` related interfaces), follow steps below. + +1. Enable the **streamStorage** service in the BookKeeper. + + Currently, the service uses the NAR package, so you need to set the configuration in `bookkeeper.conf`. + + ```text + + extraServerComponents=org.apache.bookkeeper.stream.server.StreamStorageLifecycleComponent + + ``` + + After starting bookie, use the following methods to check whether the streamStorage service is started correctly. + + Input: + + ```shell + + telnet localhost 4181 + + ``` + + Output: + + ```text + + Trying 127.0.0.1... + Connected to localhost. + Escape character is '^]'. + + ``` + +2. Turn on this function in `functions_worker.yml`. + + ```text + + stateStorageServiceUrl: bk://:4181 + + ``` + + `bk-service-url` is the service URL pointing to the BookKeeper table service. + +### Start Functions-worker with broker + +Once you have configured the `functions_worker.yml` file, you can start or restart your broker. + +And then you can use the following command to verify if `functions-worker` is running well. + +```bash + +curl :8080/admin/v2/worker/cluster + +``` + +After entering the command above, a list of active function workers in the cluster is returned. The output is similar to the following. 
+ +```json + +[{"workerId":"","workerHostname":"","port":8080}] + +``` + +## Run Functions-worker separately + +This section illustrates how to run `functions-worker` as a separate process in separate machines. + +![assets/functions-worker-separated.png](/assets/functions-worker-separated.png) + +:::note + +In this mode, make sure `functionsWorkerEnabled` is set to `false`, so you won't start `functions-worker` with brokers by mistake. Also, while accessing the `functions-worker` to manage any of the functions, the `pulsar-admin` CLI tool or any of the clients should use the `workerHostname` and `workerPort` that you set in [Worker parameters](#worker-parameters) to generate an `--admin-url`. + +::: + +### Configure Functions-worker to run separately + +To run function-worker separately, you have to configure the following parameters. + +#### Worker parameters + +- `workerId`: The type is string. It is unique across clusters, which is used to identify a worker machine. +- `workerHostname`: The hostname of the worker machine. +- `workerPort`: The port that the worker server listens on. Keep it as default if you don't customize it. +- `workerPortTls`: The TLS port that the worker server listens on. Keep it as default if you don't customize it. + +#### Function package parameter + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`. + +#### Function metadata parameter + +- `pulsarServiceUrl`: The Pulsar service URL for your broker cluster. +- `pulsarWebServiceUrl`: The Pulsar web service URL for your broker cluster. +- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). + +If authentication is enabled for your broker cluster, you *should* configure the authentication plugin and parameters for the functions worker to communicate with the brokers. 
+ +- `brokerClientAuthenticationEnabled`: Whether to enable the broker client authentication used by function workers to talk to brokers. +- `clientAuthenticationPlugin`: The authentication plugin to be used by the Pulsar client used in worker service. +- `clientAuthenticationParameters`: The authentication parameter to be used by the Pulsar client used in worker service. + +#### Security settings + +If you want to enable security on functions workers, you *should*: +- [Enable TLS transport encryption](#enable-tls-transport-encryption) +- [Enable Authentication Provider](#enable-authentication-provider) +- [Enable Authorization Provider](#enable-authorization-provider) +- [Enable End-to-End Encryption](#enable-end-to-end-encryption) + +##### Enable TLS transport encryption + +To enable TLS transport encryption, configure the following settings. + +``` + +useTLS: true +pulsarServiceUrl: pulsar+ssl://localhost:6651/ +pulsarWebServiceUrl: https://localhost:8443 + +tlsEnabled: true +tlsCertificateFilePath: /path/to/functions-worker.cert.pem +tlsKeyFilePath: /path/to/functions-worker.key-pk8.pem +tlsTrustCertsFilePath: /path/to/ca.cert.pem + +// The path to trusted certificates used by the Pulsar client to authenticate with Pulsar brokers +brokerClientTrustCertsFilePath: /path/to/ca.cert.pem + +``` + +For details on TLS encryption, refer to [Transport Encryption using TLS](security-tls-transport.md). + +##### Enable Authentication Provider + +To enable authentication on Functions Worker, you need to configure the following settings. + +:::note + +Substitute the *providers list* with the providers you want to enable. + +::: + +``` + +authenticationEnabled: true +authenticationProviders: [ provider1, provider2 ] + +``` + +For *TLS Authentication* provider, follow the example below to add the necessary settings. +See [TLS Authentication](security-tls-authentication.md) for more details. 
+ +``` + +brokerClientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters: tlsCertFile:/path/to/admin.cert.pem,tlsKeyFile:/path/to/admin.key-pk8.pem + +authenticationEnabled: true +authenticationProviders: ['org.apache.pulsar.broker.authentication.AuthenticationProviderTls'] + +``` + +For *SASL Authentication* provider, add `saslJaasClientAllowedIds` and `saslJaasBrokerSectionName` +under `properties` if needed. + +``` + +properties: + saslJaasClientAllowedIds: .*pulsar.* + saslJaasBrokerSectionName: Broker + +``` + +For *Token Authentication* provider, add necessary settings for `properties` if needed. +See [Token Authentication](security-jwt.md) for more details. +Note: key files must be DER-encoded. + +``` + +properties: + tokenSecretKey: file:///my/secret.key + # If using public/private + # tokenPublicKey: file:///path/to/public.key + +``` + +##### Enable Authorization Provider + +To enable authorization on Functions Worker, you need to configure `authorizationEnabled`, `authorizationProvider` and `configurationStoreServers`. The authorization provider connects to `configurationStoreServers` to receive namespace policies. + +```yaml + +authorizationEnabled: true +authorizationProvider: org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider +configurationStoreServers: + +``` + +You should also configure a list of superuser roles. The superuser roles are able to access any admin API. The following is a configuration example. + +```yaml + +superUserRoles: + - role1 + - role2 + - role3 + +``` + +##### Enable End-to-End Encryption + +You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. 
+ +To enable End-to-End encryption on Functions Worker, you can set it by specifying `--producer-config` in the command line terminal, for more information, please refer to [here](security-encryption.md). + +We include the relevant configuration information of `CryptoConfig` into `ProducerConfig`. The specific configurable field information about `CryptoConfig` is as follows: + +```text + +public class CryptoConfig { + private String cryptoKeyReaderClassName; + private Map cryptoKeyReaderConfig; + + private String[] encryptionKeys; + private ProducerCryptoFailureAction producerCryptoFailureAction; + + private ConsumerCryptoFailureAction consumerCryptoFailureAction; +} + +``` + +- `producerCryptoFailureAction`: define the action if producer fail to encrypt data one of `FAIL`, `SEND`. +- `consumerCryptoFailureAction`: define the action if consumer fail to decrypt data one of `FAIL`, `DISCARD`, `CONSUME`. + +#### BookKeeper Authentication + +If authentication is enabled on the BookKeeper cluster, you need configure the BookKeeper authentication settings as follows: + +- `bookkeeperClientAuthenticationPlugin`: the plugin name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParametersName`: the plugin parameters name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParameters`: the plugin parameters of BookKeeper client authentication. 
+ +### Start Functions-worker + +Once you have finished configuring the `functions_worker.yml` configuration file, you can start a `functions-worker` in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash + +bin/pulsar-daemon start functions-worker + +``` + +You can also start `functions-worker` in the foreground by using `pulsar` CLI tool: + +```bash + +bin/pulsar functions-worker + +``` + +### Configure Proxies for Functions-workers + +When you are running `functions-worker` in a separate cluster, the admin rest endpoints are split into two clusters. `functions`, `function-worker`, `source` and `sink` endpoints are now served +by the `functions-worker` cluster, while all the other remaining endpoints are served by the broker cluster. +Hence you need to configure your `pulsar-admin` to use the right service URL accordingly. + +In order to address this inconvenience, you can start a proxy cluster for routing the admin rest requests accordingly. Hence you will have one central entry point for your admin service. + +If you already have a proxy cluster, continue reading. If you haven't setup a proxy cluster before, you can follow the [instructions](http://pulsar.apache.org/docs/en/administration-proxy/) to +start proxies. + +![assets/functions-worker-separated.png](/assets/functions-worker-separated-proxy.png) + +To enable routing functions related admin requests to `functions-worker` in a proxy, you can edit the `proxy.conf` file to modify the following settings: + +```conf + +functionWorkerWebServiceURL= +functionWorkerWebServiceURLTLS= + +``` + +## Compare the Run-with-Broker and Run-separately modes + +As described above, you can run Function-worker with brokers, or run it separately. And it is more convenient to run functions-workers along with brokers. 
However, running functions-workers in a separate cluster provides better resource isolation for running functions in `Process` or `Thread` mode. + +To determine which mode suits your use case, refer to the following guidelines. + +Use the `Run-with-Broker` mode in the following cases: +- a) if resource isolation is not required when running functions in `Process` or `Thread` mode; +- b) if you configure the functions-worker to run functions on Kubernetes (where the resource isolation problem is addressed by Kubernetes). + +Use the `Run-separately` mode in the following cases: +- a) if you don't have a Kubernetes cluster; +- b) if you want to run functions and brokers separately. + +## Troubleshooting + +**Error message: Namespace missing local cluster name in clusters list** + +``` + +Failed to get partitioned topic metadata: org.apache.pulsar.client.api.PulsarClientException$BrokerMetadataException: Namespace missing local cluster name in clusters list: local_cluster=xyz ns=public/functions clusters=[standalone] + +``` + +The error message is displayed when either of the following cases occurs: +- a) a broker is started with `functionsWorkerEnabled=true`, but the `pulsarFunctionsCluster` is not set to the correct cluster in the `conf/functions_worker.yml` file; +- b) setting up a geo-replicated Pulsar cluster with `functionsWorkerEnabled=true`, while brokers in one cluster run well, brokers in the other cluster do not work well. + +**Workaround** + +If any of these cases happens, follow the instructions below to fix the problem: + +1. Disable Functions Worker by setting `functionsWorkerEnabled=false`, and restart brokers. + +2. Get the current clusters list of `public/functions` namespace. + +```bash + +bin/pulsar-admin namespaces get-clusters public/functions + +``` + +3. Check if the cluster is in the clusters list. If the cluster is not in the list, add it to the list and update the clusters list. 
+ +```bash + +bin/pulsar-admin namespaces set-clusters --clusters , public/functions + +``` + +4. After setting the cluster successfully, enable functions worker by setting `functionsWorkerEnabled=true`. + +5. Set the correct cluster name in `pulsarFunctionsCluster` in the `conf/functions_worker.yml` file, and restart brokers. diff --git a/site2/website/versioned_docs/version-2.9.x/getting-started-concepts-and-architecture.md b/site2/website/versioned_docs/version-2.9.x/getting-started-concepts-and-architecture.md new file mode 100644 index 0000000000000..fe9c3fbc553b2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/getting-started-concepts-and-architecture.md @@ -0,0 +1,16 @@ +--- +id: concepts-architecture +title: Pulsar concepts and architecture +sidebar_label: "Concepts and architecture" +original_id: concepts-architecture +--- + + + + + + + + + + diff --git a/site2/website/versioned_docs/version-2.9.x/getting-started-docker.md b/site2/website/versioned_docs/version-2.9.x/getting-started-docker.md new file mode 100644 index 0000000000000..858ec05d4bc82 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/getting-started-docker.md @@ -0,0 +1,210 @@ +--- +id: getting-started-docker +title: Set up a standalone Pulsar in Docker +sidebar_label: "Run Pulsar in Docker" +original_id: getting-started-docker +--- + +For local development and testing, you can run Pulsar in standalone mode on your own machine within a Docker container. + +If you have not installed Docker, download the [Community edition](https://www.docker.com/community-edition) and follow the instructions for your OS. 
+ +## Start Pulsar in Docker + +* For MacOS, Linux, and Windows: + + ```shell + + $ docker run -it -p 6650:6650 -p 8080:8080 --mount source=pulsardata,target=/pulsar/data --mount source=pulsarconf,target=/pulsar/conf apachepulsar/pulsar:@pulsar:version@ bin/pulsar standalone + + ``` + +A few things to note about this command: + * The data, metadata, and configuration are persisted on Docker volumes in order to not start "fresh" every +time the container is restarted. For details on the volumes you can use `docker volume inspect ` + * For Docker on Windows make sure to configure it to use Linux containers + +If you start Pulsar successfully, you will see `INFO`-level log messages like this: + +``` + +08:18:30.970 [main] INFO org.apache.pulsar.broker.web.WebService - HTTP Service started at http://0.0.0.0:8080 +... +07:53:37.322 [main] INFO org.apache.pulsar.broker.PulsarService - messaging service is ready, bootstrap service port = 8080, broker url= pulsar://localhost:6650, cluster=standalone, configs=org.apache.pulsar.broker.ServiceConfiguration@98b63c1 +... + +``` + +:::tip + +When you start a local standalone cluster, a `public/default` namespace is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). + +::: + +## Use Pulsar in Docker + +Pulsar offers client libraries for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). If you're running a local standalone cluster, you can +use one of these root URLs to interact with your cluster: + +* `pulsar://localhost:6650` +* `http://localhost:8080` + +The following example guides you to get started with Pulsar quickly by using the [Python client API](client-libraries-python.md). 
+ +Install the Pulsar Python client library directly from [PyPI](https://pypi.org/project/pulsar-client/): + +```shell + +$ pip install pulsar-client + +``` + +### Consume a message + +Create a consumer and subscribe to the topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() + +``` + +### Produce a message + +Now start a producer to send some test messages: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('hello-pulsar-%d' % i).encode('utf-8')) + +client.close() + +``` + +## Get the topic statistics + +In Pulsar, you can use REST, Java, or command-line tools to control every aspect of the system. +For details on APIs, refer to [Admin API Overview](admin-api-overview.md). 
+ +In the simplest example, you can use curl to probe the stats for a particular topic: + +```shell + +$ curl http://localhost:8080/admin/v2/persistent/public/default/my-topic/stats | python -m json.tool + +``` + +The output is something like this: + +```json + +{ + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesInCounter": 7097, + "msgInCounter": 143, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "averageMsgSize": 0.0, + "msgChunkPublished": false, + "storageSize": 7097, + "backlogSize": 0, + "offloadedStorageSize": 0, + "publishers": [ + { + "accessMode": "Shared", + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "averageMsgSize": 0.0, + "chunkedMessageRate": 0.0, + "producerId": 0, + "metadata": {}, + "address": "/127.0.0.1:35604", + "connectedSince": "2021-07-04T09:05:43.04788Z", + "clientVersion": "2.8.0", + "producerName": "standalone-2-5" + } + ], + "waitingPublishers": 0, + "subscriptions": { + "my-sub": { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0, + "msgBacklog": 0, + "backlogSize": 0, + "msgBacklogNoDelayed": 0, + "blockedSubscriptionOnUnackedMsgs": false, + "msgDelayed": 0, + "unackedMessages": 0, + "type": "Exclusive", + "activeConsumerName": "3c544f1daa", + "msgRateExpired": 0.0, + "totalMsgExpired": 0, + "lastExpireTimestamp": 0, + "lastConsumedFlowTimestamp": 1625389101290, + "lastConsumedTimestamp": 1625389546070, + "lastAckedTimestamp": 1625389546162, + "lastMarkDeleteAdvancedTimestamp": 1625389546163, + "consumers": [ + { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0.0, + "consumerName": "3c544f1daa", + "availablePermits": 867, + "unackedMessages": 0, + "avgMessagesPerEntry": 6, + 
"blockedConsumerOnUnackedMsgs": false, + "lastAckedTimestamp": 1625389546162, + "lastConsumedTimestamp": 1625389546070, + "metadata": {}, + "address": "/127.0.0.1:35472", + "connectedSince": "2021-07-04T08:58:21.287682Z", + "clientVersion": "2.8.0" + } + ], + "isDurable": true, + "isReplicated": false, + "allowOutOfOrderDelivery": false, + "consumersAfterMarkDeletePosition": {}, + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0, + "durable": true, + "replicated": false + } + }, + "replication": {}, + "deduplicationStatus": "Disabled", + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0 +} + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/getting-started-helm.md b/site2/website/versioned_docs/version-2.9.x/getting-started-helm.md new file mode 100644 index 0000000000000..5e9f7044a6d74 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/getting-started-helm.md @@ -0,0 +1,447 @@ +--- +id: getting-started-helm +title: Get started in Kubernetes +sidebar_label: "Run Pulsar in Kubernetes" +original_id: getting-started-helm +--- + +This section guides you through every step of installing and running Apache Pulsar with Helm on Kubernetes quickly, including the following sections: + +- Install the Apache Pulsar on Kubernetes using Helm +- Start and stop Apache Pulsar +- Create topics using `pulsar-admin` +- Produce and consume messages using Pulsar clients +- Monitor Apache Pulsar status with Prometheus and Grafana + +For deploying a Pulsar cluster for production usage, read the documentation on [how to configure and install a Pulsar Helm chart](helm-deploy.md). + +## Prerequisite + +- Kubernetes server 1.14.0+ +- kubectl 1.14.0+ +- Helm 3.0+ + +:::tip + +For the following steps, step 2 and step 3 are for **developers** and step 4 and step 5 are for **administrators**. 
+ +::: + +## Step 0: Prepare a Kubernetes cluster + +Before installing a Pulsar Helm chart, you have to create a Kubernetes cluster. You can follow [the instructions](helm-prepare.md) to prepare a Kubernetes cluster. + +We use [Minikube](https://minikube.sigs.k8s.io/docs/start/) in this quick start guide. To prepare a Kubernetes cluster, follow these steps: + +1. Create a Kubernetes cluster on Minikube. + + ```bash + + minikube start --memory=8192 --cpus=4 --kubernetes-version=<k8s-version> + + ``` + + The `<k8s-version>` can be any [Kubernetes version supported by your Minikube installation](https://minikube.sigs.k8s.io/docs/reference/configuration/kubernetes/), such as `v1.16.1`. + +2. Set `kubectl` to use Minikube. + + ```bash + + kubectl config use-context minikube + + ``` + +3. To use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) with the local Kubernetes cluster on Minikube, enter the command below: + + ```bash + + minikube dashboard + + ``` + + The command automatically triggers opening a webpage in your browser. + +## Step 1: Install Pulsar Helm chart + +1. Add Pulsar charts repo. + + ```bash + + helm repo add apache https://pulsar.apache.org/charts + + ``` + + ```bash + + helm repo update + + ``` + +2. Clone the Pulsar Helm chart repository. + + ```bash + + git clone https://github.com/apache/pulsar-helm-chart + cd pulsar-helm-chart + + ``` + +3. Run the script `prepare_helm_release.sh` to create secrets required for installing the Apache Pulsar Helm chart. The username `pulsar` and password `pulsar` are used for logging into the Grafana dashboard and Pulsar Manager. + + :::note + + When running the script, you can use `-n` to specify the Kubernetes namespace where the Pulsar Helm chart is installed, `-k` to define the Pulsar Helm release name, and `-c` to create the Kubernetes namespace. For more information about the script, run `./scripts/pulsar/prepare_helm_release.sh --help`.
+ + ::: + + ```bash + + ./scripts/pulsar/prepare_helm_release.sh \ + -n pulsar \ + -k pulsar-mini \ + -c + + ``` + +4. Use the Pulsar Helm chart to install a Pulsar cluster to Kubernetes. + + :::note + + You need to specify `--set initialize=true` when installing Pulsar the first time. This command installs and starts Apache Pulsar. + + ::: + + ```bash + + helm install \ + --values examples/values-minikube.yaml \ + --set initialize=true \ + --namespace pulsar \ + pulsar-mini apache/pulsar + + ``` + +5. Check the status of all pods. + + ```bash + + kubectl get pods -n pulsar + + ``` + + If all pods start up successfully, you can see that the `STATUS` is changed to `Running` or `Completed`. + + **Output** + + ```bash + + NAME READY STATUS RESTARTS AGE + pulsar-mini-bookie-0 1/1 Running 0 9m27s + pulsar-mini-bookie-init-5gphs 0/1 Completed 0 9m27s + pulsar-mini-broker-0 1/1 Running 0 9m27s + pulsar-mini-grafana-6b7bcc64c7-4tkxd 1/1 Running 0 9m27s + pulsar-mini-prometheus-5fcf5dd84c-w8mgz 1/1 Running 0 9m27s + pulsar-mini-proxy-0 1/1 Running 0 9m27s + pulsar-mini-pulsar-init-t7cqt 0/1 Completed 0 9m27s + pulsar-mini-pulsar-manager-9bcbb4d9f-htpcs 1/1 Running 0 9m27s + pulsar-mini-toolset-0 1/1 Running 0 9m27s + pulsar-mini-zookeeper-0 1/1 Running 0 9m27s + + ``` + +6. Check the status of all services in the namespace `pulsar`. 
+ + ```bash + + kubectl get services -n pulsar + + ``` + + **Output** + + ```bash + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + pulsar-mini-bookie ClusterIP None 3181/TCP,8000/TCP 11m + pulsar-mini-broker ClusterIP None 8080/TCP,6650/TCP 11m + pulsar-mini-grafana LoadBalancer 10.106.141.246 3000:31905/TCP 11m + pulsar-mini-prometheus ClusterIP None 9090/TCP 11m + pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 11m + pulsar-mini-pulsar-manager LoadBalancer 10.103.192.175 9527:30190/TCP 11m + pulsar-mini-toolset ClusterIP None 11m + pulsar-mini-zookeeper ClusterIP None 2888/TCP,3888/TCP,2181/TCP 11m + + ``` + +## Step 2: Use pulsar-admin to create Pulsar tenants/namespaces/topics + +`pulsar-admin` is the CLI (command-Line Interface) tool for Pulsar. In this step, you can use `pulsar-admin` to create resources, including tenants, namespaces, and topics. + +1. Enter the `toolset` container. + + ```bash + + kubectl exec -it -n pulsar pulsar-mini-toolset-0 -- /bin/bash + + ``` + +2. In the `toolset` container, create a tenant named `apache`. + + ```bash + + bin/pulsar-admin tenants create apache + + ``` + + Then you can list the tenants to see if the tenant is created successfully. + + ```bash + + bin/pulsar-admin tenants list + + ``` + + You should see a similar output as below. The tenant `apache` has been successfully created. + + ```bash + + "apache" + "public" + "pulsar" + + ``` + +3. In the `toolset` container, create a namespace named `pulsar` in the tenant `apache`. + + ```bash + + bin/pulsar-admin namespaces create apache/pulsar + + ``` + + Then you can list the namespaces of tenant `apache` to see if the namespace is created successfully. + + ```bash + + bin/pulsar-admin namespaces list apache + + ``` + + You should see a similar output as below. The namespace `apache/pulsar` has been successfully created. + + ```bash + + "apache/pulsar" + + ``` + +4. 
In the `toolset` container, create a topic `test-topic` with `4` partitions in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics create-partitioned-topic apache/pulsar/test-topic -p 4 + + ``` + +5. In the `toolset` container, list all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics list-partitioned-topics apache/pulsar + + ``` + + Then you can see all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + "persistent://apache/pulsar/test-topic" + + ``` + +## Step 3: Use Pulsar client to produce and consume messages + +You can use the Pulsar client to create producers and consumers to produce and consume messages. + +By default, the Pulsar Helm chart exposes the Pulsar cluster through a Kubernetes `LoadBalancer`. In Minikube, you can use the following command to check the proxy service. + +```bash + +kubectl get services -n pulsar | grep pulsar-mini-proxy + +``` + +You will see a similar output as below. + +```bash + +pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 28m + +``` + +This output tells what are the node ports that Pulsar cluster's binary port and HTTP port are mapped to. The port after `80:` is the HTTP port while the port after `6650:` is the binary port. + +Then you can find the IP address and exposed ports of your Minikube server by running the following command. + +```bash + +minikube service pulsar-mini-proxy -n pulsar + +``` + +**Output** + +```bash + +|-----------|-------------------|-------------|-------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|-------------------------| +| pulsar | pulsar-mini-proxy | http/80 | http://172.17.0.4:32305 | +| | | pulsar/6650 | http://172.17.0.4:31816 | +|-----------|-------------------|-------------|-------------------------| +🏃 Starting tunnel for service pulsar-mini-proxy. 
+|-----------|-------------------|-------------|------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|------------------------| +| pulsar | pulsar-mini-proxy | | http://127.0.0.1:61853 | +| | | | http://127.0.0.1:61854 | +|-----------|-------------------|-------------|------------------------| + +``` + +At this point, you can get the service URLs to connect to your Pulsar client. Here are URL examples: + +``` + +webServiceUrl=http://127.0.0.1:61853/ +brokerServiceUrl=pulsar://127.0.0.1:61854/ + +``` + +Then you can proceed with the following steps: + +1. Download the Apache Pulsar tarball from the [downloads page](https://pulsar.apache.org/download/). + +2. Decompress the tarball based on your download file. + + ```bash + + tar -xf <file-name>.tar.gz + + ``` + +3. Expose `PULSAR_HOME`. + + (1) Enter the directory of the decompressed download file. + + (2) Expose `PULSAR_HOME` as the environment variable. + + ```bash + + export PULSAR_HOME=$(pwd) + + ``` + +4. Configure the Pulsar client. + + In the `${PULSAR_HOME}/conf/client.conf` file, replace `webServiceUrl` and `brokerServiceUrl` with the service URLs you get from the above steps. + +5. Create a subscription to consume messages from `apache/pulsar/test-topic`. + + ```bash + + bin/pulsar-client consume -s sub apache/pulsar/test-topic -n 0 + + ``` + +6. Open a new terminal. In the new terminal, create a producer and send 10 messages to the `test-topic` topic. + + ```bash + + bin/pulsar-client produce apache/pulsar/test-topic -m "---------hello apache pulsar-------" -n 10 + + ``` + +7. Verify the results. + + - From the producer side + + **Output** + + The messages have been produced successfully. + + ```bash + + 18:15:15.489 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 10 messages successfully produced + + ``` + + - From the consumer side + + **Output** + + At the same time, you can receive the messages as below.
+ + ```bash + + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + + ``` + +## Step 4: Use Pulsar Manager to manage the cluster + +[Pulsar Manager](administration-pulsar-manager.md) is a web-based GUI management tool for managing and monitoring Pulsar. + +1. By default, the `Pulsar Manager` is exposed as a separate `LoadBalancer`. You can open the Pulsar Manager UI using the following command: + + ```bash + + minikube service -n pulsar pulsar-mini-pulsar-manager + + ``` + +2. The Pulsar Manager UI will be open in your browser. You can use the username `pulsar` and password `pulsar` to log into Pulsar Manager. + +3. In Pulsar Manager UI, you can create an environment. + + - Click `New Environment` button in the top-left corner. + - Type `pulsar-mini` for the field `Environment Name` in the popup window. + - Type `http://pulsar-mini-broker:8080` for the field `Service URL` in the popup window. + - Click `Confirm` button in the popup window. + +4. After successfully creating an environment, you are redirected to the `tenants` page of that environment. Then you can create `tenants`, `namespaces` and `topics` using the Pulsar Manager. + +## Step 5: Use Prometheus and Grafana to monitor cluster + +Grafana is an open-source visualization tool, which can be used for visualizing time series data into dashboards. + +1. By default, the Grafana is exposed as a separate `LoadBalancer`. 
You can open the Grafana UI using the following command: + + ```bash + + minikube service pulsar-mini-grafana -n pulsar + + ``` + +2. The Grafana UI is open in your browser. You can use the username `pulsar` and password `pulsar` to log into the Grafana Dashboard. + +3. You can view dashboards for different components of a Pulsar cluster. diff --git a/site2/website/versioned_docs/version-2.9.x/getting-started-pulsar.md b/site2/website/versioned_docs/version-2.9.x/getting-started-pulsar.md new file mode 100644 index 0000000000000..752590f57b558 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/getting-started-pulsar.md @@ -0,0 +1,72 @@ +--- +id: pulsar-2.0 +title: Pulsar 2.0 +sidebar_label: "Pulsar 2.0" +original_id: pulsar-2.0 +--- + +Pulsar 2.0 is a major new release for Pulsar that brings some bold changes to the platform, including [simplified topic names](#topic-names), the addition of the [Pulsar Functions](functions-overview.md) feature, some terminology changes, and more. + +## New features in Pulsar 2.0 + +Feature | Description +:-------|:----------- +[Pulsar Functions](functions-overview.md) | A lightweight compute option for Pulsar + +## Major changes + +There are a few major changes that you should be aware of, as they may significantly impact your day-to-day usage. + +### Properties versus tenants + +Previously, Pulsar had a concept of properties. A property is essentially the exact same thing as a tenant, so the "property" terminology has been removed in version 2.0. The [`pulsar-admin properties`](reference-pulsar-admin.md#pulsar-admin) command-line interface, for example, has been replaced with the [`pulsar-admin tenants`](reference-pulsar-admin.md#pulsar-admin-tenants) interface. In some cases the properties terminology is still used but is now considered deprecated and will be removed entirely in a future release. 
+ +### Topic names + +Prior to version 2.0, *all* Pulsar topics had the following form: + +```http + +{persistent|non-persistent}://property/cluster/namespace/topic + +``` + +The following important changes have been made in Pulsar 2.0: + +* There is no longer a [cluster component](#no-cluster-component) +* Properties have been [renamed to tenants](#properties-versus-tenants) +* You can use a [flexible](#flexible-topic-naming) naming system to shorten many topic names +* `/` is not allowed in topic names + +#### No cluster component + +The cluster component has been removed from topic names. Thus, all topic names now have the following form: + +```http + +{persistent|non-persistent}://tenant/namespace/topic + +``` + +> Existing topics that use the legacy name format will continue to work without any change, and there are no plans to change that. + + +#### Flexible topic naming + +All topic names in Pulsar 2.0 internally have the form shown [above](#no-cluster-component) but you can now use shorthand names in many cases (for the sake of simplicity). The flexible naming system stems from the fact that there is now a default topic type, tenant, and namespace: + +Topic aspect | Default +:------------|:------- +topic type | `persistent` +tenant | `public` +namespace | `default` + +The table below shows some example topic name translations that use implicit defaults: + +Input topic name | Translated topic name +:----------------|:--------------------- +`my-topic` | `persistent://public/default/my-topic` +`my-tenant/my-namespace/my-topic` | `persistent://my-tenant/my-namespace/my-topic` + +> For [non-persistent topics](concepts-messaging.md#non-persistent-topics) you'll need to continue to specify the entire topic name, as the default-based rules for persistent topic names don't apply.
Thus you cannot use a shorthand name like `non-persistent://my-topic` and would need to use `non-persistent://public/default/my-topic` instead + diff --git a/site2/website/versioned_docs/version-2.9.x/getting-started-standalone.md b/site2/website/versioned_docs/version-2.9.x/getting-started-standalone.md new file mode 100644 index 0000000000000..8192f374300d6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/getting-started-standalone.md @@ -0,0 +1,268 @@ +--- +id: getting-started-standalone +title: Set up a standalone Pulsar locally +sidebar_label: "Run Pulsar locally" +original_id: getting-started-standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. The standalone mode includes a Pulsar broker, the necessary ZooKeeper and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> **Pulsar in production?** +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of installing Pulsar locally. + +### System requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions, JRE/JDK 11 is recommended. + +:::tip + +By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in `conf/pulsar_env.sh` file under `PULSAR_MEM`. This is extra options passed into JVM. + +::: + +:::note + +Broker is only supported on 64-bit JVM. 
+ +::: + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar @pulsar:version@ binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:binary_release_url + + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/). +`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more. +`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) example. +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the`.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by ZooKeeper and BookKeeper. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`logs` | Logs created by the installation. 
+ +:::tip + +If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +* [Install builtin connectors (optional)](#install-builtin-connectors-optional) +* [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing builtin connectors and tiered storage offloaders. + +::: + +### Install builtin connectors (optional) + +Since `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. +To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. +For example, if you download the `pulsar-io-aerospike-@pulsar:version@.nar` connector file, enter the following commands: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure `connectors` tarball is unzipped in every pulsar directory of the broker (or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g.
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +::: + +### Install tiered storage offloaders (optional) + +:::tip + +- Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +- To enable tiered storage feature, follow the instructions below; otherwise skip this section. + +::: + +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. 
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +::: + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. + +```bash + +$ bin/pulsar standalone + +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash + +21:59:29.327 [DLM-/stream/storage-OrderedScheduler-3-0] INFO org.apache.bookkeeper.stream.storage.impl.sc.StorageContainerImpl - Successfully started storage container (0). +21:59:34.576 [main] INFO org.apache.pulsar.broker.authentication.AuthenticationService - Authentication is disabled +21:59:34.576 [main] INFO org.apache.pulsar.websocket.WebSocketService - Pulsar WebSocket Service started + +``` + +:::tip + +* The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. + +::: + +You can also run the service as a background process using the `pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. 
For more information, see [Topics](concepts-messaging.md#topics). + +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. + +### Consume a message + +The following command consumes a message with the subscription name `first-subscription` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client consume my-topic -s "first-subscription" + +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:17:16.781 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully consumed + +``` + +:::tip + +As you may have noticed, we did not explicitly create the `my-topic` topic from which we consume the message. When you consume a message from a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +::: + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" + +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:21:08.693 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced + +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +:::tip + +If the service runs as a background process using the `pulsar-daemon start standalone` command, then use the `pulsar-daemon stop standalone` command to stop the service. +For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon).
+ +::: + diff --git a/site2/website/versioned_docs/version-2.9.x/helm-deploy.md b/site2/website/versioned_docs/version-2.9.x/helm-deploy.md new file mode 100644 index 0000000000000..0e7815e4f4d90 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/helm-deploy.md @@ -0,0 +1,434 @@ +--- +id: helm-deploy +title: Deploy Pulsar cluster using Helm +sidebar_label: "Deployment" +original_id: helm-deploy +--- + +Before running `helm install`, you need to decide how to run Pulsar. +Options can be specified using Helm's `--set option.name=value` command line option. + +## Select configuration options + +In each section, collect the options that are combined to use with the `helm install` command. + +### Kubernetes namespace + +By default, the Pulsar Helm chart is installed to a namespace called `pulsar`. + +```yaml + +namespace: pulsar + +``` + +To install the Pulsar Helm chart into a different Kubernetes namespace, you can include this option in the `helm install` command. + +```bash + +--set namespace= + +``` + +By default, the Pulsar Helm chart doesn't create the namespace. + +```yaml + +namespaceCreate: false + +``` + +To use the Pulsar Helm chart to create the Kubernetes namespace automatically, you can include this option in the `helm install` command. + +```bash + +--set namespaceCreate=true + +``` + +### Persistence + +By default, the Pulsar Helm chart creates Volume Claims with the expectation that a dynamic provisioner creates the underlying Persistent Volumes. + +```yaml + +volumes: + persistence: true + # configure the components to use local persistent volume + # the local provisioner should be installed prior to enable local persistent volume + local_storage: false + +``` + +To use local persistent volumes as the persistent storage for Helm release, you can install the [local storage provisioner](#install-local-storage-provisioner) and include the following option in the `helm install` command. 
+ +```bash + +--set volumes.local_storage=true + +``` + +:::note + +Before installing the production instance of Pulsar, ensure to plan the storage settings to avoid extra storage migration work. Because after initial installation, you must edit Kubernetes objects manually if you want to change storage settings. + +::: + +The Pulsar Helm chart is designed for production use. To use the Pulsar Helm chart in a development environment (such as Minikube), you can disable persistence by including this option in your `helm install` command. + +```bash + +--set volumes.persistence=false + +``` + +### Affinity + +By default, `anti-affinity` is enabled to ensure pods of the same component can run on different nodes. + +```yaml + +affinity: + anti_affinity: true + +``` + +To use the Pulsar Helm chart in a development environment (such as Minikube), you can disable `anti-affinity` by including this option in your `helm install` command. + +```bash + +--set affinity.anti_affinity=false + +``` + +### Components + +The Pulsar Helm chart is designed for production usage. It deploys a production-ready Pulsar cluster, including Pulsar core components and monitoring components. + +You can customize the components to be deployed by turning on/off individual components. + +```yaml + +## Components +## +## Control what components of Apache Pulsar to deploy for the cluster +components: + # zookeeper + zookeeper: true + # bookkeeper + bookkeeper: true + # bookkeeper - autorecovery + autorecovery: true + # broker + broker: true + # functions + functions: true + # proxy + proxy: true + # toolset + toolset: true + # pulsar manager + pulsar_manager: true + +## Monitoring Components +## +## Control what components of the monitoring stack to deploy for the cluster +monitoring: + # monitoring - prometheus + prometheus: true + # monitoring - grafana + grafana: true + +``` + +### Docker images + +The Pulsar Helm chart is designed to enable controlled upgrades. 
So it can configure independent image versions for components. You can customize the images by setting individual component. + +```yaml + +## Images +## +## Control what images to use for each component +images: + zookeeper: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + bookie: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + autorecovery: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + broker: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + proxy: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + pullPolicy: IfNotPresent + functions: + repository: apachepulsar/pulsar-all + tag: 2.5.0 + prometheus: + repository: prom/prometheus + tag: v1.6.3 + pullPolicy: IfNotPresent + grafana: + repository: streamnative/apache-pulsar-grafana-dashboard-k8s + tag: 0.0.4 + pullPolicy: IfNotPresent + pulsar_manager: + repository: apachepulsar/pulsar-manager + tag: v0.1.0 + pullPolicy: IfNotPresent + hasCommand: false + +``` + +### TLS + +The Pulsar Helm chart can be configured to enable TLS (Transport Layer Security) to protect all the traffic between components. Before enabling TLS, you have to provision TLS certificates for the required components. + +#### Provision TLS certificates using cert-manager + +To use the `cert-manager` to provision the TLS certificates, you have to install the [cert-manager](#install-cert-manager) before installing the Pulsar Helm chart. After successfully installing the cert-manager, you can set `certs.internal_issuer.enabled` to `true`. Therefore, the Pulsar Helm chart can use the `cert-manager` to generate `selfsigning` TLS certificates for the configured components. + +```yaml + +certs: + internal_issuer: + enabled: false + component: internal-cert-issuer + type: selfsigning + +``` + +You can also customize the generated TLS certificates by configuring the fields as the following. 
+ +```yaml + +tls: + # common settings for generating certs + common: + # 90d + duration: 2160h + # 15d + renewBefore: 360h + organization: + - pulsar + keySize: 4096 + keyAlgorithm: rsa + keyEncoding: pkcs8 + +``` + +#### Enable TLS + +After installing the `cert-manager`, you can set `tls.enabled` to `true` to enable TLS encryption for the entire cluster. + +```yaml + +tls: + enabled: false + +``` + +You can also configure whether to enable TLS encryption for individual component. + +```yaml + +tls: + # settings for generating certs for proxy + proxy: + enabled: false + cert_name: tls-proxy + # settings for generating certs for broker + broker: + enabled: false + cert_name: tls-broker + # settings for generating certs for bookies + bookie: + enabled: false + cert_name: tls-bookie + # settings for generating certs for zookeeper + zookeeper: + enabled: false + cert_name: tls-zookeeper + # settings for generating certs for recovery + autorecovery: + cert_name: tls-recovery + # settings for generating certs for toolset + toolset: + cert_name: tls-toolset + +``` + +### Authentication + +By default, authentication is disabled. You can set `auth.authentication.enabled` to `true` to enable authentication. +Currently, the Pulsar Helm chart only supports JWT authentication provider. You can set `auth.authentication.provider` to `jwt` to use the JWT authentication provider. + +```yaml + +# Enable or disable broker authentication and authorization. +auth: + authentication: + enabled: false + provider: "jwt" + jwt: + # Enable JWT authentication + # If the token is generated by a secret key, set the usingSecretKey as true. + # If the token is generated by a private key, set the usingSecretKey as false. 
+ usingSecretKey: false + superUsers: + # broker to broker communication + broker: "broker-admin" + # proxy to broker communication + proxy: "proxy-admin" + # pulsar-admin client to broker/proxy communication + client: "admin" + +``` + +To enable authentication, you can run [prepare helm release](#prepare-the-helm-release) to generate token secret keys and tokens for three super users specified in the `auth.superUsers` field. The generated token keys and super user tokens are uploaded and stored as Kubernetes secrets prefixed with `-token-`. You can use the following command to find those secrets. + +```bash + +kubectl get secrets -n + +``` + +### Authorization + +By default, authorization is disabled. Authorization can be enabled only when authentication is enabled. + +```yaml + +auth: + authorization: + enabled: false + +``` + +To enable authorization, you can include this option in the `helm install` command. + +```bash + +--set auth.authorization.enabled=true + +``` + +### CPU and RAM resource requirements + +By default, the resource requests and the number of replicas for the Pulsar components in the Pulsar Helm chart are adequate for a small production deployment. If you deploy a non-production instance, you can reduce the defaults to fit into a smaller cluster. + +Once you have all of your configuration options collected, you can install dependent charts before installing the Pulsar Helm chart. + +## Install dependent charts + +### Install local storage provisioner + +To use local persistent volumes as the persistent storage, you need to install a storage provisioner for [local persistent volumes](https://kubernetes.io/blog/2019/04/04/kubernetes-1.14-local-persistent-volumes-ga/). + +One of the easiest way to get started is to use the local storage provisioner provided along with the Pulsar Helm chart. 
+ +``` + +helm repo add streamnative https://charts.streamnative.io +helm repo update +helm install pulsar-storage-provisioner streamnative/local-storage-provisioner + +``` + +### Install cert-manager + +The Pulsar Helm chart uses the [cert-manager](https://github.com/jetstack/cert-manager) to provision and manage TLS certificates automatically. To enable TLS encryption for brokers or proxies, you need to install the cert-manager in advance. + +For details about how to install the cert-manager, follow the [official instructions](https://cert-manager.io/docs/installation/kubernetes/#installing-with-helm). + +Alternatively, we provide a bash script [install-cert-manager.sh](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/cert-manager/install-cert-manager.sh) to install a cert-manager release to the namespace `cert-manager`. + +```bash + +git clone https://github.com/apache/pulsar-helm-chart +cd pulsar-helm-chart +./scripts/cert-manager/install-cert-manager.sh + +``` + +## Prepare Helm release + +Once you have installed all the dependent charts and collected all of your configuration options, you can run [prepare_helm_release.sh](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/prepare_helm_release.sh) to prepare the Helm release. + +```bash + +git clone https://github.com/apache/pulsar-helm-chart +cd pulsar-helm-chart +./scripts/pulsar/prepare_helm_release.sh -n -k + +``` + +The `prepare_helm_release` creates the following resources: + +- A Kubernetes namespace for installing the Pulsar release +- JWT secret keys and tokens for three super users: `broker-admin`, `proxy-admin`, and `admin`. By default, it generates an asymmetric public/private key pair. You can choose to generate a symmetric secret key by specifying `--symmetric`. + - `proxy-admin` role is used for proxies to communicate with brokers. + - `broker-admin` role is used for inter-broker communications. + - `admin` role is used by the admin tools.
+ +## Deploy Pulsar cluster using Helm + +Once you have finished the following three things, you can install a Helm release. + +- Collect all of your configuration options. +- Install dependent charts. +- Prepare the Helm release. + +In this example, the Helm release is named `pulsar`. + +```bash + +helm repo add apache https://pulsar.apache.org/charts +helm repo update +helm install pulsar apache/pulsar \ + --timeout 10m \ + --set initialize=true \ + --set [your configuration options] + +``` + +:::note + +For the first deployment, add the `--set initialize=true` option to initialize bookie and Pulsar cluster metadata. + +::: + +You can also use the `--version ` option if you want to install a specific version of the Pulsar Helm chart. + +## Monitor deployment + +A list of installed resources is output once the Pulsar cluster is deployed. This may take 5-10 minutes. + +The status of the deployment can be checked by running the `helm status pulsar` command, which can also be done while the deployment is taking place if you run the command in another terminal. + +## Access Pulsar cluster + +The default values will create a `ClusterIP` for the following resources, which you can use to interact with the cluster. + +- Proxy: You can use the IP address to produce and consume messages to the installed Pulsar cluster. +- Pulsar Manager: You can access the Pulsar Manager UI at `http://:9527`. +- Grafana Dashboard: You can access the Grafana dashboard at `http://:3000`.
+ +To find the IP addresses of those components, run the following command: + +```bash + +kubectl get service -n + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/helm-install.md b/site2/website/versioned_docs/version-2.9.x/helm-install.md new file mode 100644 index 0000000000000..9f81f52e0dab1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/helm-install.md @@ -0,0 +1,38 @@ +--- +id: helm-install +title: Install Apache Pulsar using Helm +sidebar_label: "Install" +original_id: helm-install +--- + +Install Apache Pulsar on Kubernetes with the official Pulsar Helm chart. + +## Requirements + +To deploy Apache Pulsar on Kubernetes, the following are required. + +- kubectl 1.14 or higher, compatible with your cluster ([+/- 1 minor release from your cluster](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin)) +- Helm v3 (3.0.2 or higher) +- A Kubernetes cluster, version 1.14 or higher + +## Environment setup + +Before deploying Pulsar, you need to prepare your environment. + +### Tools + +Install [`helm`](helm-tools.md) and [`kubectl`](helm-tools.md) on your computer. + +## Cloud cluster preparation + +To create and connect to the Kubernetes cluster, follow the instructions: + +- [Google Kubernetes Engine](helm-prepare.md#google-kubernetes-engine) + +## Pulsar deployment + +Once the environment is set up and configuration is generated, you can now proceed to the [deployment of Pulsar](helm-deploy.md). + +## Pulsar upgrade + +To upgrade an existing Kubernetes installation, follow the [upgrade documentation](helm-upgrade.md).
diff --git a/site2/website/versioned_docs/version-2.9.x/helm-overview.md b/site2/website/versioned_docs/version-2.9.x/helm-overview.md new file mode 100644 index 0000000000000..125f595cbe68a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/helm-overview.md @@ -0,0 +1,103 @@ +--- +id: helm-overview +title: Apache Pulsar Helm Chart +sidebar_label: "Overview" +original_id: helm-overview +--- + +[Helm chart](https://github.com/apache/pulsar-helm-chart) supports you to install Apache Pulsar in a cloud-native environment. + +## Introduction + +The Apache Pulsar Helm chart provides one of the most convenient ways to operate Pulsar on Kubernetes. With all the required components, Helm chart is scalable and thus being suitable for large-scale deployments. + +The Apache Pulsar Helm chart contains all components to support the features and functions that Pulsar delivers. You can install and configure these components separately. + +- Pulsar core components: + - ZooKeeper + - Bookies + - Brokers + - Function workers + - Proxies +- Control center: + - Pulsar Manager + - Prometheus + - Grafana + +Moreover, Helm chart supports: + +- Security + - Automatically provisioned TLS certificates, using [Jetstack](https://www.jetstack.io/)'s [cert-manager](https://cert-manager.io/docs/) + - self-signed + - [Let's Encrypt](https://letsencrypt.org/) + - TLS Encryption + - Proxy + - Broker + - Toolset + - Bookie + - ZooKeeper + - Authentication + - JWT + - Authorization +- Storage + - Non-persistence storage + - Persistent volume + - Local persistent volumes +- Functions + - Kubernetes Runtime + - Process Runtime + - Thread Runtime +- Operations + - Independent image versions for all components, enabling controlled upgrades + +## Quick start + +To run with Apache Pulsar Helm chart as fast as possible in a **non-production** use case, we provide a [quick start guide](getting-started-helm.md) for Proof of Concept (PoC) deployments. 
+ +This guide walks you through deploying Apache Pulsar Helm chart with default values and features, but it is *not* suitable for deployments in production-ready environments. To deploy the charts in production under sustained load, you can follow the complete [Installation Guide](helm-install.md). + +## Troubleshooting + +Although we have done our best to make these charts as seamless as possible, troubles do go out of our control occasionally. We have been collecting tips and tricks for troubleshooting common issues. Please check it first before raising an [issue](https://github.com/apache/pulsar/issues/new/choose), and feel free to add your solutions by creating a [Pull Request](https://github.com/apache/pulsar/compare). + +## Installation + +The Apache Pulsar Helm chart contains all required dependencies. + +If you deploy a PoC for testing, we strongly suggest you follow this [Quick Start Guide](getting-started-helm.md) for your first iteration. + +1. [Preparation](helm-prepare.md) +2. [Deployment](helm-deploy.md) + +## Upgrading + +Once the Apache Pulsar Helm chart is installed, you can use `helm upgrade` command to configure and update it. + +```bash + +helm repo add apache https://pulsar.apache.org/charts +helm repo update +helm get values > pulsar.yaml +helm upgrade apache/pulsar -f pulsar.yaml + +``` + +For more detailed information, see [Upgrading](helm-upgrade.md). + +## Uninstallation + +To uninstall the Apache Pulsar Helm chart, run the following command: + +```bash + +helm delete + +``` + +For the purposes of continuity, some Kubernetes objects in these charts cannot be removed by `helm delete` command. It is recommended to *consciously* remove these items, as they affect re-deployment. + +* PVCs for stateful data: remove these items. + - ZooKeeper: This is your metadata. + - BookKeeper: This is your data. + - Prometheus: This is your metrics data, which can be safely removed. 
+* Secrets: if the secrets are generated by the [prepare release script](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/prepare_helm_release.sh), they contain secret keys and tokens. You can use the [cleanup release script](https://github.com/apache/pulsar-helm-chart/blob/master/scripts/pulsar/cleanup_helm_release.sh) to remove these secrets and tokens as needed. diff --git a/site2/website/versioned_docs/version-2.9.x/helm-prepare.md b/site2/website/versioned_docs/version-2.9.x/helm-prepare.md new file mode 100644 index 0000000000000..e5d56c7e95e34 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/helm-prepare.md @@ -0,0 +1,80 @@ +--- +id: helm-prepare +title: Prepare Kubernetes resources +sidebar_label: "Prepare" +original_id: helm-prepare +--- + +For a fully functional Pulsar cluster, you need a few resources before deploying the Apache Pulsar Helm chart. The following provides instructions to prepare the Kubernetes cluster before deploying the Pulsar Helm chart. + +- [Google Kubernetes Engine](#google-kubernetes-engine) + - [Manual cluster creation](#manual-cluster-creation) + - [Scripted cluster creation](#scripted-cluster-creation) + - [Create cluster with local SSDs](#create-cluster-with-local-ssds) + +## Google Kubernetes Engine + +To get started easier, a script is provided to create the cluster automatically. Alternatively, a cluster can be created manually as well. + +### Manual cluster creation + +To provision a Kubernetes cluster manually, follow the [GKE instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-cluster). + +### Scripted cluster creation + +A [bootstrap script](https://github.com/streamnative/charts/tree/master/scripts/pulsar/gke_bootstrap_script.sh) has been created to automate much of the setup process for users on GCP/GKE. + +The script can: + +1. Create a new GKE cluster. +2. Allow the cluster to modify DNS (Domain Name Server) records. +3. 
Set up `kubectl`, and connect it to the cluster. + +Google Cloud SDK is a dependency of this script, so ensure it is [set up correctly](helm-tools.md#connect-to-a-gke-cluster) for the script to work. + +The script reads various parameters from environment variables and an argument `up` or `down` for bootstrap and clean-up respectively. + +The following table describes all variables. + +| **Variable** | **Description** | **Default value** | +| ------------ | --------------- | ----------------- | +| PROJECT | ID of your GCP project | No default value. It must be set. | +| CLUSTER_NAME | Name of the GKE cluster | `pulsar-dev` | +| CONFDIR | Configuration directory to store Kubernetes configuration | ${HOME}/.config/streamnative | +| INT_NETWORK | IP space to use within this cluster | `default` | +| LOCAL_SSD_COUNT | Number of local SSDs | 4 | +| MACHINE_TYPE | Type of machine to use for nodes | `n1-standard-4` | +| NUM_NODES | Number of nodes to be created in each of the cluster's zones | 4 | +| PREEMPTIBLE | Create nodes using preemptible VM instances in the new cluster. | false | +| REGION | Compute region for the cluster | `us-east1` | +| USE_LOCAL_SSD | Flag to create a cluster with local SSDs | false | +| ZONE | Compute zone for the cluster | `us-east1-b` | +| ZONE_EXTENSION | The extension (`a`, `b`, `c`) of the zone name of the cluster | `b` | +| EXTRA_CREATE_ARGS | Extra arguments passed to create command | | + +Run the script by passing in your desired parameters. It can work with the default parameters except for `PROJECT`, which is required: + +```bash + +PROJECT= scripts/pulsar/gke_bootstrap_script.sh up + +``` + +The script can also be used to clean up the created GKE resources. + +```bash + +PROJECT= scripts/pulsar/gke_bootstrap_script.sh down + +``` + +#### Create cluster with local SSDs + +To install the Pulsar Helm chart using local persistent volumes, you need to create a GKE cluster with local SSDs.
You can do so by specifying `USE_LOCAL_SSD` to be `true` in the following command to create a Pulsar cluster with local SSDs. + +``` + +PROJECT= USE_LOCAL_SSD=true LOCAL_SSD_COUNT= scripts/pulsar/gke_bootstrap_script.sh up + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/helm-tools.md b/site2/website/versioned_docs/version-2.9.x/helm-tools.md new file mode 100644 index 0000000000000..6ba89006913b6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/helm-tools.md @@ -0,0 +1,43 @@ +--- +id: helm-tools +title: Required tools for deploying Pulsar Helm Chart +sidebar_label: "Required Tools" +original_id: helm-tools +--- + +Before deploying Pulsar to your Kubernetes cluster, there are some tools you must have installed locally. + +## kubectl + +kubectl is the tool that talks to the Kubernetes API. kubectl 1.14 or higher is required and it needs to be compatible with your cluster ([+/- 1 minor release from your cluster](https://kubernetes.io/docs/tasks/tools/install-kubectl/#before-you-begin)). + +To Install kubectl locally, follow the [Kubernetes documentation](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl). + +The server version of kubectl cannot be obtained until we connect to a cluster. + +## Helm + +Helm is the package manager for Kubernetes. The Apache Pulsar Helm Chart is tested and supported with Helm v3. + +### Get Helm + +You can get Helm from the project's [releases page](https://github.com/helm/helm/releases), or follow other options under the official documentation of [installing Helm](https://helm.sh/docs/intro/install/). + +### Next steps + +Once kubectl and Helm are configured, you can configure your [Kubernetes cluster](helm-prepare.md). + +## Additional information + +### Templates + +Templating in Helm is done through Golang's [text/template](https://golang.org/pkg/text/template/) and [sprig](https://godoc.org/github.com/Masterminds/sprig). 
+ +For more information about how all the inner workings behave, check these documents: + +- [Functions and Pipelines](https://helm.sh/docs/chart_template_guide/functions_and_pipelines/) +- [Subcharts and Globals](https://helm.sh/docs/chart_template_guide/subcharts_and_globals/) + +### Tips and tricks + +For additional information on developing with Helm, check [tips and tricks section](https://helm.sh/docs/howto/charts_tips_and_tricks/) in the Helm repository. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/helm-upgrade.md b/site2/website/versioned_docs/version-2.9.x/helm-upgrade.md new file mode 100644 index 0000000000000..7d671e6bfb3c1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/helm-upgrade.md @@ -0,0 +1,43 @@ +--- +id: helm-upgrade +title: Upgrade Pulsar Helm release +sidebar_label: "Upgrade" +original_id: helm-upgrade +--- + +Before upgrading your Pulsar installation, you need to check the change log corresponding to the specific release you want to upgrade to and look for any release notes that might pertain to the new Pulsar helm chart version. + +We also recommend that you need to provide all values using the `helm upgrade --set key=value` syntax or the `-f values.yml` instead of using `--reuse-values`, because some of the current values might be deprecated. + +:::note + +You can retrieve your previous `--set` arguments cleanly, with `helm get values `. If you direct this into a file (`helm get values > pulsar.yml`), you can safely pass this file through `-f`, namely `helm upgrade apache/pulsar -f pulsar.yaml`. This safely replaces the behavior of `--reuse-values`. + +::: + +## Steps + +To upgrade Apache Pulsar to a newer version, follow these steps: + +1. Check the change log for the specific version you would like to upgrade to. +2. Go through [deployment documentation](helm-deploy.md) step by step. +3. Extract your previous `--set` arguments with the following command. 
+ + ```bash + + helm get values > pulsar.yaml + + ``` + +4. Decide all the values you need to set. +5. Perform the upgrade, with all `--set` arguments extracted in step 4. + + ```bash + + helm upgrade apache/pulsar \ + --version \ + -f pulsar.yaml \ + --set ... + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-aerospike-sink.md b/site2/website/versioned_docs/version-2.9.x/io-aerospike-sink.md new file mode 100644 index 0000000000000..63d7338a3ba91 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-aerospike-sink.md @@ -0,0 +1,26 @@ +--- +id: io-aerospike-sink +title: Aerospike sink connector +sidebar_label: "Aerospike sink connector" +original_id: io-aerospike-sink +--- + +The Aerospike sink connector pulls messages from Pulsar topics to Aerospike clusters. + +## Configuration + +The configuration of the Aerospike sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `seedHosts` |String| true | No default value| The comma-separated list of one or more Aerospike cluster hosts.

    Each host can be specified as a valid IP address or hostname followed by an optional port number. | +| `keyspace` | String| true |No default value |The Aerospike namespace. | +| `columnName` | String | true| No default value|The Aerospike column name. | +|`userName`|String|false|NULL|The Aerospike username.| +|`password`|String|false|NULL|The Aerospike password.| +| `keySet` | String|false |NULL | The Aerospike set name. | +| `maxConcurrentRequests` |int| false | 100 | The maximum number of concurrent Aerospike transactions that a sink can open. | +| `timeoutMs` | int|false | 100 | This property controls `socketTimeout` and `totalTimeout` for Aerospike transactions. | +| `retries` | int|false | 1 |The maximum number of retries before aborting a write transaction to Aerospike. | diff --git a/site2/website/versioned_docs/version-2.9.x/io-canal-source.md b/site2/website/versioned_docs/version-2.9.x/io-canal-source.md new file mode 100644 index 0000000000000..d1fd43bb0f74e --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-canal-source.md @@ -0,0 +1,235 @@ +--- +id: io-canal-source +title: Canal source connector +sidebar_label: "Canal source connector" +original_id: io-canal-source +--- + +The Canal source connector pulls messages from MySQL to Pulsar topics. + +## Configuration + +The configuration of Canal source connector has the following properties. + +### Property + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `username` | true | None | Canal server account (not MySQL).| +| `password` | true | None | Canal server password (not MySQL). | +|`destination`|true|None|Source destination that Canal source connector connects to. +| `singleHostname` | false | None | Canal server address.| +| `singlePort` | false | None | Canal server port.| +| `cluster` | true | false | Whether to enable cluster mode based on Canal server configuration or not.

  • true: **cluster** mode.
    If set to true, it talks to `zkServers` to figure out the actual database host.

  • false: **standalone** mode.
    If set to false, it connects to the database specified by `singleHostname` and `singlePort`.
  • | +| `zkServers` | true | None | Address and port of the Zookeeper that Canal source connector talks to figure out the actual database host.| +| `batchSize` | false | 1000 | Batch size to fetch from Canal. | + +### Example + +Before using the Canal connector, you can create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "zkServers": "127.0.0.1:2181", + "batchSize": "5120", + "destination": "example", + "username": "", + "password": "", + "cluster": false, + "singleHostname": "127.0.0.1", + "singlePort": "11111", + } + + ``` + +* YAML + + You can create a YAML file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/resources/canal-mysql-source-config.yaml) below to your YAML file. + + ```yaml + + configs: + zkServers: "127.0.0.1:2181" + batchSize: 5120 + destination: "example" + username: "" + password: "" + cluster: false + singleHostname: "127.0.0.1" + singlePort: 11111 + + ``` + +## Usage + +Here is an example of storing MySQL data using the configuration file as above. + +1. Start a MySQL server. + + ```bash + + $ docker pull mysql:5.7 + $ docker run -d -it --rm --name pulsar-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=canal -e MYSQL_USER=mysqluser -e MYSQL_PASSWORD=mysqlpw mysql:5.7 + + ``` + +2. Create a configuration file `mysqld.cnf`. + + ```bash + + [mysqld] + pid-file = /var/run/mysqld/mysqld.pid + socket = /var/run/mysqld/mysqld.sock + datadir = /var/lib/mysql + #log-error = /var/log/mysql/error.log + # By default we only accept connections from localhost + #bind-address = 127.0.0.1 + # Disabling symbolic-links is recommended to prevent assorted security risks + symbolic-links=0 + log-bin=mysql-bin + binlog-format=ROW + server_id=1 + + ``` + +3. Copy the configuration file `mysqld.cnf` to MySQL server. + + ```bash + + $ docker cp mysqld.cnf pulsar-mysql:/etc/mysql/mysql.conf.d/ + + ``` + +4. Restart the MySQL server. 
+ + ```bash + + $ docker restart pulsar-mysql + + ``` + +5. Create a test database in MySQL server. + + ```bash + + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal -e 'create database test;' + + ``` + +6. Start a Canal server and connect to MySQL server. + + ``` + + $ docker pull canal/canal-server:v1.1.2 + $ docker run -d -it --link pulsar-mysql -e canal.auto.scan=false -e canal.destinations=test -e canal.instance.master.address=pulsar-mysql:3306 -e canal.instance.dbUsername=root -e canal.instance.dbPassword=canal -e canal.instance.connectionCharset=UTF-8 -e canal.instance.tsdb.enable=true -e canal.instance.gtidon=false --name=pulsar-canal-server -p 8000:8000 -p 2222:2222 -p 11111:11111 -p 11112:11112 -m 4096m canal/canal-server:v1.1.2 + + ``` + +7. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:2.3.0 + $ docker run -d -it --link pulsar-canal-server -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:2.3.0 bin/pulsar standalone + + ``` + +8. Modify the configuration file `canal-mysql-source-config.yaml`. + + ```yaml + + configs: + zkServers: "" + batchSize: "5120" + destination: "test" + username: "" + password: "" + cluster: false + singleHostname: "pulsar-canal-server" + singlePort: "11111" + + ``` + +9. Create a consumer file `pulsar-client.py`. + + ```python + + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', + subscription_name='my-sub') + + while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + + ``` + +10. Copy the configuration file `canal-mysql-source-config.yaml` and the consumer file `pulsar-client.py` to Pulsar server. + + ```bash + + $ docker cp canal-mysql-source-config.yaml pulsar-standalone:/pulsar/conf/ + $ docker cp pulsar-client.py pulsar-standalone:/pulsar/ + + ``` + +11. 
Download a Canal connector and start it. + + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + $ wget https://archive.apache.org/dist/pulsar/pulsar-2.3.0/connectors/pulsar-io-canal-2.3.0.nar -P connectors + $ ./bin/pulsar-admin source localrun \ + --archive ./connectors/pulsar-io-canal-2.3.0.nar \ + --classname org.apache.pulsar.io.canal.CanalStringSource \ + --tenant public \ + --namespace default \ + --name canal \ + --destination-topic-name my-topic \ + --source-config-file /pulsar/conf/canal-mysql-source-config.yaml \ + --parallelism 1 + + ``` + +12. Consume data from MySQL. + + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + $ python pulsar-client.py + + ``` + +13. Open another window to log in MySQL server. + + ```bash + + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal + + ``` + +14. Create a table, and insert, delete, and update data in MySQL server. + + ```bash + + mysql> use test; + mysql> show tables; + mysql> CREATE TABLE IF NOT EXISTS `test_table`(`test_id` INT UNSIGNED AUTO_INCREMENT,`test_title` VARCHAR(100) NOT NULL, + `test_author` VARCHAR(40) NOT NULL, + `test_date` DATE,PRIMARY KEY ( `test_id` ))ENGINE=InnoDB DEFAULT CHARSET=utf8; + mysql> INSERT INTO test_table (test_title, test_author, test_date) VALUES("a", "b", NOW()); + mysql> UPDATE test_table SET test_title='c' WHERE test_title='a'; + mysql> DELETE FROM test_table WHERE test_title='c'; + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-cassandra-sink.md b/site2/website/versioned_docs/version-2.9.x/io-cassandra-sink.md new file mode 100644 index 0000000000000..b27a754f49e18 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-cassandra-sink.md @@ -0,0 +1,57 @@ +--- +id: io-cassandra-sink +title: Cassandra sink connector +sidebar_label: "Cassandra sink connector" +original_id: io-cassandra-sink +--- + +The Cassandra sink connector pulls messages from Pulsar topics to Cassandra clusters. 
+ +## Configuration + +The configuration of the Cassandra sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `roots` | String|true | " " (empty string) | A comma-separated list of Cassandra hosts to connect to.| +| `keyspace` | String|true| " " (empty string)| The key space used for writing pulsar messages.

    **Note: `keyspace` should be created prior to a Cassandra sink.**| +| `keyname` | String|true| " " (empty string)| The key name of the Cassandra column family.

    The column is used for storing Pulsar message keys.

    If a Pulsar message doesn't have any key associated, the message value is used as the key. | +| `columnFamily` | String|true| " " (empty string)| The Cassandra column family name.

    **Note: `columnFamily` should be created prior to a Cassandra sink.**| +| `columnName` | String|true| " " (empty string) | The column name of the Cassandra column family.

    The column is used for storing Pulsar message values. | + +### Example + +Before using the Cassandra sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + + ``` + +* YAML + + ``` + + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + + ``` + +## Usage + +For more information about **how to connect Pulsar with Cassandra**, see [here](io-quickstart.md#connect-pulsar-to-apache-cassandra). diff --git a/site2/website/versioned_docs/version-2.9.x/io-cdc-debezium.md b/site2/website/versioned_docs/version-2.9.x/io-cdc-debezium.md new file mode 100644 index 0000000000000..293ccf2b35e8a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-cdc-debezium.md @@ -0,0 +1,543 @@ +--- +id: io-cdc-debezium +title: Debezium source connector +sidebar_label: "Debezium source connector" +original_id: io-cdc-debezium +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL +and persists the messages to Pulsar topics. + +## Configuration + +The configuration of Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. 
| +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. | +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server that are monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for history topic. | +| `pulsar.service.url` | true | null | Pulsar cluster service URL for the offset topic used in Debezium. You can use the `bin/pulsar-admin --admin-url http://pulsar:8080 sources localrun --source-config-file configs/pg-pulsar-config.yaml` command to point to the target Pulsar cluster.| +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | +| `mongodb.hosts` | true | null | The comma-separated list of hostname and port pairs (in the form 'host' or 'host:port') of the MongoDB servers in the replica set. The list can contain a single hostname and port pair. If mongodb.members.auto.discover is set to false, the host and port pair are prefixed with the replica set name (e.g., rs0/localhost:27017). | +| `mongodb.name` | true | null | A unique name that identifies the connector and/or MongoDB replica set or sharded cluster that this connector monitors. Each server should be monitored by at most one Debezium connector, since this server name prefixes all persisted Kafka topics emanating from the MongoDB replica set or cluster. | +| `mongodb.user` | true | null | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.password` | true | null | Password to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.task.id` | true | null | The taskId of the MongoDB connector that attempts to use a separate task for each replica set. | + + + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file.
+ +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "offset.storage.topic": "offset-topic" + } + + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: 
"offset-topic" + + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar Debezium connector. + +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. 
+ + ```bash + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MySQL client in docker. + + ```bash + + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. + +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "postgres", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "schema.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. 
+ + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for pg, docker image: debezium/example-postgress:0.8 + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "postgres" + database.dbname: "postgres" + database.server.name: "dbserver1" + schema.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker pull debezium/example-postgres:0.8 + $ docker run -d -it --rm --name pulsar-postgresql -p 5432:5432 debezium/example-postgres:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "postgres","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + * Use the **YAML** configuration file as shown previously. 
+ + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + + $ docker exec -it pulsar-postgresql /bin/bash + + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + psql -U postgres postgres + postgres=# \c postgres; + You are now connected to database "postgres" as user "postgres". + postgres=# SET search_path TO inventory; + SET + postgres=# select * from products; + id | name | description | weight + -----+--------------------+---------------------------------------------------------+-------- + 102 | car battery | 12V car battery | 8.1 + 103 | 12-pack drill bits | 12-pack of drill bits with sizes ranging from #40 to #3 | 0.8 + 104 | hammer | 12oz carpenter's hammer | 0.75 + 105 | hammer | 14oz carpenter's hammer | 0.875 + 106 | hammer | 16oz carpenter's hammer | 1 + 107 | rocks | box of assorted rocks | 5.3 + 108 | jacket | water resistent black wind breaker | 0.1 + 109 | spare tire | 24 inch spare tire | 22.2 + 101 | 1111111111 | Small 2-wheel scooter | 3.14 + (9 rows) + + postgres=# UPDATE products SET name='1111111111' WHERE id=107; + UPDATE 1 + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. 
+ + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":107}}�{"schema":{"type":"struct","fields":[{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"before"},{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"after"},{"type":"struct","fields":[{"type":"string","optional":true,"field":"version"},{"type":"string","optional":true,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":false,"field":"db"},{"type":"int64","optional":true,"field":"ts_usec"},{"type":"int64","optional":true,"field":"txId"},{"type":"int64","optional":true,"field":"lsn"},{"type":"string","optional":true,"field":"schema"},{"type":"string","optional":true,"field":"table"},{"type":"boolean","optional":true,"default":false,"field":"snapshot"},{"type":"boolean","optional":true,"field":"last_snapshot_record"}],"optional":false,"name":"io.debezium.connector.postgresql.Source","field":"source"},{"type":"string","optional":false,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"before":{"id":107,"name":"rocks","description":"box of assorted rocks","weight":5.3},"after":{"id":107,"name":"1111111111","description":"box of assorted 
rocks","weight":5.3},"source":{"version":"0.9.2.Final","connector":"postgresql","name":"dbserver1","db":"postgres","ts_usec":1559208957661080,"txId":577,"lsn":23862872,"schema":"inventory","table":"products","snapshot":false,"last_snapshot_record":null},"op":"u","ts_ms":1559208957692}} + + ``` + +## Example of MongoDB + +You need to create a configuration file before using the Pulsar Debezium connector. + +* JSON + + ```json + + { + "mongodb.hosts": "rs0/mongodb:27017", + "mongodb.name": "dbserver1", + "mongodb.user": "debezium", + "mongodb.password": "dbz", + "mongodb.task.id": "1", + "database.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-mongodb-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/resources/debezium-mongodb-source-config.yaml) below to the `debezium-mongodb-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mongodb-source" + topicName: "debezium-mongodb-topic" + archive: "connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mongodb, docker image: debezium/example-mongodb:0.10 + mongodb.hosts: "rs0/mongodb:27017" + mongodb.name: "dbserver1" + mongodb.user: "debezium" + mongodb.password: "dbz" + mongodb.task.id: "1" + database.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a MongoDB table using the Pulsar Debezium connector. + + +1. Start a MongoDB server with a database from which Debezium can capture changes.
+ + ```bash + + $ docker pull debezium/example-mongodb:0.10 + $ docker run -d -it --rm --name pulsar-mongodb -e MONGODB_USER=mongodb -e MONGODB_PASSWORD=mongodb -p 27017:27017 debezium/example-mongodb:0.10 + + ``` + + Use the following commands to initialize the data. + + ``` bash + + ./usr/local/bin/init-inventory.sh + + ``` + + If the local host cannot access the container network, you can update the file ```/etc/hosts``` and add a rule ```127.0.0.1 6 f114527a95f```, where f114527a95f is the container ID. You can get the container ID by running ```docker ps -a```. + + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar \ + --name debezium-mongodb-source \ + --destination-topic-name debezium-mongodb-topic \ + --tenant public \ + --namespace default \ + --source-config '{"mongodb.hosts": "rs0/mongodb:27017","mongodb.name": "dbserver1","mongodb.user": "debezium","mongodb.password": "dbz","mongodb.task.id": "1","database.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mongodb-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MongoDB client in docker. + + ```bash + + $ docker exec -it pulsar-mongodb /bin/bash + + ``` + +6. A MongoDB client pops out.
+ + ```bash + + mongo -u debezium -p dbz --authenticationDatabase admin localhost:27017/inventory + db.products.update({"_id":NumberLong(104)},{$set:{weight:1.25}}) + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. + + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":"104"}}, value = {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"int64","optional":true,"field":"h"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"after":"{\"_id\": {\"$numberLong\": \"104\"},\"name\": \"hammer\",\"description\": \"12oz carpenter's hammer\",\"weight\": 1.25,\"quantity\": 4}","patch":null,"source":{"version":"0.10.0.Final","connector":"mongodb","name":"dbserver1","ts_ms":1573541905000,"snapshot":"true","db":"inventory","rs":"rs0","collection":"products","ord":1,"h":4983083486544392763},"op":"r","ts_ms":1573541909761}}. 
+ + ``` + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt + +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$Lambda$203/385424085.accept(Unknown Source) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at 
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) + +``` + +If you encounter the above problems in synchronizing data, please refer to [this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + +```$xslt + +max.queue.size= + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-cdc.md b/site2/website/versioned_docs/version-2.9.x/io-cdc.md new file mode 100644 index 0000000000000..e6e662884826d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-cdc.md @@ -0,0 +1,26 @@ +--- +id: io-cdc +title: CDC connector +sidebar_label: "CDC connector" +original_id: io-cdc +--- + +CDC source connectors capture log changes of databases (such as MySQL, MongoDB, and PostgreSQL) into Pulsar. + +> CDC source connectors are built on top of [Canal](https://github.com/alibaba/canal) and [Debezium](https://debezium.io/) and store all data into Pulsar cluster in a persistent, replicated, and partitioned way. + +Currently, Pulsar has the following CDC connectors. 
+ +Name|Java Class +|---|--- +[Canal source connector](io-canal-source.md)|[org.apache.pulsar.io.canal.CanalStringSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) +[Debezium source connector](io-cdc-debezium.md)|
  • [org.apache.pulsar.io.debezium.DebeziumSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/core/src/main/java/org/apache/pulsar/io/debezium/DebeziumSource.java)
  • [org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java)
  • [org.apache.pulsar.io.debezium.postgres.DebeziumPostgresSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java)
  • + +For more information about Canal and Debezium, see the information below. + +Subject | Reference +|---|--- +How to use Canal source connector with MySQL|[Canal guide](https://github.com/alibaba/canal/wiki) +How does Canal work | [Canal tutorial](https://github.com/alibaba/canal/wiki) +How to use Debezium source connector with MySQL | [Debezium guide](https://debezium.io/docs/connectors/mysql/) +How does Debezium work | [Debezium tutorial](https://debezium.io/docs/tutorial/) diff --git a/site2/website/versioned_docs/version-2.9.x/io-cli.md b/site2/website/versioned_docs/version-2.9.x/io-cli.md new file mode 100644 index 0000000000000..3d54bb61875e2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-cli.md @@ -0,0 +1,658 @@ +--- +id: io-cli +title: Connector Admin CLI +sidebar_label: "CLI" +original_id: io-cli +--- + +The `pulsar-admin` tool helps you manage Pulsar connectors. + +## `sources` + +An interface for managing Pulsar IO sources (ingress data into Pulsar). + +```bash + +$ pulsar-admin sources subcommands + +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sources` + +* `reload` + + +### `create` + +Submit a Pulsar IO source connector to run in a Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources create options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. +| `--tenant` | The source's tenant. +|`--producer-config`| The custom producer configuration (as a JSON string). + +### `update` + +Update an already submitted Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources update options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. The `source-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. +| `--tenant` | The source's tenant. +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + + +### `delete` + +Delete a Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources delete options + +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `get` + +Get the information about a Pulsar IO source connector. + +#### Usage + +```bash + +$ pulsar-admin sources get options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `status` + +Check the current status of a Pulsar Source. + +#### Usage + +```bash + +$ pulsar-admin sources status options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source ID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `list` + +List all running Pulsar IO source connectors. + +#### Usage + +```bash + +$ pulsar-admin sources list options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `stop` + +Stop a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources stop options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `start` + +Start a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources start options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `restart` + +Restart a source instance. + +#### Usage + +```bash + +$ pulsar-admin sources restart options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `localrun` + +Run a Pulsar IO source connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources localrun options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the Source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The source's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--deserialization-classname`|The SerDe classname for the source. +|`--destination-topic-name`|The Pulsar topic to which data is sent. +|`--disk`|The disk (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +|`--name`|The source’s name.| +|`--namespace`|The source’s namespace.| +|`--parallelism`|The source’s parallelism factor, that is, the number of source instances to run.| +|`--processing-guarantees` | The processing guarantees (also named as delivery semantics) applied to the source. A source connector receives messages from external system and writes messages to a Pulsar topic. The `--processing-guarantees` is used to ensure the processing guarantees for writing messages to the Pulsar topic.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +|`--source-config`|Source config key/values. +|`--source-config-file`|The path to a YAML config file specifying the source’s configuration. +|`--source-type`|The source's connector provider. +|`--tenant`|The source’s tenant. +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +|`--use-tls`|Use tls connection.
    **Default value: false**. +|`--producer-config`| The custom producer configuration (as a JSON string). + +### `available-sources` + +Get the list of Pulsar IO connector sources supported by Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sources available-sources + +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash + +$ pulsar-admin sources reload + +``` + +## `sinks` + +An interface for managing Pulsar IO sinks (egress data from Pulsar). + +```bash + +$ pulsar-admin sinks subcommands + +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sinks` + +* `reload` + + +### `create` + +Submit a Pulsar IO sink connector to run in a Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks create options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. The `sink-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). + +### `update` + +Update a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks update options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + +### `delete` + +Delete a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks delete options + +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `get` + +Get the information about a Pulsar IO sink connector. + +#### Usage + +```bash + +$ pulsar-admin sinks get options + +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `status` + +Check the current status of a Pulsar sink. + +#### Usage + +```bash + +$ pulsar-admin sinks status options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink ID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `list` + +List all running Pulsar IO sink connectors. + +#### Usage + +```bash + +$ pulsar-admin sinks list options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `stop` + +Stop a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks stop options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `start` + +Start a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks start options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `restart` + +Restart a sink instance. + +#### Usage + +```bash + +$ pulsar-admin sinks restart options + +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `localrun` + +Run a Pulsar IO sink connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks localrun options + +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The sink's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per sink instance (applicable only to the Docker runtime). +| `--custom-schema-inputs` | The map of input topics to Schema types or class names (as a JSON string). +| `--max-redeliver-count` | Maximum number of times that a message is redelivered before being sent to the dead letter queue. +| `--dead-letter-topic` | Name of the dead letter topic where the failing messages are sent. +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +|`--disk`|The disk (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +| `-i`, `--inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name`|The sink’s name.| +|`--namespace`|The sink’s namespace.| +|`--parallelism`|The sink’s parallelism factor, that is, the number of sink instances to run.| +|`--processing-guarantees`|The processing guarantees (also known as delivery semantics) applied to the sink. The `--processing-guarantees` implementation in Pulsar also relies on sink implementation.
    The available values are ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--retain-ordering` | Sink consumes and sinks messages in order. +|`--sink-config`|sink config key/values. +|`--sink-config-file`|The path to a YAML config file specifying the sink’s configuration. +|`--sink-type`|The sink's connector provider. +|`--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +|`--tenant`|The sink’s tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--negative-ack-redelivery-delay-ms` | The negatively-acknowledged message redelivery delay in milliseconds. | +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +|`--use-tls`|Use tls connection.
    **Default value: false**. + +### `available-sinks` + +Get the list of Pulsar IO connector sinks supported by Pulsar cluster. + +#### Usage + +```bash + +$ pulsar-admin sinks available-sinks + +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash + +$ pulsar-admin sinks reload + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-connectors.md b/site2/website/versioned_docs/version-2.9.x/io-connectors.md new file mode 100644 index 0000000000000..957a02a5a1964 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-connectors.md @@ -0,0 +1,249 @@ +--- +id: io-connectors +title: Built-in connector +sidebar_label: "Built-in connector" +original_id: io-connectors +--- + +Pulsar distribution includes a set of common connectors that have been packaged and tested with the rest of Apache Pulsar. These connectors import and export data from some of the most commonly used data systems. + +Using any of these connectors is as easy as writing a simple connector and running the connector locally or submitting the connector to a Pulsar Functions cluster. + +## Source connector + +Pulsar has various source connectors, which are sorted alphabetically as below. 
+ +### Canal + +* [Configuration](io-canal-source.md#configuration) + +* [Example](io-canal-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) + + +### Debezium MySQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-mysql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java) + +### Debezium PostgreSQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-postgresql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java) + +### Debezium MongoDB + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-mongodb) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/java/org/apache/pulsar/io/debezium/mongodb/DebeziumMongoDbSource.java) + +### Debezium Oracle + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-oracle) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/oracle/src/main/java/org/apache/pulsar/io/debezium/oracle/DebeziumOracleSource.java) + +### Debezium Microsoft SQL Server + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-microsoft-sql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mssql/src/main/java/org/apache/pulsar/io/debezium/mssql/DebeziumMsSqlSource.java) + + +### DynamoDB + +* [Configuration](io-dynamodb-source.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/dynamodb/src/main/java/org/apache/pulsar/io/dynamodb/DynamoDBSource.java) + +### File + +* [Configuration](io-file-source.md#configuration) + +* [Example](io-file-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/file/src/main/java/org/apache/pulsar/io/file/FileSource.java) + +### Flume + +* [Configuration](io-flume-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/FlumeConnector.java) + +### Twitter firehose + +* [Configuration](io-twitter-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/twitter/src/main/java/org/apache/pulsar/io/twitter/TwitterFireHose.java) + +### Kafka + +* [Configuration](io-kafka-source.md#configuration) + +* [Example](io-kafka-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java) + +### Kinesis + +* [Configuration](io-kinesis-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSource.java) + +### Netty + +* [Configuration](io-netty-source.md#configuration) + +* [Example of TCP](io-netty-source.md#tcp) + +* [Example of HTTP](io-netty-source.md#http) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/netty/src/main/java/org/apache/pulsar/io/netty/NettySource.java) + +### NSQ + +* [Configuration](io-nsq-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/nsq/src/main/java/org/apache/pulsar/io/nsq/NSQSource.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-source.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSource.java) + +## Sink connector + +Pulsar has various sink connectors, which are sorted alphabetically as below. + +### Aerospike + +* [Configuration](io-aerospike-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/aerospike/src/main/java/org/apache/pulsar/io/aerospike/AerospikeStringSink.java) + +### Cassandra + +* [Configuration](io-cassandra-sink.md#configuration) + +* [Example](io-cassandra-sink.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/cassandra/src/main/java/org/apache/pulsar/io/cassandra/CassandraStringSink.java) + +### ElasticSearch + +* [Configuration](io-elasticsearch-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/elastic-search/src/main/java/org/apache/pulsar/io/elasticsearch/ElasticSearchSink.java) + +### Flume + +* [Configuration](io-flume-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/sink/StringSink.java) + +### HBase + +* [Configuration](io-hbase-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hbase/src/main/java/org/apache/pulsar/io/hbase/HbaseAbstractConfig.java) + +### HDFS2 + +* [Configuration](io-hdfs2-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs2/src/main/java/org/apache/pulsar/io/hdfs2/AbstractHdfsConnector.java) + +### HDFS3 + +* [Configuration](io-hdfs3-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs3/src/main/java/org/apache/pulsar/io/hdfs3/AbstractHdfsConnector.java) + +### InfluxDB + +* [Configuration](io-influxdb-sink.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/influxdb/src/main/java/org/apache/pulsar/io/influxdb/InfluxDBGenericRecordSink.java) + +### JDBC ClickHouse + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-clickhouse) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/clickhouse/src/main/java/org/apache/pulsar/io/jdbc/ClickHouseJdbcAutoSchemaSink.java) + +### JDBC MariaDB + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-mariadb) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/mariadb/src/main/java/org/apache/pulsar/io/jdbc/MariadbJdbcAutoSchemaSink.java) + +### JDBC PostgreSQL + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-postgresql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/postgres/src/main/java/org/apache/pulsar/io/jdbc/PostgresJdbcAutoSchemaSink.java) + +### JDBC SQLite + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#example-for-sqlite) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/sqlite/src/main/java/org/apache/pulsar/io/jdbc/SqliteJdbcAutoSchemaSink.java) + +### Kafka + +* [Configuration](io-kafka-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java) + +### Kinesis + +* [Configuration](io-kinesis-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSink.java) + +### MongoDB + +* [Configuration](io-mongo-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/mongo/src/main/java/org/apache/pulsar/io/mongodb/MongoSink.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-sink.md#configuration) + +* 
[Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSink.java) + +### Redis + +* [Configuration](io-redis-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/redis/src/main/java/org/apache/pulsar/io/redis/RedisAbstractConfig.java) + +### Solr + +* [Configuration](io-solr-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/solr/src/main/java/org/apache/pulsar/io/solr/SolrSinkConfig.java) + diff --git a/site2/website/versioned_docs/version-2.9.x/io-debezium-source.md b/site2/website/versioned_docs/version-2.9.x/io-debezium-source.md new file mode 100644 index 0000000000000..f739a4cdc4903 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-debezium-source.md @@ -0,0 +1,798 @@ +--- +id: io-debezium-source +title: Debezium source connector +sidebar_label: "Debezium source connector" +original_id: io-debezium-source +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL +and persists the messages to Pulsar topics. + +## Configuration + +The configuration of Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. | +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. 
| +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server that are monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for history topic. | +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | +| `json-with-envelope` | false | false | Whether the message includes the schema in addition to the payload. If `false`, the message consists of the payload only. | +| `database.history.pulsar.reader.config` | false | null | The configs of the reader for the database schema history topic, in the form of a JSON string with key-value pairs.
    **Note:** This property is only available in 2.9.4 and later versions. | +| `offset.storage.reader.config` | false | null | The configs of the reader for the kafka connector offsets topic, in the form of a JSON string with key-value pairs.
    **Note:** This property is only available in 2.9.4 and later versions.| + +### Converter Options + +1. org.apache.kafka.connect.json.JsonConverter + +This config `json-with-envelope` is valid only for the JsonConverter. Its default value is false, in which case the consumer uses the schema ` +Schema.KeyValue(Schema.AUTO_CONSUME(), Schema.AUTO_CONSUME(), KeyValueEncodingType.SEPARATED)`, +and the message consists only of the payload. + +If the config `json-with-envelope` value is true, the consumer uses the schema +`Schema.KeyValue(Schema.BYTES, Schema.BYTES)`, and the message consists of the schema and the payload. + +2. org.apache.pulsar.kafka.shade.io.confluent.connect.avro.AvroConverter + +If users select the AvroConverter, then the pulsar consumer should use the schema `Schema.KeyValue(Schema.AUTO_CONSUME(), +Schema.AUTO_CONSUME(), KeyValueEncodingType.SEPARATED)`, and the message consists of the payload. + +### MongoDB Configuration +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `mongodb.hosts` | true | null | The comma-separated list of hostname and port pairs (in the form 'host' or 'host:port') of the MongoDB servers in the replica set. The list contains a single hostname and a port pair. If mongodb.members.auto.discover is set to false, the host and port pair are prefixed with the replica set name (e.g., rs0/localhost:27017). | +| `mongodb.name` | true | null | A unique name that identifies the connector and/or MongoDB replica set or sharded cluster that this connector monitors. Each server should be monitored by at most one Debezium connector, since this server name prefixes all persisted Kafka topics emanating from the MongoDB replica set or cluster. | +| `mongodb.user` | true | null | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.password` | true | null | Password to be used when connecting to MongoDB. 
This is required only when MongoDB is configured to use authentication. | +| `mongodb.task.id` | true | null | The taskId of the MongoDB connector that attempts to use a separate task for each replica set. | + +### Customize the Reader config for the metadata topics + +:::note + +The customization is only available in 2.9.4 and later versions. + +::: + +The Debezium Connector exposes `database.history.pulsar.reader.config` and `offset.storage.reader.config` to configure the readers of the database schema history topic and the Kafka connector offsets topic. For example, it can be used to configure the subscription name and other reader configurations. You can find the available configurations at [ReaderConfigurationData](https://github.com/apache/pulsar/blob/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/conf/ReaderConfigurationData.java). + +For example, to configure the subscription name for both Readers, you can add the following configuration: +* JSON + + ```json + { + "configs": { + "database.history.pulsar.reader.config": "{\"subscriptionName\":\"history-reader\"}", + "offset.storage.reader.config": "{\"subscriptionName\":\"offset-reader\"}" + } + } + ``` + +* YAML + + ```yaml + configs: + database.history.pulsar.reader.config: "{\"subscriptionName\":\"history-reader\"}" + offset.storage.reader.config: "{\"subscriptionName\":\"offset-reader\"}" + ``` + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. 
+ +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "offset.storage.topic": "offset-topic" + } + + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: "offset-topic" + + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar Debezium connector. 
+ +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-@pulsar:version@.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`. 
+ - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. + + ```bash + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MySQL client in docker. + + ```bash + + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. + +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. 
+ +* JSON + + ```json + + { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "changeme", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "plugin.name": "pgoutput", + "schema.whitelist": "public", + "table.whitelist": "public.users", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for postgres version 10+, official docker image: postgres:<10+> + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "changeme" + database.dbname: "postgres" + database.server.name: "dbserver1" + plugin.name: "pgoutput" + schema.whitelist: "public" + table.whitelist: "public.users" + + ## PULSAR_SERVICE_URL_CONFIG + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +Notice that `pgoutput` is a standard plugin of Postgres introduced in version 10 - [see Postgres architecture docu](https://www.postgresql.org/docs/10/logical-replication-architecture.html). You don't need to install anything, just make sure the WAL level is set to `logical` (see docker command below and [Postgres docu](https://www.postgresql.org/docs/current/runtime-config-wal.html)). + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. 
+ + ```bash + + $ docker run -d -it --rm \ + --name pulsar-postgres \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=changeme \ + postgres:13.3 -c wal_level=logical + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-@pulsar:version@.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "changeme","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "public","table.whitelist": "public.users","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`. + - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. 
The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + + ``` + +4. Subscribe the topic _sub-users_ for the _public.users_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-users" public/default/dbserver1.public.users -n 0 + + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + + $ docker exec -it pulsar-postgres /bin/bash + + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to create sample data in the table _users_. + + ``` + + psql -U postgres -h localhost -p 5432 + Password for user postgres: + + CREATE TABLE users( + id BIGINT GENERATED ALWAYS AS IDENTITY, PRIMARY KEY(id), + hash_firstname TEXT NOT NULL, + hash_lastname TEXT NOT NULL, + gender VARCHAR(6) NOT NULL CHECK (gender IN ('male', 'female')) + ); + + INSERT INTO users(hash_firstname, hash_lastname, gender) + SELECT md5(RANDOM()::TEXT), md5(RANDOM()::TEXT), CASE WHEN RANDOM() < 0.5 THEN 'male' ELSE 'female' END FROM generate_series(1, 100); + + postgres=# select * from users; + + id | hash_firstname | hash_lastname | gender + -------+----------------------------------+----------------------------------+-------- + 1 | 02bf7880eb489edc624ba637f5ab42bd | 3e742c2cc4217d8e3382cc251415b2fb | female + 2 | dd07064326bb9119189032316158f064 | 9c0e938f9eddbd5200ba348965afbc61 | male + 3 | 2c5316fdd9d6595c1cceb70eed12e80c | 8a93d7d8f9d76acfaaa625c82a03ea8b | female + 4 | 3dfa3b4f70d8cd2155567210e5043d2b | 32c156bc28f7f03ab5d28e2588a3dc19 | female + + + postgres=# UPDATE users SET hash_firstname='maxim' WHERE id=1; + UPDATE 1 + + ``` + + In the terminal window of subscribing
topic, you can receive the following messages. + + ```bash + + ----- got message ----- + {"before":null,"after":{"id":1,"hash_firstname":"maxim","hash_lastname":"292113d30a3ccee0e19733dd7f88b258","gender":"male"},"source":{"version":"1.0.0.Final","connector":"postgresql","name":"foobar","ts_ms":1624045862644,"snapshot":"false","db":"postgres","schema":"public","table":"users","txId":595,"lsn":24419784,"xmin":null},"op":"u","ts_ms":1624045862648} + ...many more + + ``` + +## Example of MongoDB + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + + { + "mongodb.hosts": "rs0/mongodb:27017", + "mongodb.name": "dbserver1", + "mongodb.user": "debezium", + "mongodb.password": "dbz", + "mongodb.task.id": "1", + "database.whitelist": "inventory", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" + } + + ``` + +* YAML + + You can create a `debezium-mongodb-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/resources/debezium-mongodb-source-config.yaml) below to the `debezium-mongodb-source-config.yaml` file. + + ```yaml + + tenant: "public" + namespace: "default" + name: "debezium-mongodb-source" + topicName: "debezium-mongodb-topic" + archive: "connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar" + parallelism: 1 + + configs: + + ## config for mongodb, docker image: debezium/example-mongodb:0.10 + mongodb.hosts: "rs0/mongodb:27017" + mongodb.name: "dbserver1" + mongodb.user: "debezium" + mongodb.password: "dbz" + mongodb.task.id: "1" + database.whitelist: "inventory" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ``` + +### Usage + +This example shows how to change the data of a MongoDB table using the Pulsar Debezium connector. + + +1.
Start a MongoDB server with a database from which Debezium can capture changes. + + ```bash + + $ docker pull debezium/example-mongodb:0.10 + $ docker run -d -it --rm --name pulsar-mongodb -e MONGODB_USER=mongodb -e MONGODB_PASSWORD=mongodb -p 27017:27017 debezium/example-mongodb:0.10 + + ``` + + Use the following commands to initialize the data. + + ``` bash + + ./usr/local/bin/init-inventory.sh + + ``` + + If the local host cannot access the container network, you can update the file ```/etc/hosts``` and add a rule ```127.0.0.1 6f114527a95f```. Here, `6f114527a95f` is the container ID, which you can get by running ```docker ps -a```. + + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar`. + + ```bash + + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mongodb-@pulsar:version@.nar \ + --name debezium-mongodb-source \ + --destination-topic-name debezium-mongodb-topic \ + --tenant public \ + --namespace default \ + --source-config '{"mongodb.hosts": "rs0/mongodb:27017","mongodb.name": "dbserver1","mongodb.user": "debezium","mongodb.password": "dbz","mongodb.task.id": "1","database.whitelist": "inventory","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650"}' + + ``` + + :::note + + Currently, the destination topic (specified by the `destination-topic-name` option ) is a required configuration but it is not used for the Debezium connector to save data. The Debezium connector saves data in the following 4 types of topics: + + - One topic named with the database server name ( `database.server.name`) for storing the database metadata messages, such as `public/default/database.server.name`.
+ - One topic (`database.history.pulsar.topic`) for storing the database history information. The connector writes and recovers DDL statements on this topic. + - One topic (`offset.storage.topic`) for storing the offset metadata messages. The connector saves the last successfully-committed offsets on this topic. + - One per-table topic. The connector writes change events for all operations that occur in a table to a single Pulsar topic that is specific to that table. + + If the automatic topic creation is disabled on your broker, you need to manually create the above 4 types of topics and the destination topic. + + ::: + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mongodb-source-config.yaml + + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + + ``` + +5. Start a MongoDB client in docker. + + ```bash + + $ docker exec -it pulsar-mongodb /bin/bash + + ``` + +6. A MongoDB client pops out. + + ```bash + + mongo -u debezium -p dbz --authenticationDatabase admin localhost:27017/inventory + db.products.update({"_id":NumberLong(104)},{$set:{weight:1.25}}) + + ``` + + In the terminal window of subscribing topic, you can receive the following messages. 
+ + ```bash + + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":"104"}}, value = {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"int64","optional":true,"field":"h"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"after":"{\"_id\": {\"$numberLong\": \"104\"},\"name\": \"hammer\",\"description\": \"12oz carpenter's hammer\",\"weight\": 1.25,\"quantity\": 4}","patch":null,"source":{"version":"0.10.0.Final","connector":"mongodb","name":"dbserver1","ts_ms":1573541905000,"snapshot":"true","db":"inventory","rs":"rs0","collection":"products","ord":1,"h":4983083486544392763},"op":"r","ts_ms":1573541909761}}. + + ``` + +## Example of Oracle + +### Packaging + +Oracle connector does not include Oracle JDBC driver and you need to package it with the connector. +Major reasons for not including the drivers are the variety of versions and Oracle licensing. 
It is recommended to use the driver provided with your Oracle DB installation, or you can [download](https://www.oracle.com/database/technologies/appdev/jdbc.html) one. +The integration tests have an [example](https://github.com/apache/pulsar/blob/e2bc52d40450fa00af258c4432a5b71d50a5c6e0/tests/docker-images/latest-version-image/Dockerfile#L110-L122) of packaging the driver into the connector nar file. + +### Configuration + +Debezium [requires](https://debezium.io/documentation/reference/1.5/connectors/oracle.html#oracle-overview) Oracle DB with LogMiner or XStream API enabled. +Supported options and steps for enabling them vary from version to version of Oracle DB. +Steps outlined in the [documentation](https://debezium.io/documentation/reference/1.5/connectors/oracle.html#oracle-overview) and used in the [integration test](https://github.com/apache/pulsar/blob/master/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io/sources/debezium/DebeziumOracleDbSourceTester.java) may or may not work for the version and edition of Oracle DB you are using. +Please refer to the [documentation for Oracle DB](https://docs.oracle.com/en/database/oracle/oracle-database/) as needed. + +Similarly to other connectors, you can use JSON or YAML to configure the connector.
+Using yaml as an example, you can create a debezium-oracle-source-config.yaml file like: + +* JSON + +```json + +{ + "database.hostname": "localhost", + "database.port": "1521", + "database.user": "dbzuser", + "database.password": "dbz", + "database.dbname": "XE", + "database.server.name": "XE", + "schema.exclude.list": "system,dbzuser", + "snapshot.mode": "initial", + "topic.namespace": "public/default", + "task.class": "io.debezium.connector.oracle.OracleConnectorTask", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "typeClassName": "org.apache.pulsar.common.schema.KeyValue", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.tcpKeepAlive": "true", + "decimal.handling.mode": "double", + "database.history.pulsar.topic": "debezium-oracle-source-history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" +} + +``` + +* YAML + +```yaml + +tenant: "public" +namespace: "default" +name: "debezium-oracle-source" +topicName: "debezium-oracle-topic" +parallelism: 1 + +className: "org.apache.pulsar.io.debezium.oracle.DebeziumOracleSource" +database.dbname: "XE" + +configs: + database.hostname: "localhost" + database.port: "1521" + database.user: "dbzuser" + database.password: "dbz" + database.dbname: "XE" + database.server.name: "XE" + schema.exclude.list: "system,dbzuser" + snapshot.mode: "initial" + topic.namespace: "public/default" + task.class: "io.debezium.connector.oracle.OracleConnectorTask" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + key.converter: "org.apache.kafka.connect.json.JsonConverter" + typeClassName: "org.apache.pulsar.common.schema.KeyValue" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.tcpKeepAlive: "true" + decimal.handling.mode: "double" + database.history.pulsar.topic: "debezium-oracle-source-history-topic" + 
database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + +``` + +For the full list of configuration properties supported by Debezium, see [Debezium Connector for Oracle](https://debezium.io/documentation/reference/1.5/connectors/oracle.html#oracle-connector-properties). + +## Example of Microsoft SQL + +### Configuration + +Debezium [requires](https://debezium.io/documentation/reference/1.5/connectors/sqlserver.html#sqlserver-overview) SQL Server with CDC enabled. +To enable CDC, follow the steps outlined in the [documentation](https://debezium.io/documentation/reference/1.5/connectors/sqlserver.html#setting-up-sqlserver) and used in the [integration test](https://github.com/apache/pulsar/blob/master/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io/sources/debezium/DebeziumMsSqlSourceTester.java). +For more information, see [Enable and disable change data capture in Microsoft SQL Server](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/enable-and-disable-change-data-capture-sql-server). + +Similarly to other connectors, you can use JSON or YAML to configure the connector.
+ +* JSON + +```json + +{ + "database.hostname": "localhost", + "database.port": "1433", + "database.user": "sa", + "database.password": "MyP@ssw0rd!", + "database.dbname": "MyTestDB", + "database.server.name": "mssql", + "snapshot.mode": "schema_only", + "topic.namespace": "public/default", + "task.class": "io.debezium.connector.sqlserver.SqlServerConnectorTask", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "typeClassName": "org.apache.pulsar.common.schema.KeyValue", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.tcpKeepAlive": "true", + "decimal.handling.mode": "double", + "database.history.pulsar.topic": "debezium-mssql-source-history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650" +} + +``` + +* YAML + +```yaml + +tenant: "public" +namespace: "default" +name: "debezium-mssql-source" +topicName: "debezium-mssql-topic" +parallelism: 1 + +className: "org.apache.pulsar.io.debezium.mssql.DebeziumMsSqlSource" +database.dbname: "mssql" + +configs: + database.hostname: "localhost" + database.port: "1433" + database.user: "sa" + database.password: "MyP@ssw0rd!" 
+ database.dbname: "MyTestDB" + database.server.name: "mssql" + snapshot.mode: "schema_only" + topic.namespace: "public/default" + task.class: "io.debezium.connector.sqlserver.SqlServerConnectorTask" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + key.converter: "org.apache.kafka.connect.json.JsonConverter" + typeClassName: "org.apache.pulsar.common.schema.KeyValue" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.tcpKeepAlive: "true" + decimal.handling.mode: "double" + database.history.pulsar.topic: "debezium-mssql-source-history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + +``` + +For the full list of configuration properties supported by Debezium, see [Debezium Connector for MS SQL](https://debezium.io/documentation/reference/1.5/connectors/sqlserver.html#sqlserver-connector-properties). + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt + +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$Lambda$203/385424085.accept(Unknown Source) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at 
io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) + +``` + +If you encounter the above problems in synchronizing data, please refer to [this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + 
+```$xslt + +max.queue.size= + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-debug.md b/site2/website/versioned_docs/version-2.9.x/io-debug.md new file mode 100644 index 0000000000000..844e101d00d2a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-debug.md @@ -0,0 +1,407 @@ +--- +id: io-debug +title: How to debug Pulsar connectors +sidebar_label: "Debug" +original_id: io-debug +--- +This guide explains how to debug connectors in localrun or cluster mode and gives a debugging checklist. +To better demonstrate how to debug Pulsar connectors, here takes a Mongo sink connector as an example. + +**Deploy a Mongo sink environment** +1. Start a Mongo service. + + ```bash + + docker pull mongo:4 + docker run -d -p 27017:27017 --name pulsar-mongo -v $PWD/data:/data/db mongo:4 + + ``` + +2. Create a DB and a collection. + + ```bash + + docker exec -it pulsar-mongo /bin/bash + mongo + > use pulsar + > db.createCollection('messages') + > exit + + ``` + +3. Start Pulsar standalone. + + ```bash + + docker pull apachepulsar/pulsar:2.4.0 + docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --link pulsar-mongo --name pulsar-mongo-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone + + ``` + +4. Configure the Mongo sink with the `mongo-sink-config.yaml` file. + + ```bash + + configs: + mongoUri: "mongodb://pulsar-mongo:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + + ``` + + ```bash + + docker cp mongo-sink-config.yaml pulsar-mongo-standalone:/pulsar/ + + ``` + +5. Download the Mongo sink nar package. + + ```bash + + docker exec -it pulsar-mongo-standalone /bin/bash + curl -O http://apache.01link.hk/pulsar/pulsar-2.4.0/connectors/pulsar-io-mongo-2.4.0.nar + + ``` + +## Debug in localrun mode +Start the Mongo sink in localrun mode using the `localrun` command. +:::tip + +For more information about the `localrun` command, see [`localrun`](reference-connector-admin.md/#localrun-1). 
+ +::: + +```bash + +./bin/pulsar-admin sinks localrun \ +--archive pulsar-io-mongo-2.4.0.nar \ +--tenant public --namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 + +``` + +### Use connector log +Use one of the following methods to get a connector log in localrun mode: +* After executing the `localrun` command, the **log is automatically printed on the console**. +* The log is located at: + + ```bash + + logs/functions/tenant/namespace/function-name/function-name-instance-id.log + + ``` + + **Example** + + The path of the Mongo sink connector is: + + ```bash + + logs/functions/public/default/pulsar-mongo-sink/pulsar-mongo-sink-0.log + + ``` + +To clearly explain the log information, here breaks down the large block of information into small blocks and add descriptions for each block. +* This piece of log information shows the storage path of the nar package after decompression. + + ``` + + 08:21:54.132 [main] INFO org.apache.pulsar.common.nar.NarClassLoader - Created class loader with paths: [file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/, file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/, + + ``` + + :::tip + + If `class cannot be found` exception is thrown, check whether the nar file is decompressed in the folder `file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/` or not. + + ::: + +* This piece of log information illustrates the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, resources, and so on, which can be used to **check whether the Mongo sink connector is configured correctly or not**. 
+ + ```bash + + 08:21:55.390 [main] INFO org.apache.pulsar.functions.runtime.ThreadRuntime - ThreadContainer starting function with instance config InstanceConfig(instanceId=0, functionId=853d60a1-0c48-44d5-9a5c-6917386476b2, functionVersion=c2ce1458-b69e-4175-88c0-a0a856a2be8c, functionDetails=tenant: "public" + namespace: "default" + name: "pulsar-mongo-sink" + className: "org.apache.pulsar.functions.api.utils.IdentityFunction" + autoAck: true + parallelism: 1 + source { + typeClassName: "[B" + inputSpecs { + key: "test-mongo" + value { + } + } + cleanupSubscription: true + } + sink { + className: "org.apache.pulsar.io.mongodb.MongoSink" + configs: "{\"mongoUri\":\"mongodb://pulsar-mongo:27017\",\"database\":\"pulsar\",\"collection\":\"messages\",\"batchSize\":2,\"batchTimeMs\":500}" + typeClassName: "[B" + } + resources { + cpu: 1.0 + ram: 1073741824 + disk: 10737418240 + } + componentType: SINK + , maxBufferedTuples=1024, functionAuthenticationSpec=null, port=38459, clusterName=local) + + ``` + +* This piece of log information demonstrates the status of the connections to Mongo and configuration information. 
+ + ```bash + + 08:21:56.231 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.connection - Opened connection [connectionId{localValue:1, serverValue:8}] to pulsar-mongo:27017 + 08:21:56.326 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.cluster - Monitor thread successfully connected to server with description ServerDescription{address=pulsar-mongo:27017, type=STANDALONE, state=CONNECTED, ok=true, version=ServerVersion{versionList=[4, 2, 0]}, minWireVersion=0, maxWireVersion=8, maxDocumentSize=16777216, logicalSessionTimeoutMinutes=30, roundTripTimeNanos=89058800} + + ``` + +* This piece of log information explains the configuration of consumers and clients, including the topic name, subscription name, subscription type, and so on. + + ```bash + + 08:21:56.719 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Starting Pulsar consumer status recorder with config: { + "topicNames" : [ "test-mongo" ], + "topicsPattern" : null, + "subscriptionName" : "public/default/pulsar-mongo-sink", + "subscriptionType" : "Shared", + "receiverQueueSize" : 1000, + "acknowledgementsGroupTimeMicros" : 100000, + "negativeAckRedeliveryDelayMicros" : 60000000, + "maxTotalReceiverQueueSizeAcrossPartitions" : 50000, + "consumerName" : null, + "ackTimeoutMillis" : 0, + "tickDurationMillis" : 1000, + "priorityLevel" : 0, + "cryptoFailureAction" : "CONSUME", + "properties" : { + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink", + "instance_id" : "0" + }, + "readCompacted" : false, + "subscriptionInitialPosition" : "Latest", + "patternAutoDiscoveryPeriod" : 1, + "regexSubscriptionMode" : "PersistentOnly", + "deadLetterPolicy" : null, + "autoUpdatePartitions" : true, + "replicateSubscriptionState" : false, + "resetIncludeHead" : false + } + 08:21:56.726 [pulsar-client-io-1-1] INFO 
org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Pulsar client config: { + "serviceUrl" : "pulsar://localhost:6650", + "authPluginClassName" : null, + "authParams" : null, + "operationTimeoutMs" : 30000, + "statsIntervalSeconds" : 60, + "numIoThreads" : 1, + "numListenerThreads" : 1, + "connectionsPerBroker" : 1, + "useTcpNoDelay" : true, + "useTls" : false, + "tlsTrustCertsFilePath" : null, + "tlsAllowInsecureConnection" : false, + "tlsHostnameVerificationEnable" : false, + "concurrentLookupRequest" : 5000, + "maxLookupRequest" : 50000, + "maxNumberOfRejectedRequestPerConnection" : 50, + "keepAliveIntervalSeconds" : 30, + "connectionTimeoutMs" : 10000, + "requestTimeoutMs" : 60000, + "defaultBackoffIntervalNanos" : 100000000, + "maxBackoffIntervalNanos" : 30000000000 + } + + ``` + +## Debug in cluster mode +You can use the following methods to debug a connector in cluster mode: +* [Use connector log](#use-connector-log) +* [Use admin CLI](#use-admin-cli) +### Use connector log +In cluster mode, multiple connectors can run on a worker. To find the log path of a specified connector, use the `workerId` to locate the connector log. +### Use admin CLI +Pulsar admin CLI helps you debug Pulsar connectors with the following subcommands: +* [`get`](#get) + +* [`status`](#status) +* [`topics stats`](#topics-stats) + +**Create a Mongo sink** + +```bash + +./bin/pulsar-admin sinks create \ +--archive pulsar-io-mongo-2.4.0.nar \ +--tenant public \ +--namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 + +``` + +### `get` +Use the `get` command to get the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, and so on. 
+ +```bash + +./bin/pulsar-admin sinks get --tenant public --namespace default --name pulsar-mongo-sink +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-mongo-sink", + "className": "org.apache.pulsar.io.mongodb.MongoSink", + "inputSpecs": { + "test-mongo": { + "isRegexPattern": false + } + }, + "configs": { + "mongoUri": "mongodb://pulsar-mongo:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": 2.0, + "batchTimeMs": 500.0 + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +:::tip + +For more information about the `get` command, see [`get`](reference-connector-admin.md/#get-1). + +::: + +### `status` +Use the `status` command to get the current status about the Mongo sink connector, such as the number of instance, the number of running instance, instanceId, workerId and so on. + +```bash + +./bin/pulsar-admin sinks status +--tenant public \ +--namespace default \ +--name pulsar-mongo-sink +{ +"numInstances" : 1, +"numRunning" : 1, +"instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-5d202832fd18-8080" + } +} ] +} + +``` + +:::tip + +For more information about the `status` command, see [`status`](reference-connector-admin.md/#stauts-1). +If there are multiple connectors running on a worker, `workerId` can locate the worker on which the specified connector is running. + +::: + +### `topics stats` +Use the `topics stats` command to get the stats for a topic and its connected producer and consumer, such as whether the topic has received messages or not, whether there is a backlog of messages or not, the available permits and other key information. 
All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +```bash + +./bin/pulsar-admin topics stats test-mongo +{ + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "averageMsgSize" : 0.0, + "storageSize" : 1, + "publishers" : [ ], + "subscriptions" : { + "public/default/pulsar-mongo-sink" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "msgBacklog" : 0, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "type" : "Shared", + "msgRateExpired" : 0.0, + "consumers" : [ { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "consumerName" : "dffdd", + "availablePermits" : 999, + "unackedMessages" : 0, + "blockedConsumerOnUnackedMsgs" : false, + "metadata" : { + "instance_id" : "0", + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink" + }, + "connectedSince" : "2019-08-26T08:48:07.582Z", + "clientVersion" : "2.4.0", + "address" : "/172.17.0.3:57790" + } ], + "isReplicated" : false + } + }, + "replication" : { }, + "deduplicationStatus" : "Disabled" +} + +``` + +:::tip + +For more information about the `topic stats` command, see [`topic stats`](http://pulsar.apache.org/docs/en/pulsar-admin/#stats-1). + +::: + +## Checklist +This checklist indicates the major areas to check when you debug connectors. It is a reminder of what to look for to ensure a thorough review and an evaluation tool to get the status of connectors. +* Does Pulsar start successfully? + +* Does the external service run normally? + +* Is the nar package complete? + +* Is the connector configuration file correct? + +* In localrun mode, run a connector and check the printed information (connector log) on the console. + +* In cluster mode: + + * Use the `get` command to get the basic information. + + * Use the `status` command to get the current status. 
+ * Use the `topics stats` command to get the stats for a specified topic and its connected producers and consumers. + + * Check the connector log. +* Enter into the external system and verify the result. diff --git a/site2/website/versioned_docs/version-2.9.x/io-develop.md b/site2/website/versioned_docs/version-2.9.x/io-develop.md new file mode 100644 index 0000000000000..43637ac513029 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-develop.md @@ -0,0 +1,420 @@ +--- +id: io-develop +title: How to develop Pulsar connectors +sidebar_label: "Develop" +original_id: io-develop +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide describes how to develop Pulsar connectors to move data +between Pulsar and other systems. + +Pulsar connectors are special [Pulsar Functions](functions-overview.md), so creating a Pulsar connector is similar to creating a Pulsar function. + +Pulsar connectors come in two types: + +| Type | Description | Example +|---|---|--- +{@inject: github:Source:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java}|Import data from another system to Pulsar.|[RabbitMQ source connector](io-rabbitmq.md) imports the messages of a RabbitMQ queue to a Pulsar topic. +{@inject: github:Sink:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java}|Export data from Pulsar to another system.|[Kinesis sink connector](io-kinesis.md) exports the messages of a Pulsar topic to a Kinesis stream. + +## Develop + +You can develop Pulsar source connectors and sink connectors. 
+ +### Source + +Developing a source connector is to implement the {@inject: github:Source:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} +interface, which means you need to implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method and the {@inject: github:read:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + +1. Implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + + ```java + + /** + * Open connector with configuration + * + * @param config initialization config + * @param sourceContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SourceContext sourceContext) throws Exception; + + ``` + + This method is called when the source connector is initialized. + + In this method, you can retrieve all connector specific settings through the passed-in `config` parameter and initialize all necessary resources. + + For example, a Kafka connector can create a Kafka client in this `open` method. + + Besides, Pulsar runtime also provides a `SourceContext` for the + connector to access runtime resources for tasks like collecting metrics. The implementation can save the `SourceContext` for future use. + +2. Implement the {@inject: github:read:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} method. + + ```java + + /** + * Reads the next message from source. + * If source does not have any new messages, this call should block. + * @return next message from source. The return result should never be null + * @throws Exception + */ + Record read() throws Exception; + + ``` + + If nothing to return, the implementation should be blocking rather than returning `null`. 
+ + The returned {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should encapsulate the following information, which is needed by Pulsar IO runtime. + + * {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should provide the following variables: + + |Variable|Required|Description + |---|---|--- + `TopicName`|No|Pulsar topic name from which the record originates. + `Key`|No| Messages can optionally be tagged with keys.

    For more information, see [Routing modes](concepts-messaging.md#routing-modes).| + `Value`|Yes|Actual data of the record. + `EventTime`|No|Event time of the record from the source. + `PartitionId`|No| If the record is originated from a partitioned source, it returns its `PartitionId`.

    `PartitionId` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve exactly-once processing guarantee. + `RecordSequence`|No|If the record is originated from a sequential source, it returns its `RecordSequence`.

    `RecordSequence` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve exactly-once processing guarantee. + `Properties` |No| If the record carries user-defined properties, it returns those properties. + `DestinationTopic`|No|Topic to which message should be written. + `Message`|No|A class which carries data sent by users.

    For more information, see [Message.java](https://github.com/apache/pulsar/blob/master/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/Message.java).| + + * {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java} should provide the following methods: + + Method|Description + |---|--- + `ack` |Acknowledge that the record is fully processed. + `fail`|Indicate that the record fails to be processed. + +## Handle schema information + +Pulsar IO automatically handles the schema and provides a strongly typed API based on Java generics. +If you know the schema type that you are producing, you can declare the Java class relative to that type in your sink declaration. + +``` + +public class MySource implements Source { + public Record read() {} +} + +``` + +If you want to implement a source that works with any schema, you can go with `byte[]` (of `ByteBuffer`) and use Schema.AUTO_PRODUCE_BYTES(). + +``` + +public class MySource implements Source { + public Record read() { + + Schema wantedSchema = .... + Record myRecord = new MyRecordImplementation(); + .... 
+ } + class MyRecordImplementation implements Record { + public byte[] getValue() { + return ....encoded byte[]...that represents the value + } + public Schema getSchema() { + return Schema.AUTO_PRODUCE_BYTES(wantedSchema); + } + } +} + +``` + +To handle the `KeyValue` type properly, follow the guidelines for your record implementation: +- It must implement {@inject: github:Record:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/KVRecord.java} interface and implement `getKeySchema`,`getValueSchema`, and `getKeyValueEncodingType` +- It must return a `KeyValue` object as `Record.getValue()` +- It may return null in `Record.getSchema()` + +When Pulsar IO runtime encounters a `KVRecord`, it brings the following changes automatically: +- Set properly the `KeyValueSchema` +- Encode the Message Key and the Message Value according to the `KeyValueEncoding` (SEPARATED or INLINE) + +:::tip + +For more information about **how to create a source connector**, see {@inject: github:KafkaSource:/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java}. + +::: + +### Sink + +Developing a sink connector **is similar to** developing a source connector, that is, you need to implement the {@inject: github:Sink:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} interface, which means implementing the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method and the {@inject: github:write:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + +1. Implement the {@inject: github:open:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + + ```java + + /** + * Open connector with configuration + * + * @param config initialization config + * @param sinkContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SinkContext sinkContext) throws Exception; + + ``` + +2. 
Implement the {@inject: github:write:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} method. + + ```java + + /** + * Write a message to Sink + * @param record record to write to sink + * @throws Exception + */ + void write(Record record) throws Exception; + + ``` + + During the implementation, you can decide how to write the `Value` and + the `Key` to the actual source, and leverage all the provided information such as + `PartitionId` and `RecordSequence` to achieve different processing guarantees. + + You also need to ack records (if messages are sent successfully) or fail records (if messages fail to send). + +## Handling Schema information + +Pulsar IO handles automatically the Schema and provides a strongly typed API based on Java generics. +If you know the Schema type that you are consuming from you can declare the Java class relative to that type in your Sink declaration. + +``` + +public class MySink implements Sink { + public void write(Record record) {} +} + +``` + +If you want to implement a sink that works with any schema, you can you go with the special GenericObject interface. + +``` + +public class MySink implements Sink { + public void write(Record record) { + Schema schema = record.getSchema(); + GenericObject genericObject = record.getValue(); + if (genericObject != null) { + SchemaType type = genericObject.getSchemaType(); + Object nativeObject = genericObject.getNativeObject(); + ... + } + .... + } +} + +``` + +In the case of AVRO, JSON, and Protobuf records (schemaType=AVRO,JSON,PROTOBUF_NATIVE), you can cast the +`genericObject` variable to `GenericRecord` and use `getFields()` and `getField()` API. +You are able to access the native AVRO record using `genericObject.getNativeObject()`. + +In the case of KeyValue type, you can access both the schema for the key and the schema for the value using this code. 
+ +``` + +public class MySink implements Sink { + public void write(Record record) { + Schema schema = record.getSchema(); + GenericObject genericObject = record.getValue(); + SchemaType type = genericObject.getSchemaType(); + Object nativeObject = genericObject.getNativeObject(); + if (type == SchemaType.KEY_VALUE) { + KeyValue keyValue = (KeyValue) nativeObject; + Object key = keyValue.getKey(); + Object value = keyValue.getValue(); + + KeyValueSchema keyValueSchema = (KeyValueSchema) schema; + Schema keySchema = keyValueSchema.getKeySchema(); + Schema valueSchema = keyValueSchema.getValueSchema(); + } + .... + } +} + +``` + +## Test + +Testing connectors can be challenging because Pulsar IO connectors interact with two systems +that may be difficult to mock—Pulsar and the system to which the connector is connecting. + +It is +recommended writing special tests to test the connector functionalities as below +while mocking the external service. + +### Unit test + +You can create unit tests for your connector. + +### Integration test + +Once you have written sufficient unit tests, you can add +separate integration tests to verify end-to-end functionality. + +Pulsar uses [testcontainers](https://www.testcontainers.org/) **for all integration tests**. + +:::tip + +For more information about **how to create integration tests for Pulsar connectors**, see {@inject: github:IntegrationTests:/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io}. + +::: + +## Package + +Once you've developed and tested your connector, you need to package it so that it can be submitted +to a [Pulsar Functions](functions-overview.md) cluster. + +There are two methods to +work with Pulsar Functions' runtime, that is, [NAR](#nar) and [uber JAR](#uber-jar). + +:::note + +If you plan to package and distribute your connector for others to use, you are obligated to + +::: + +license and copyright your own code properly. 
Remember to add the license and copyright to +all libraries your code uses and to your distribution. +> +> If you use the [NAR](#nar) method, the NAR plugin +automatically creates a `DEPENDENCIES` file in the generated NAR package, including the proper +licensing and copyrights of all libraries of your connector. + +### NAR + +**NAR** stands for NiFi Archive, which is a custom packaging mechanism used by Apache NiFi, to provide +a bit of Java ClassLoader isolation. + +:::tip + +For more information about **how NAR works**, see [here](https://medium.com/hashmapinc/nifi-nar-files-explained-14113f7796fd). + +::: + +Pulsar uses the same mechanism for packaging **all** [built-in connectors](io-connectors.md). + +The easiest approach to package a Pulsar connector is to create a NAR package using [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin). + +Include this [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin) in your maven project for your connector as below. + +```xml + + + + org.apache.nifi + nifi-nar-maven-plugin + 1.2.0 + + + +``` + +You must also create a `resources/META-INF/services/pulsar-io.yaml` file with the following contents: + +```yaml + +name: connector name +description: connector description +sourceClass: fully qualified class name (only if source connector) +sinkClass: fully qualified class name (only if sink connector) + +``` + +For Gradle users, there is a [Gradle Nar plugin available on the Gradle Plugin Portal](https://plugins.gradle.org/plugin/io.github.lhotari.gradle-nar-plugin). + +:::tip + +For more information about an **how to use NAR for Pulsar connectors**, see {@inject: github:TwitterFirehose:/pulsar-io/twitter/pom.xml}. + +::: + +### Uber JAR + +An alternative approach is to create an **uber JAR** that contains all of the connector's JAR files +and other resource files. No directory internal structure is necessary. 
+ +You can use [maven-shade-plugin](https://maven.apache.org/plugins/maven-shade-plugin/examples/includes-excludes.html) to create a uber JAR as below: + +```xml + + + org.apache.maven.plugins + maven-shade-plugin + 3.1.1 + + + package + + shade + + + + + *:* + + + + + + + +``` + +## Monitor + +Pulsar connectors enable you to move data in and out of Pulsar easily. It is important to ensure that the running connectors are healthy at any time. You can monitor Pulsar connectors that have been deployed with the following methods: + +- Check the metrics provided by Pulsar. + + Pulsar connectors expose the metrics that can be collected and used for monitoring the health of **Java** connectors. You can check the metrics by following the [monitoring](deploy-monitoring.md) guide. + +- Set and check your customized metrics. + + In addition to the metrics provided by Pulsar, Pulsar allows you to customize metrics for **Java** connectors. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. + +Here is an example of how to customize metrics for a Java connector. 
+ +````mdx-code-block + + + +``` + +public class TestMetricSink implements Sink { + + @Override + public void open(Map config, SinkContext sinkContext) throws Exception { + sinkContext.recordMetric("foo", 1); + } + + @Override + public void write(Record record) throws Exception { + + } + + @Override + public void close() throws Exception { + + } + } + +``` + + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/io-dynamodb-source.md b/site2/website/versioned_docs/version-2.9.x/io-dynamodb-source.md new file mode 100644 index 0000000000000..ce585786eb042 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-dynamodb-source.md @@ -0,0 +1,80 @@ +--- +id: io-dynamodb-source +title: AWS DynamoDB source connector +sidebar_label: "AWS DynamoDB source connector" +original_id: io-dynamodb-source +--- + +The DynamoDB source connector pulls data from DynamoDB table streams and persists data into Pulsar. + +This connector uses the [DynamoDB Streams Kinesis Adapter](https://github.com/awslabs/dynamodb-streams-kinesis-adapter), +which uses the [Kinesis Consumer Library](https://github.com/awslabs/amazon-kinesis-client) (KCL) to do the actual +consuming of messages. The KCL uses DynamoDB to track state for consumers and requires cloudwatch access to log metrics. + + +## Configuration + +The configuration of the DynamoDB source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`initialPositionInStream`|InitialPositionInStream|false|LATEST|The position where the connector starts from.

    Below are the available options:

  • `AT_TIMESTAMP`: start from the record at or after the specified timestamp.

  • `LATEST`: start after the most recent data record.

  • `TRIM_HORIZON`: start from the oldest available data record.
  • +`startAtTime`|Date|false|" " (empty string)|If set to `AT_TIMESTAMP`, it specifies the point in time to start consumption. +`applicationName`|String|false|Pulsar IO connector|The name of the KCL application. Must be unique, as it is used to define the table name for the dynamo table used for state tracking.

    By default, the application name is included in the user agent string used to make AWS requests. This can assist with troubleshooting, for example, distinguish requests made by separate connector instances. +`checkpointInterval`|long|false|60000|The frequency of the KCL checkpoint in milliseconds. +`backoffTime`|long|false|3000|The amount of time to delay between requests when the connector encounters a throttling exception from AWS Kinesis in milliseconds. +`numRetries`|int|false|3|The number of re-attempts when the connector encounters an exception while trying to set a checkpoint. +`receiveQueueSize`|int|false|1000|The maximum number of AWS records that can be buffered inside the connector.

    Once the `receiveQueueSize` is reached, the connector does not consume any messages from Kinesis until some messages in the queue are successfully consumed. +`dynamoEndpoint`|String|false|" " (empty string)|The Dynamo end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`cloudwatchEndpoint`|String|false|" " (empty string)|The Cloudwatch end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsEndpoint`|String|false|" " (empty string)|The DynamoDB Streams end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsDynamodbStreamArn`|String|true|" " (empty string)|The DynamoDB stream arn. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    `awsCredentialProviderPlugin` has the following built-in plugins:

  • `org.apache.pulsar.io.kinesis.AwsDefaultProviderChainPlugin`:
    this plugin uses the default AWS provider chain.
    For more information, see [using the default credential provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default).

  • `org.apache.pulsar.io.kinesis.STSAssumeRoleProviderPlugin`:
    this plugin takes a configuration via the `awsCredentialPluginParam` that describes a role to assume when running the KCL.
    **JSON configuration example**
    `{"roleArn": "arn...", "roleSessionName": "name"}`

    `awsCredentialPluginName` is a factory class which creates an AWSCredentialsProvider that is used by Kinesis sink.

    If `awsCredentialPluginName` is set to empty, the Kinesis sink creates a default AWSCredentialsProvider which accepts a JSON map of credentials in `awsCredentialPluginParam`.
  • +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Example + +Before using the DynamoDB source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsDynamodbStreamArn": "arn:aws:dynamodb:us-west-2:111122223333:table/TestTable/stream/2015-05-11T21:21:33.291", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "applicationName": "My test application", + "checkpointInterval": "30000", + "backoffTime": "4000", + "numRetries": "3", + "receiveQueueSize": 2000, + "initialPositionInStream": "TRIM_HORIZON", + "startAtTime": "2019-03-05T19:28:58.000Z" + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsDynamodbStreamArn: "arn:aws:dynamodb:us-west-2:111122223333:table/TestTable/stream/2015-05-11T21:21:33.291" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + applicationName: "My test application" + checkpointInterval: 30000 + backoffTime: 4000 + numRetries: 3 + receiveQueueSize: 2000 + initialPositionInStream: "TRIM_HORIZON" + startAtTime: "2019-03-05T19:28:58.000Z" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-elasticsearch-sink.md b/site2/website/versioned_docs/version-2.9.x/io-elasticsearch-sink.md new file mode 100644 index 0000000000000..b5757b3094a9a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-elasticsearch-sink.md @@ -0,0 +1,242 @@ +--- +id: io-elasticsearch-sink +title: Elasticsearch sink connector +sidebar_label: "Elasticsearch sink connector" +original_id: io-elasticsearch-sink +--- + +The Elasticsearch sink connector pulls messages from Pulsar topics and persists the messages to indexes. 
+ + +## Feature + +### Handle data + +Since Pulsar 2.9.0, the Elasticsearch sink connector has the following ways of +working. You can choose one of them. + +Name | Description +---|---| +Raw processing | The sink reads from topics and passes the raw content to Elasticsearch.

    This is the **default** behavior.

    Raw processing was already available **in Pulsar 2.8.x**. +Schema aware | The sink uses the schema and handles AVRO, JSON, and KeyValue schema types while mapping the content to the Elasticsearch document.

    If you set `schemaEnable` to `true`, the sink interprets the contents of the message and you can define a **primary key** that is in turn used as the special `_id` field on Elasticsearch. +

    This allows you to perform `UPDATE`, `INSERT`, and `DELETE` operations +to Elasticsearch driven by the logical primary key of the message.

    This +is very useful in a typical Change Data Capture scenario in which you follow the +changes on your database, write them to Pulsar (using the Debezium adapter for +instance), and then you write to Elasticsearch.

    You configure the +mapping of the primary key using the `primaryFields` configuration +entry.

    The `DELETE` operation can be performed when the primary key is +not empty and the remaining value is empty. Use the `nullValueAction` to +configure this behaviour. The default configuration simply ignores such empty +values. + +### Map multiple indexes + +Since Pulsar 2.9.0, the `indexName` property is no longer required. If you omit it, the sink writes to an index named after the Pulsar topic name. + +### Enable bulk writes + +Since Pulsar 2.9.0, you can use bulk writes by setting the `bulkEnabled` property to `true`. + +### Enable secure connections via TLS + +Since Pulsar 2.9.0, you can enable secure connections with TLS. + +## Configuration + +The configuration of the Elasticsearch sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `elasticSearchUrl` | String| true |" " (empty string)| The URL of elastic search cluster to which the connector connects. | +| `indexName` | String| true |" " (empty string)| The index name to which the connector writes messages. | +| `schemaEnable` | Boolean | false | false | Turn on the Schema Aware mode. | +| `createIndexIfNeeded` | Boolean | false | false | Manage index if missing. | +| `maxRetries` | Integer | false | 1 | The maximum number of retries for elasticsearch requests. Use -1 to disable it. | +| `retryBackoffInMs` | Integer | false | 100 | The base time to wait when retrying an Elasticsearch request (in milliseconds). | +| `maxRetryTimeInSec` | Integer| false | 86400 | The maximum retry time interval in seconds for retrying an elasticsearch request. | +| `bulkEnabled` | Boolean | false | false | Enable the elasticsearch bulk processor to flush write requests based on the number or size of requests, or after a given period. | +| `bulkActions` | Integer | false | 1000 | The maximum number of actions per elasticsearch bulk request. Use -1 to disable it. 
| +| `bulkSizeInMb` | Integer | false |5 | The maximum size in megabytes of elasticsearch bulk requests. Use -1 to disable it. | +| `bulkConcurrentRequests` | Integer | false | 0 | The maximum number of in flight elasticsearch bulk requests. The default 0 allows the execution of a single request. A value of 1 means 1 concurrent request is allowed to be executed while accumulating new bulk requests. | +| `bulkFlushIntervalInMs` | Integer | false | -1 | The maximum period of time to wait for flushing pending writes when bulk writes are enabled. Default is -1 meaning not set. | +| `compressionEnabled` | Boolean | false |false | Enable elasticsearch request compression. | +| `connectTimeoutInMs` | Integer | false |5000 | The elasticsearch client connection timeout in milliseconds. | +| `connectionRequestTimeoutInMs` | Integer | false |1000 | The time in milliseconds for getting a connection from the elasticsearch connection pool. | +| `connectionIdleTimeoutInMs` | Integer | false |5 | Idle connection timeout to prevent a read timeout. | +| `keyIgnore` | Boolean | false |true | Whether to ignore the record key to build the Elasticsearch document `_id`. If primaryFields is defined, the connector extract the primary fields from the payload to build the document `_id` If no primaryFields are provided, elasticsearch auto generates a random document `_id`. | +| `primaryFields` | String | false | "id" | The comma separated ordered list of field names used to build the Elasticsearch document `_id` from the record value. If this list is a singleton, the field is converted as a string. If this list has 2 or more fields, the generated `_id` is a string representation of a JSON array of the field values. | +| `nullValueAction` | enum (IGNORE,DELETE,FAIL) | false | IGNORE | How to handle records with null values, possible options are IGNORE, DELETE or FAIL. Default is IGNORE the message. 
| +| `malformedDocAction` | enum (IGNORE,WARN,FAIL) | false | FAIL | How to handle elasticsearch rejected documents due to some malformation. Possible options are IGNORE, WARN or FAIL. Default is to FAIL the Elasticsearch document. | +| `stripNulls` | Boolean | false |true | If stripNulls is false, elasticsearch _source includes 'null' for empty fields (for example {"foo": null}), otherwise null fields are stripped. | +| `socketTimeoutInMs` | Integer | false |60000 | The socket timeout in milliseconds waiting to read the elasticsearch response. | +| `typeName` | String | false | "_doc" | The type name to which the connector writes messages.

    The value should be set explicitly to a valid type name other than "_doc" for Elasticsearch versions before 6.2, and left at the default otherwise. | +| `indexNumberOfShards` | int| false |1| The number of shards of the index. | +| `indexNumberOfReplicas` | int| false |1 | The number of replicas of the index. | +| `username` | String| false |" " (empty string)| The username used by the connector to connect to the elastic search cluster.

    If `username` is set, then `password` should also be provided. | +| `password` | String| false | " " (empty string)|The password used by the connector to connect to the elastic search cluster.

    If `username` is set, then `password` should also be provided. | +| `ssl` | ElasticSearchSslConfig | false | | Configuration for TLS encrypted communication | + +### Definition of ElasticSearchSslConfig structure: + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `enabled` | Boolean| false | false | Enable SSL/TLS. | +| `hostnameVerification` | Boolean| false | true | Whether or not to validate node hostnames when using SSL. | +| `truststorePath` | String| false |" " (empty string)| The path to the truststore file. | +| `truststorePassword` | String| false |" " (empty string)| Truststore password. | +| `keystorePath` | String| false |" " (empty string)| The path to the keystore file. | +| `keystorePassword` | String| false |" " (empty string)| Keystore password. | +| `cipherSuites` | String| false |" " (empty string)| SSL/TLS cipher suites. | +| `protocols` | String| false |"TLSv1.2" | Comma separated list of enabled SSL/TLS protocols. | + +## Example + +Before using the Elasticsearch sink connector, you need to create a configuration file through one of the following methods. + +### Configuration + +#### For Elasticsearch After 6.2 + +* JSON + + ```json + + { + "elasticSearchUrl": "http://localhost:9200", + "indexName": "my_index", + "username": "scooby", + "password": "doobie" + } + + ``` + +* YAML + + ```yaml + + configs: + elasticSearchUrl: "http://localhost:9200" + indexName: "my_index" + username: "scooby" + password: "doobie" + + ``` + +#### For Elasticsearch Before 6.2 + +* JSON + + ```json + + { + "elasticSearchUrl": "http://localhost:9200", + "indexName": "my_index", + "typeName": "doc", + "username": "scooby", + "password": "doobie" + } + + ``` + +* YAML + + ```yaml + + configs: + elasticSearchUrl: "http://localhost:9200" + indexName: "my_index" + typeName: "doc" + username: "scooby" + password: "doobie" + + ``` + +### Usage + +1. Start a single node Elasticsearch cluster. 
+ + ```bash + + $ docker run -p 9200:9200 -p 9300:9300 \ + -e "discovery.type=single-node" \ + docker.elastic.co/elasticsearch/elasticsearch:7.13.3 + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + + Make sure the NAR file is available at `connectors/pulsar-io-elastic-search-@pulsar:version@.nar`. + +3. Start the Pulsar Elasticsearch connector in local run mode using one of the following methods. + * Use the **JSON** configuration as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-elastic-search-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name elasticsearch-test-sink \ + --sink-config '{"elasticSearchUrl":"http://localhost:9200","indexName": "my_index","username": "scooby","password": "doobie"}' \ + --inputs elasticsearch_test + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-elastic-search-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name elasticsearch-test-sink \ + --sink-config-file elasticsearch-sink.yml \ + --inputs elasticsearch_test + + ``` + +4. Publish records to the topic. + + ```bash + + $ bin/pulsar-client produce elasticsearch_test --messages "{\"a\":1}" + + ``` + +5. Check documents in Elasticsearch. + + * refresh the index + + ```bash + + $ curl -s http://localhost:9200/my_index/_refresh + + ``` + + + * search documents + + ```bash + + $ curl -s http://localhost:9200/my_index/_search + + ``` + + You can see the record that published earlier has been successfully written into Elasticsearch. 
+ + ```json + + {"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},"hits":{"total":{"value":1,"relation":"eq"},"max_score":1.0,"hits":[{"_index":"my_index","_type":"_doc","_id":"FSxemm8BLjG_iC0EeTYJ","_score":1.0,"_source":{"a":1}}]}} + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-file-source.md b/site2/website/versioned_docs/version-2.9.x/io-file-source.md new file mode 100644 index 0000000000000..e9d710cce65e8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-file-source.md @@ -0,0 +1,160 @@ +--- +id: io-file-source +title: File source connector +sidebar_label: "File source connector" +original_id: io-file-source +--- + +The File source connector pulls messages from files in directories and persists the messages to Pulsar topics. + +## Configuration + +The configuration of the File source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `inputDirectory` | String|true | No default value|The input directory to pull files. | +| `recurse` | Boolean|false | true | Whether to pull files from subdirectory or not.| +| `keepFile` |Boolean|false | false | If set to true, the file is not deleted after it is processed, which means the file can be picked up continually. | +| `fileFilter` | String|false| [^\\.].* | The file whose name matches the given regular expression is picked up. | +| `pathFilter` | String |false | NULL | If `recurse` is set to true, the subdirectory whose path matches the given regular expression is scanned. | +| `minimumFileAge` | Integer|false | 0 | The minimum age that a file can be processed.

    Any file younger than `minimumFileAge` (according to the last modification date) is ignored. | +| `maximumFileAge` | Long|false |Long.MAX_VALUE | The maximum age that a file can be processed.

    Any file older than `maximumFileAge` (according to last modification date) is ignored. | +| `minimumSize` |Integer| false |1 | The minimum size (in bytes) that a file can be processed. | +| `maximumSize` | Double|false |Double.MAX_VALUE| The maximum size (in bytes) that a file can be processed. | +| `ignoreHiddenFiles` |Boolean| false | true| Whether the hidden files should be ignored or not. | +| `pollingInterval`|Long | false | 10000L | Indicates how long to wait before performing a directory listing. | +| `numWorkers` | Integer | false | 1 | The number of worker threads that process files.

    This allows you to process a larger number of files concurrently.

    However, setting this to a value greater than 1 causes the data from multiple files to be mixed in the target topic. | + +### Example + +Before using the File source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "inputDirectory": "/Users/david", + "recurse": true, + "keepFile": true, + "fileFilter": "[^\\.].*", + "pathFilter": "*", + "minimumFileAge": 0, + "maximumFileAge": 9999999999, + "minimumSize": 1, + "maximumSize": 5000000, + "ignoreHiddenFiles": true, + "pollingInterval": 5000, + "numWorkers": 1 + } + + ``` + +* YAML + + ```yaml + + configs: + inputDirectory: "/Users/david" + recurse: true + keepFile: true + fileFilter: "[^\\.].*" + pathFilter: "*" + minimumFileAge: 0 + maximumFileAge: 9999999999 + minimumSize: 1 + maximumSize: 5000000 + ignoreHiddenFiles: true + pollingInterval: 5000 + numWorkers: 1 + + ``` + +## Usage + +Here is an example of using the File source connector. + +1. Pull a Pulsar image. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + ``` + +2. Start Pulsar standalone. + + ```bash + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +3. Create a configuration file _file-connector.yaml_. + + ```yaml + + configs: + inputDirectory: "/opt" + + ``` + +4. Copy the configuration file _file-connector.yaml_ to the container. + + ```bash + + $ docker cp connectors/file-connector.yaml pulsar-standalone:/pulsar/ + + ``` + +5. Download the File source connector. + + ```bash + + $ curl -O https://mirrors.tuna.tsinghua.edu.cn/apache/pulsar/pulsar-{version}/connectors/pulsar-io-file-{version}.nar + + ``` + +6. Start the File source connector.
+ + ```bash + + $ docker exec -it pulsar-standalone /bin/bash + + $ ./bin/pulsar-admin sources localrun \ + --archive /pulsar/pulsar-io-file-{version}.nar \ + --name file-test \ + --destination-topic-name pulsar-file-test \ + --source-config-file /pulsar/file-connector.yaml + + ``` + +7. Start a consumer. + + ```bash + + ./bin/pulsar-client consume -s file-test -n 0 pulsar-file-test + + ``` + +8. Write the message to the file _test.txt_. + + ```bash + + echo "hello world!" > /opt/test.txt + + ``` + + The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello world! + + ``` + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/io-flume-sink.md b/site2/website/versioned_docs/version-2.9.x/io-flume-sink.md new file mode 100644 index 0000000000000..b2ace53702f8c --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-flume-sink.md @@ -0,0 +1,56 @@ +--- +id: io-flume-sink +title: Flume sink connector +sidebar_label: "Flume sink connector" +original_id: io-flume-sink +--- + +The Flume sink connector pulls messages from Pulsar topics to logs. + +## Configuration + +The configuration of the Flume sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. + +### Example + +Before using the Flume sink connector, you need to create a configuration file through one of the following methods. 
+ +> For more information about the `sink.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/sink.conf). + +* JSON + + ```json + + { + "name": "a1", + "confFile": "sink.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + + ``` + +* YAML + + ```yaml + + configs: + name: a1 + confFile: sink.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-flume-source.md b/site2/website/versioned_docs/version-2.9.x/io-flume-source.md new file mode 100644 index 0000000000000..b7fd7edad8811 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-flume-source.md @@ -0,0 +1,56 @@ +--- +id: io-flume-source +title: Flume source connector +sidebar_label: "Flume source connector" +original_id: io-flume-source +--- + +The Flume source connector pulls messages from logs to Pulsar topics. + +## Configuration + +The configuration of the Flume source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. + +### Example + +Before using the Flume source connector, you need to create a configuration file through one of the following methods. + +> For more information about the `source.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/source.conf). 
+ +* JSON + + ```json + + { + "name": "a1", + "confFile": "source.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + + ``` + +* YAML + + ```yaml + + configs: + name: a1 + confFile: source.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-hbase-sink.md b/site2/website/versioned_docs/version-2.9.x/io-hbase-sink.md new file mode 100644 index 0000000000000..1737b00fa2680 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-hbase-sink.md @@ -0,0 +1,67 @@ +--- +id: io-hbase-sink +title: HBase sink connector +sidebar_label: "HBase sink connector" +original_id: io-hbase-sink +--- + +The HBase sink connector pulls the messages from Pulsar topics +and persists the messages to HBase tables + +## Configuration + +The configuration of the HBase sink connector has the following properties. + +### Property + +| Name | Type|Default | Required | Description | +|------|---------|----------|-------------|--- +| `hbaseConfigResources` | String|None | false | HBase system configuration `hbase-site.xml` file. | +| `zookeeperQuorum` | String|None | true | HBase system configuration about `hbase.zookeeper.quorum` value. | +| `zookeeperClientPort` | String|2181 | false | HBase system configuration about `hbase.zookeeper.property.clientPort` value. | +| `zookeeperZnodeParent` | String|/hbase | false | HBase system configuration about `zookeeper.znode.parent` value. | +| `tableName` | None |String | true | HBase table, the value is `namespace:tableName`. | +| `rowKeyName` | String|None | true | HBase table rowkey name. | +| `familyName` | String|None | true | HBase table column family name. | +| `qualifierNames` |String| None | true | HBase table column qualifier names. | +| `batchTimeMs` | Long|1000l| false | HBase table operation timeout in milliseconds. | +| `batchSize` | int|200| false | Batch size of updates made to the HBase table. 
| + +### Example + +Before using the HBase sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "hbaseConfigResources": "hbase-site.xml", + "zookeeperQuorum": "localhost", + "zookeeperClientPort": "2181", + "zookeeperZnodeParent": "/hbase", + "tableName": "pulsar_hbase", + "rowKeyName": "rowKey", + "familyName": "info", + "qualifierNames": [ 'name', 'address', 'age'] + } + + ``` + +* YAML + + ```yaml + + configs: + hbaseConfigResources: "hbase-site.xml" + zookeeperQuorum: "localhost" + zookeeperClientPort: "2181" + zookeeperZnodeParent: "/hbase" + tableName: "pulsar_hbase" + rowKeyName: "rowKey" + familyName: "info" + qualifierNames: [ 'name', 'address', 'age'] + + ``` + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/io-hdfs2-sink.md b/site2/website/versioned_docs/version-2.9.x/io-hdfs2-sink.md new file mode 100644 index 0000000000000..4a8527154430d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-hdfs2-sink.md @@ -0,0 +1,64 @@ +--- +id: io-hdfs2-sink +title: HDFS2 sink connector +sidebar_label: "HDFS2 sink connector" +original_id: io-hdfs2-sink +--- + +The HDFS2 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS2 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.

    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY
  • | +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| true, if `compression` is set to `None`. | None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| true | None | The extension added to the files written to HDFS.

    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.

    Setting this property to 1 causes every record to be sent to disk before the record is acked.

    Setting this property to a higher value allows buffering records before flushing them to disk. +| `subdirectoryPattern` | String | false | None | A subdirectory associated with the created time of the sink.
    The pattern is the formatted pattern of `directory`'s subdirectory.

    See [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html) for pattern's syntax. | + +### Example + +Before using the HDFS2 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "fileExtension": ".log", + "compression": "SNAPPY", + "subdirectoryPattern": "yyyy-MM-dd" + } + + ``` + +* YAML + + ```yaml + + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + fileExtension: ".log" + compression: "SNAPPY" + subdirectoryPattern: "yyyy-MM-dd" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-hdfs3-sink.md b/site2/website/versioned_docs/version-2.9.x/io-hdfs3-sink.md new file mode 100644 index 0000000000000..aec065a25db7f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-hdfs3-sink.md @@ -0,0 +1,59 @@ +--- +id: io-hdfs3-sink +title: HDFS3 sink connector +sidebar_label: "HDFS3 sink connector" +original_id: io-hdfs3-sink +--- + +The HDFS3 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS3 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.

    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY
  • | +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| false |None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| false | None| The extension added to the files written to HDFS.

    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.

    Setting this property to 1 causes every record to be sent to disk before the record is acked.

    Setting this property to a higher value allows buffering records before flushing them to disk. + +### Example + +Before using the HDFS3 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "compression": "SNAPPY" + } + + ``` + +* YAML + + ```yaml + + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + compression: "SNAPPY" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-influxdb-sink.md b/site2/website/versioned_docs/version-2.9.x/io-influxdb-sink.md new file mode 100644 index 0000000000000..9382f8c03121c --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-influxdb-sink.md @@ -0,0 +1,119 @@ +--- +id: io-influxdb-sink +title: InfluxDB sink connector +sidebar_label: "InfluxDB sink connector" +original_id: io-influxdb-sink +--- + +The InfluxDB sink connector pulls messages from Pulsar topics +and persists the messages to InfluxDB. + +The InfluxDB sink provides different configurations for InfluxDBv1 and v2 respectively. + +## Configuration + +The configuration of the InfluxDB sink connector has the following properties. + +### Property +#### InfluxDBv2 +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `influxdbUrl` |String| true|" " (empty string) | The URL of the InfluxDB instance. | +| `token` | String|true| " " (empty string) |The authentication token used to authenticate to InfluxDB. | +| `organization` | String| true|" " (empty string) | The InfluxDB organization to write to. | +| `bucket` |String| true | " " (empty string)| The InfluxDB bucket to write to. | +| `precision` | String|false| ns | The timestamp precision for writing data to InfluxDB.

    Below are the available options:
  • ns
  • us
  • ms
  • s
  • | +| `logLevel` | String|false| NONE|The log level for InfluxDB request and response.

    Below are the available options:
  • NONE
  • BASIC
  • HEADERS
  • FULL
  • | +| `gzipEnable` | boolean|false | false | Whether to enable gzip or not. | +| `batchTimeMs` |long|false| 1000L | The InfluxDB operation time in milliseconds. | +| `batchSize` | int|false|200| The batch size of writing to InfluxDB. | + +#### InfluxDBv1 +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `influxdbUrl` |String| true|" " (empty string) | The URL of the InfluxDB instance. | +| `username` | String|false| " " (empty string) |The username used to authenticate to InfluxDB. | +| `password` | String| false|" " (empty string) | The password used to authenticate to InfluxDB. | +| `database` |String| true | " " (empty string)| The InfluxDB to which write messages. | +| `consistencyLevel` | String|false|ONE | The consistency level for writing data to InfluxDB.

    Below are the available options:
  • ALL
  • ANY
  • ONE
  • QUORUM
  • | +| `logLevel` | String|false| NONE|The log level for InfluxDB request and response.

    Below are the available options:
  • NONE
  • BASIC
  • HEADERS
  • FULL
  • | +| `retentionPolicy` | String|false| autogen| The retention policy for InfluxDB. | +| `gzipEnable` | boolean|false | false | Whether to enable gzip or not. | +| `batchTimeMs` |long|false| 1000L | The InfluxDB operation time in milliseconds. | +| `batchSize` | int|false|200| The batch size of writing to InfluxDB. | + +### Example +Before using the InfluxDB sink connector, you need to create a configuration file through one of the following methods. +#### InfluxDBv2 +* JSON + + ```json + + { + "influxdbUrl": "http://localhost:9999", + "organization": "example-org", + "bucket": "example-bucket", + "token": "xxxx", + "precision": "ns", + "logLevel": "NONE", + "gzipEnable": false, + "batchTimeMs": 1000, + "batchSize": 100 + } + + ``` + + +* YAML + + ```yaml + + configs: + influxdbUrl: "http://localhost:9999" + organization: "example-org" + bucket: "example-bucket" + token: "xxxx" + precision: "ns" + logLevel: "NONE" + gzipEnable: false + batchTimeMs: 1000 + batchSize: 100 + + ``` + + +#### InfluxDBv1 + +* JSON + + ```json + + { + "influxdbUrl": "http://localhost:8086", + "database": "test_db", + "consistencyLevel": "ONE", + "logLevel": "NONE", + "retentionPolicy": "autogen", + "gzipEnable": false, + "batchTimeMs": 1000, + "batchSize": 100 + } + + ``` + +* YAML + + ```yaml + + configs: + influxdbUrl: "http://localhost:8086" + database: "test_db" + consistencyLevel: "ONE" + logLevel: "NONE" + retentionPolicy: "autogen" + gzipEnable: false + batchTimeMs: 1000 + batchSize: 100 + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-jdbc-sink.md b/site2/website/versioned_docs/version-2.9.x/io-jdbc-sink.md new file mode 100644 index 0000000000000..77dbb61fccd7e --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-jdbc-sink.md @@ -0,0 +1,157 @@ +--- +id: io-jdbc-sink +title: JDBC sink connector +sidebar_label: "JDBC sink connector" +original_id: io-jdbc-sink +--- + +The JDBC sink connectors allow pulling messages from Pulsar topics +and persists 
the messages to ClickHouse, MariaDB, PostgreSQL, and SQLite. + +> Currently, INSERT, DELETE and UPDATE operations are supported. + +## Configuration + +The configuration of all JDBC sink connectors has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `userName` | String|false | " " (empty string) | The username used to connect to the database specified by `jdbcUrl`.

    **Note: `userName` is case-sensitive.**| +| `password` | String|false | " " (empty string)| The password used to connect to the database specified by `jdbcUrl`.

    **Note: `password` is case-sensitive.**| +| `jdbcUrl` | String|true | " " (empty string) | The JDBC URL of the database to which the connector connects. | +| `tableName` | String|true | " " (empty string) | The name of the table to which the connector writes. | +| `nonKey` | String|false | " " (empty string) | A comma-separated list contains the fields used in updating events. | +| `key` | String|false | " " (empty string) | A comma-separated list contains the fields used in `where` condition of updating and deleting events. | +| `timeoutMs` | int| false|500 | The JDBC operation timeout in milliseconds. | +| `batchSize` | int|false | 200 | The batch size of updates made to the database. | + +### Example for ClickHouse + +* JSON + + ```json + + { + "userName": "clickhouse", + "password": "password", + "jdbcUrl": "jdbc:clickhouse://localhost:8123/pulsar_clickhouse_jdbc_sink", + "tableName": "pulsar_clickhouse_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-clickhouse-sink" + topicName: "persistent://public/default/jdbc-clickhouse-topic" + sinkType: "jdbc-clickhouse" + configs: + userName: "clickhouse" + password: "password" + jdbcUrl: "jdbc:clickhouse://localhost:8123/pulsar_clickhouse_jdbc_sink" + tableName: "pulsar_clickhouse_jdbc_sink" + + ``` + +### Example for MariaDB + +* JSON + + ```json + + { + "userName": "mariadb", + "password": "password", + "jdbcUrl": "jdbc:mariadb://localhost:3306/pulsar_mariadb_jdbc_sink", + "tableName": "pulsar_mariadb_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-mariadb-sink" + topicName: "persistent://public/default/jdbc-mariadb-topic" + sinkType: "jdbc-mariadb" + configs: + userName: "mariadb" + password: "password" + jdbcUrl: "jdbc:mariadb://localhost:3306/pulsar_mariadb_jdbc_sink" + tableName: "pulsar_mariadb_jdbc_sink" + + ``` + +### Example for PostgreSQL + +Before using the JDBC PostgreSQL sink connector, you 
need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "userName": "postgres", + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "tableName": "pulsar_postgres_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-postgres-sink" + topicName: "persistent://public/default/jdbc-postgres-topic" + sinkType: "jdbc-postgres" + configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink" + tableName: "pulsar_postgres_jdbc_sink" + + ``` + +For more information on **how to use this JDBC sink connector**, see [connect Pulsar to PostgreSQL](io-quickstart.md#connect-pulsar-to-postgresql). + +### Example for SQLite + +* JSON + + ```json + + { + "jdbcUrl": "jdbc:sqlite:db.sqlite", + "tableName": "pulsar_sqlite_jdbc_sink" + } + + ``` + +* YAML + + ```yaml + + tenant: "public" + namespace: "default" + name: "jdbc-sqlite-sink" + topicName: "persistent://public/default/jdbc-sqlite-topic" + sinkType: "jdbc-sqlite" + configs: + jdbcUrl: "jdbc:sqlite:db.sqlite" + tableName: "pulsar_sqlite_jdbc_sink" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-kafka-sink.md b/site2/website/versioned_docs/version-2.9.x/io-kafka-sink.md new file mode 100644 index 0000000000000..09dad4ce70bac --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-kafka-sink.md @@ -0,0 +1,72 @@ +--- +id: io-kafka-sink +title: Kafka sink connector +sidebar_label: "Kafka sink connector" +original_id: io-kafka-sink +--- + +The Kafka sink connector pulls messages from Pulsar topics and persists the messages +to Kafka topics. + +This guide explains how to configure and use the Kafka sink connector. + +## Configuration + +The configuration of the Kafka sink connector has the following parameters. 
+ +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. | +|`acks`|String|true|" " (empty string) |The number of acknowledgments that the producer requires the leader to receive before a request completes.
    This controls the durability of the sent records. +|`batchSize`|long|false|16384L|The batch size that a Kafka producer attempts to batch records together before sending them to brokers. +|`maxRequestSize`|long|false|1048576L|The maximum size of a Kafka request in bytes. +|`topic`|String|true|" " (empty string) |The Kafka topic which receives messages from Pulsar. +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringSerializer | The serializer class for Kafka producers to serialize keys. +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArraySerializer | The serializer class for Kafka producers to serialize values.

    The serializer is set by a specific implementation of [`KafkaAbstractSink`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java). +|`producerConfigProperties`|Map|false|" " (empty string)|The producer configuration properties to be passed to producers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. + + +### Example + +Before using the Kafka sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "bootstrapServers": "localhost:6667", + "topic": "test", + "acks": "1", + "batchSize": "16384", + "maxRequestSize": "1048576", + "producerConfigProperties": + { + "client.id": "test-pulsar-producer", + "security.protocol": "SASL_PLAINTEXT", + "sasl.mechanism": "GSSAPI", + "sasl.kerberos.service.name": "kafka", + "acks": "all" + } + } + + ``` + +* YAML + + ```yaml + configs: + bootstrapServers: "localhost:6667" + topic: "test" + acks: "1" + batchSize: "16384" + maxRequestSize: "1048576" + producerConfigProperties: + client.id: "test-pulsar-producer" + security.protocol: "SASL_PLAINTEXT" + sasl.mechanism: "GSSAPI" + sasl.kerberos.service.name: "kafka" + acks: "all" + ``` diff --git a/site2/website/versioned_docs/version-2.9.x/io-kafka-source.md b/site2/website/versioned_docs/version-2.9.x/io-kafka-source.md new file mode 100644 index 0000000000000..53448699e21b4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-kafka-source.md @@ -0,0 +1,226 @@ +--- +id: io-kafka-source +title: Kafka source connector +sidebar_label: "Kafka source connector" +original_id: io-kafka-source +--- + +The Kafka source connector pulls messages from Kafka topics and persists the messages +to Pulsar topics. + +This guide explains how to configure and use the Kafka source connector. + +## Configuration + +The configuration of the Kafka source connector has the following properties. + +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. 
| +| `groupId` |String| true | " " (empty string) | A unique string that identifies the group of consumer processes to which this consumer belongs. | +| `fetchMinBytes` | long|false | 1 | The minimum byte expected for each fetch response. | +| `autoCommitEnabled` | boolean |false | true | If set to true, the consumer's offset is periodically committed in the background.

    This committed offset is used when the process fails as the position from which a new consumer begins. | +| `autoCommitIntervalMs` | long|false | 5000 | The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if `autoCommitEnabled` is set to true. | +| `heartbeatIntervalMs` | long| false | 3000 | The interval between heartbeats to the consumer when using Kafka's group management facilities.

    **Note: `heartbeatIntervalMs` must be smaller than `sessionTimeoutMs`**.| +| `sessionTimeoutMs` | long|false | 30000 | The timeout used to detect consumer failures when using Kafka's group management facility. | +| `topic` | String|true | " " (empty string)| The Kafka topic which sends messages to Pulsar. | +| `consumerConfigProperties` | Map| false | " " (empty string) | The consumer configuration properties to be passed to consumers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. | +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringDeserializer | The deserializer class for Kafka consumers to deserialize keys.
    The deserializer is set by a specific implementation of [`KafkaAbstractSource`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java). +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArrayDeserializer | The deserializer class for Kafka consumers to deserialize values. +| `autoOffsetReset` | String | false | "earliest" | The default offset reset policy. | + +### Schema Management + +This Kafka source connector applies the schema to the topic depending on the data type that is present on the Kafka topic. +You can detect the data type from the `keyDeserializationClass` and `valueDeserializationClass` configuration parameters. + +If the `valueDeserializationClass` is `org.apache.kafka.common.serialization.StringDeserializer`, you can set Schema.STRING() as schema type on the Pulsar topic. + +If `valueDeserializationClass` is `io.confluent.kafka.serializers.KafkaAvroDeserializer`, Pulsar downloads the AVRO schema from the Confluent Schema Registry® +and sets it properly on the Pulsar topic. + +In this case, you need to set `schema.registry.url` inside of the `consumerConfigProperties` configuration entry +of the source. + +If `keyDeserializationClass` is not `org.apache.kafka.common.serialization.StringDeserializer`, it means +that you do not have a String as key and the Kafka Source uses the KeyValue schema type with the SEPARATED encoding. + +Pulsar supports AVRO format for keys. 
+ +In this case, you can have a Pulsar topic with the following properties: +- Schema: KeyValue schema with SEPARATED encoding +- Key: the content of key of the Kafka message (base64 encoded) +- Value: the content of value of the Kafka message +- KeySchema: the schema detected from `keyDeserializationClass` +- ValueSchema: the schema detected from `valueDeserializationClass` + +Topic compaction and partition routing use the Pulsar key, that contains the Kafka key, and so they are driven by the same value that you have on Kafka. + +When you consume data from Pulsar topics, you can use the `KeyValue` schema. In this way, you can decode the data properly. +If you want to access the raw key, you can use the `Message#getKeyBytes()` API. + +### Example + +Before using the Kafka source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "bootstrapServers": "pulsar-kafka:9092", + "groupId": "test-pulsar-io", + "topic": "my-topic", + "sessionTimeoutMs": "10000", + "autoCommitEnabled": false + } + + ``` + +* YAML + + ```yaml + + configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: false + + ``` + +## Usage + +Here is an example of using the Kafka source connector with the configuration file as shown previously. + +1. Download a Kafka client and a Kafka connector. + + ```bash + + $ wget https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/0.10.2.1/kafka-clients-0.10.2.1.jar + + $ wget https://archive.apache.org/dist/pulsar/pulsar-2.4.0/connectors/pulsar-io-kafka-2.4.0.nar + + ``` + +2. Create a network. + + ```bash + + $ docker network create kafka-pulsar + + ``` + +3. Pull a ZooKeeper image and start ZooKeeper. + + ```bash + + $ docker pull wurstmeister/zookeeper + + $ docker run -d -it -p 2181:2181 --name pulsar-kafka-zookeeper --network kafka-pulsar wurstmeister/zookeeper + + ``` + +4. 
Pull a Kafka image and start Kafka. + + ```bash + + $ docker pull wurstmeister/kafka:2.11-1.0.2 + + $ docker run -d -it --network kafka-pulsar -p 6667:6667 -p 9092:9092 -e KAFKA_ADVERTISED_HOST_NAME=pulsar-kafka -e KAFKA_ZOOKEEPER_CONNECT=pulsar-kafka-zookeeper:2181 --name pulsar-kafka wurstmeister/kafka:2.11-1.0.2 + + ``` + +5. Pull a Pulsar image and start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:@pulsar:version@ + + $ docker run -d -it --network kafka-pulsar -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-kafka-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone + + ``` + +6. Create a producer file _kafka-producer.py_. + + ```python + + from kafka import KafkaProducer + producer = KafkaProducer(bootstrap_servers='pulsar-kafka:9092') + future = producer.send('my-topic', b'hello world') + future.get() + + ``` + +7. Create a consumer file _pulsar-client.py_. + + ```python + + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', subscription_name='my-aa') + + while True: + msg = consumer.receive() + print msg + print dir(msg) + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + + ``` + +8. Copy the following files to Pulsar. + + ```bash + + $ docker cp pulsar-io-kafka-@pulsar:version@.nar pulsar-kafka-standalone:/pulsar + $ docker cp kafkaSourceConfig.yaml pulsar-kafka-standalone:/pulsar/conf + $ docker cp pulsar-client.py pulsar-kafka-standalone:/pulsar/ + $ docker cp kafka-producer.py pulsar-kafka-standalone:/pulsar/ + + ``` + +9. Open a new terminal window and start the Kafka source connector in local run mode. 
+ + ```bash + + $ docker exec -it pulsar-kafka-standalone /bin/bash + + $ ./bin/pulsar-admin source localrun \ + --archive ./pulsar-io-kafka-@pulsar:version@.nar \ + --classname org.apache.pulsar.io.kafka.KafkaBytesSource \ + --tenant public \ + --namespace default \ + --name kafka \ + --destination-topic-name my-topic \ + --source-config-file ./conf/kafkaSourceConfig.yaml \ + --parallelism 1 + + ``` + +10. Open a new terminal window and run the consumer. + + ```bash + + $ docker exec -it pulsar-kafka-standalone /bin/bash + + $ pip install kafka-python + + $ python3 kafka-producer.py + + ``` + + The following information appears on the consumer terminal window. + + ```bash + + Received message: 'hello world' + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-kinesis-sink.md b/site2/website/versioned_docs/version-2.9.x/io-kinesis-sink.md new file mode 100644 index 0000000000000..153587dcfc783 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-kinesis-sink.md @@ -0,0 +1,80 @@ +--- +id: io-kinesis-sink +title: Kinesis sink connector +sidebar_label: "Kinesis sink connector" +original_id: io-kinesis-sink +--- + +The Kinesis sink connector pulls data from Pulsar and persists data into Amazon Kinesis. + +## Configuration + +The configuration of the Kinesis sink connector has the following property. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`messageFormat`|MessageFormat|true|ONLY_RAW_PAYLOAD|Message format in which Kinesis sink converts Pulsar messages and publishes to Kinesis streams.

    Below are the available options:

  • `ONLY_RAW_PAYLOAD`: Kinesis sink directly publishes Pulsar message payload as a message into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_JSON`: Kinesis sink creates a JSON payload with Pulsar message payload, properties and encryptionCtx, and publishes JSON payload into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_FB`: Kinesis sink creates a flatbuffer serialized payload with Pulsar message payload, properties and encryptionCtx, and publishes flatbuffer payload into the configured Kinesis stream.
  • +`retainOrdering`|boolean|false|false|Whether Pulsar connectors retain ordering when moving messages from Pulsar to Kinesis. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    It is a factory class which creates an AWSCredentialsProvider that is used by Kinesis sink.

    If it is empty, the Kinesis sink creates a default AWSCredentialsProvider which accepts json-map of credentials in `awsCredentialPluginParam`. +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Built-in plugins + +The following are built-in `AwsCredentialProviderPlugin` plugins: + +* `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin` + + This plugin takes no configuration, it uses the default AWS provider chain. + + For more information, see [AWS documentation](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default). + +* `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin` + + This plugin takes a configuration (via the `awsCredentialPluginParam`) that describes a role to assume when running the KCL. + + This configuration takes the form of a small json document like: + + ```json + + {"roleArn": "arn...", "roleSessionName": "name"} + + ``` + +### Example + +Before using the Kinesis sink connector, you need to create a configuration file through one of the following methods. 
+ +* JSON + + ```json + + { + "awsEndpoint": "some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "messageFormat": "ONLY_RAW_PAYLOAD", + "retainOrdering": "true" + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + messageFormat: "ONLY_RAW_PAYLOAD" + retainOrdering: "true" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-kinesis-source.md b/site2/website/versioned_docs/version-2.9.x/io-kinesis-source.md new file mode 100644 index 0000000000000..0d07eefc3703b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-kinesis-source.md @@ -0,0 +1,81 @@ +--- +id: io-kinesis-source +title: Kinesis source connector +sidebar_label: "Kinesis source connector" +original_id: io-kinesis-source +--- + +The Kinesis source connector pulls data from Amazon Kinesis and persists data into Pulsar. + +This connector uses the [Kinesis Consumer Library](https://github.com/awslabs/amazon-kinesis-client) (KCL) to do the actual consuming of messages. The KCL uses DynamoDB to track state for consumers. + +> Note: currently, the Kinesis source connector only supports raw messages. If you use KMS encrypted messages, the encrypted messages are sent to downstream. This connector will support decrypting messages in the future release. + + +## Configuration + +The configuration of the Kinesis source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`initialPositionInStream`|InitialPositionInStream|false|LATEST|The position where the connector starts from.

    Below are the available options:

  • `AT_TIMESTAMP`: start from the record at or after the specified timestamp.

  • `LATEST`: start after the most recent data record.

  • `TRIM_HORIZON`: start from the oldest available data record.
  • +`startAtTime`|Date|false|" " (empty string)|If set to `AT_TIMESTAMP`, it specifies the point in time to start consumption. +`applicationName`|String|false|Pulsar IO connector|The name of the Amazon Kinesis application.

    By default, the application name is included in the user agent string used to make AWS requests. This can assist with troubleshooting, for example, distinguish requests made by separate connector instances. +`checkpointInterval`|long|false|60000|The frequency of the Kinesis stream checkpoint in milliseconds. +`backoffTime`|long|false|3000|The amount of time to delay between requests when the connector encounters a throttling exception from AWS Kinesis in milliseconds. +`numRetries`|int|false|3|The number of re-attempts when the connector encounters an exception while trying to set a checkpoint. +`receiveQueueSize`|int|false|1000|The maximum number of AWS records that can be buffered inside the connector.

    Once the `receiveQueueSize` is reached, the connector does not consume any messages from Kinesis until some messages in the queue are successfully consumed. +`dynamoEndpoint`|String|false|" " (empty string)|The Dynamo end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`cloudwatchEndpoint`|String|false|" " (empty string)|The Cloudwatch end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`useEnhancedFanOut`|boolean|false|true|If set to true, it uses Kinesis enhanced fan-out.

    If set to false, it uses polling. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:AwsCredentialProviderPlugin:/pulsar-io/aws/src/main/java/org/apache/pulsar/io/aws/AwsCredentialProviderPlugin.java}.

    `awsCredentialProviderPlugin` has the following built-in plugins:

  • `org.apache.pulsar.io.aws.AwsDefaultProviderChainPlugin`:
    this plugin uses the default AWS provider chain.
    For more information, see [using the default credential provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default).

  • `org.apache.pulsar.io.aws.STSAssumeRoleProviderPlugin`:
    this plugin takes a configuration via the `awsCredentialPluginParam` that describes a role to assume when running the KCL.
    **JSON configuration example**
    `{"roleArn": "arn...", "roleSessionName": "name"}`

    `awsCredentialPluginName` is a factory class which creates an AWSCredentialsProvider that is used by Kinesis source.

    If `awsCredentialPluginName` is set to empty, the Kinesis source creates a default AWSCredentialsProvider which accepts json-map of credentials in `awsCredentialPluginParam`.
  • +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Example + +Before using the Kinesis source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "applicationName": "My test application", + "checkpointInterval": "30000", + "backoffTime": "4000", + "numRetries": "3", + "receiveQueueSize": 2000, + "initialPositionInStream": "TRIM_HORIZON", + "startAtTime": "2019-03-05T19:28:58.000Z" + } + + ``` + +* YAML + + ```yaml + + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + applicationName: "My test application" + checkpointInterval: 30000 + backoffTime: 4000 + numRetries: 3 + receiveQueueSize: 2000 + initialPositionInStream: "TRIM_HORIZON" + startAtTime: "2019-03-05T19:28:58.000Z" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-mongo-sink.md b/site2/website/versioned_docs/version-2.9.x/io-mongo-sink.md new file mode 100644 index 0000000000000..30c15a6c28093 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-mongo-sink.md @@ -0,0 +1,56 @@ +--- +id: io-mongo-sink +title: MongoDB sink connector +sidebar_label: "MongoDB sink connector" +original_id: io-mongo-sink +--- + +The MongoDB sink connector pulls messages from Pulsar topics +and persists the messages to collections. + +## Configuration + +The configuration of the MongoDB sink connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `mongoUri` | String| true| " " (empty string) | The MongoDB URI to which the connector connects.

    For more information, see [connection string URI format](https://docs.mongodb.com/manual/reference/connection-string/). | +| `database` | String| true| " " (empty string)| The database name to which the collection belongs. | +| `collection` | String| true| " " (empty string)| The collection name to which the connector writes messages. | +| `batchSize` | int|false|100 | The batch size of writing messages to collections. | +| `batchTimeMs` |long|false|1000| The batch operation interval in milliseconds. | + + +### Example + +Before using the Mongo sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "mongoUri": "mongodb://localhost:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": "2", + "batchTimeMs": "500" + } + + ``` + +* YAML + + ```yaml + + configs: + mongoUri: "mongodb://localhost:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-netty-source.md b/site2/website/versioned_docs/version-2.9.x/io-netty-source.md new file mode 100644 index 0000000000000..e1ec8d863115b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-netty-source.md @@ -0,0 +1,241 @@ +--- +id: io-netty-source +title: Netty source connector +sidebar_label: "Netty source connector" +original_id: io-netty-source +--- + +The Netty source connector opens a port that accepts incoming data via the configured network protocol +and publish it to user-defined Pulsar topics. + +This connector can be used in a containerized (for example, k8s) deployment. Otherwise, if the connector is running in process or thread mode, the instance may be conflicting on listening to ports. + +## Configuration + +The configuration of the Netty source connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `type` |String| true |tcp | The network protocol over which data is transmitted to netty.

    Below are the available options:
  • tcp
  • http
  • udp
  • | +| `host` | String|true | 127.0.0.1 | The host name or address on which the source instance listen. | +| `port` | int|true | 10999 | The port on which the source instance listen. | +| `numberOfThreads` |int| true |1 | The number of threads of Netty TCP server to accept incoming connections and handle the traffic of accepted connections. | + + +### Example + +Before using the Netty source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "type": "tcp", + "host": "127.0.0.1", + "port": "10911", + "numberOfThreads": "1" + } + + ``` + +* YAML + + ```yaml + + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +## Usage + +The following examples show how to use the Netty source connector with TCP and HTTP. + +### TCP + +1. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + + ``` + +4. Download the Netty source connector. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + + ``` + +5. Start the Netty source connector. + + ```bash + + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + + ``` + +6. 
Consume data. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + + ``` + +7. Open another terminal window to send data to the Netty source. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ apt-get update + + $ apt-get -y install telnet + + $ root@1d19327b2c67:/pulsar# telnet 127.0.0.1 10999 + Trying 127.0.0.1... + Connected to 127.0.0.1. + Escape character is '^]'. + hello + world + + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello + + ----- got message ----- + world + + ``` + +### HTTP + +1. Start Pulsar standalone. + + ```bash + + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + + configs: + type: "http" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + + ``` + +4. Download the Netty source connector. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + + ``` + +5. Start the Netty source connector. + + ```bash + + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + + ``` + +6. Consume data. 
+ + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + + ``` + +7. Open another terminal window to send data to the Netty source. + + ```bash + + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ curl -X POST --data 'hello, world!' http://127.0.0.1:10999/ + + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + + ----- got message ----- + hello, world! + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-nsq-source.md b/site2/website/versioned_docs/version-2.9.x/io-nsq-source.md new file mode 100644 index 0000000000000..b61e7e100c22e --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-nsq-source.md @@ -0,0 +1,21 @@ +--- +id: io-nsq-source +title: NSQ source connector +sidebar_label: "NSQ source connector" +original_id: io-nsq-source +--- + +The NSQ source connector receives messages from NSQ topics +and writes messages to Pulsar topics. + +## Configuration + +The configuration of the NSQ source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `lookupds` |String| true | " " (empty string) | A comma-separated list of nsqlookupds to connect to. | +| `topic` | String|true | " " (empty string) | The NSQ topic to transport. | +| `channel` | String |false | pulsar-transport-{$topic} | The channel to consume from on the provided NSQ topic. 
| \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/io-overview.md b/site2/website/versioned_docs/version-2.9.x/io-overview.md new file mode 100644 index 0000000000000..3db5ee34042d3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-overview.md @@ -0,0 +1,164 @@ +--- +id: io-overview +title: Pulsar connector overview +sidebar_label: "Overview" +original_id: io-overview +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Messaging systems are most powerful when you can easily use them with external systems like databases and other messaging systems. + +**Pulsar IO connectors** enable you to easily create, deploy, and manage connectors that interact with external systems, such as [Apache Cassandra](https://cassandra.apache.org), [Aerospike](https://www.aerospike.com), and many others. + + +## Concept + +Pulsar IO connectors come in two types: **source** and **sink**. + +This diagram illustrates the relationship between source, Pulsar, and sink: + +![Pulsar IO diagram](/assets/pulsar-io.png "Pulsar IO connectors (sources and sinks)") + + +### Source + +> Sources **feed data from external systems into Pulsar**. + +Common sources include other messaging systems and firehose-style data pipeline APIs. + +For the complete list of Pulsar built-in source connectors, see [source connector](io-connectors.md#source-connector). + +### Sink + +> Sinks **feed data from Pulsar into external systems**. + +Common sinks include other messaging systems and SQL and NoSQL databases. + +For the complete list of Pulsar built-in sink connectors, see [sink connector](io-connectors.md#sink-connector). + +## Processing guarantee + +Processing guarantees are used to handle errors when writing messages to Pulsar topics. + +> Pulsar connectors and Functions use the **same** processing guarantees as below. 
+ +Delivery semantic | Description +:------------------|:------- +`at-most-once` | Each message sent to a connector is to be **processed once** or **not to be processed**. +`at-least-once` | Each message sent to a connector is to be **processed once** or **more than once**. +`effectively-once` | Each message sent to a connector has **one output associated** with it. + +> Processing guarantees for connectors not just rely on Pulsar guarantee but also **relate to external systems**, that is, **the implementation of source and sink**. + +* Source: Pulsar ensures that writing messages to Pulsar topics respects to the processing guarantees. It is within Pulsar's control. + +* Sink: the processing guarantees rely on the sink implementation. If the sink implementation does not handle retries in an idempotent way, the sink does not respect to the processing guarantees. + +### Set + +When creating a connector, you can set the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +> If `--processing-guarantees` is not specified when creating a connector, the default semantic is `ATLEAST_ONCE`. + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + +````mdx-code-block + + + + +```bash + +$ bin/pulsar-admin sources create \ + --processing-guarantees ATMOST_ONCE \ + # Other source configs + +``` + +For more information about the options of `pulsar-admin sources create`, see [here](reference-connector-admin.md#create). + + + + +```bash + +$ bin/pulsar-admin sinks create \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other sink configs + +``` + +For more information about the options of `pulsar-admin sinks create`, see [here](reference-connector-admin.md#create-1). 
+ + + + +```` + +### Update + +After creating a connector, you can update the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + +````mdx-code-block + + + + +```bash + +$ bin/pulsar-admin sources update \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other source configs + +``` + +For more information about the options of `pulsar-admin sources update`, see [here](reference-connector-admin.md#update). + + + + +```bash + +$ bin/pulsar-admin sinks update \ + --processing-guarantees ATMOST_ONCE \ + # Other sink configs + +``` + +For more information about the options of `pulsar-admin sinks update`, see [here](reference-connector-admin.md#update-1). + + + + +```` + + +## Work with connector + +You can manage Pulsar connectors (for example, create, update, start, stop, restart, reload, delete and perform other operations on connectors) via the [Connector Admin CLI](reference-connector-admin.md) with [sources](io-cli.md#sources) and [sinks](io-cli.md#sinks) subcommands. + +Connectors (sources and sinks) and Functions are components of instances, and they all run on Functions workers. When managing a source, sink or function via [Connector Admin CLI](reference-connector-admin.md) or [Functions Admin CLI](functions-cli.md), an instance is started on a worker. For more information, see [Functions worker](functions-worker.md#run-functions-worker-separately). 
+ diff --git a/site2/website/versioned_docs/version-2.9.x/io-quickstart.md b/site2/website/versioned_docs/version-2.9.x/io-quickstart.md new file mode 100644 index 0000000000000..9efa367843b5a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-quickstart.md @@ -0,0 +1,955 @@ +--- +id: io-quickstart +title: How to connect Pulsar to database +sidebar_label: "Get started" +original_id: io-quickstart +--- + +This tutorial provides a hands-on look at how you can move data out of Pulsar without writing a single line of code. + +It is helpful to review the [concepts](io-overview.md) for Pulsar I/O while running the steps in this guide to gain a deeper understanding. + +At the end of this tutorial, you are able to: + +- [Connect Pulsar to Cassandra](#Connect-Pulsar-to-Cassandra) + +- [Connect Pulsar to PostgreSQL](#Connect-Pulsar-to-PostgreSQL) + +:::tip + +* These instructions assume you are running Pulsar in [standalone mode](getting-started-standalone.md). However, all +the commands used in this tutorial can be used in a multi-node Pulsar cluster without any changes. +* All the instructions are assumed to run at the root directory of a Pulsar binary distribution. + +::: + +## Install Pulsar and built-in connector + +Before connecting Pulsar to a database, you need to install Pulsar and the desired built-in connector. + +For more information about **how to install a standalone Pulsar and built-in connectors**, see [here](getting-started-standalone.md/#installing-pulsar). + +## Start Pulsar standalone + +1. Start Pulsar locally. + + ```bash + + bin/pulsar standalone + + ``` + + All the components of a Pulsar service are started in order. + + You can curl those pulsar service endpoints to make sure Pulsar service is up and running correctly. + +2. Check Pulsar binary protocol port. + + ```bash + + telnet localhost 6650 + + ``` + +3. Check Pulsar Function cluster.
+ + ```bash + + curl -s http://localhost:8080/admin/v2/worker/cluster + + ``` + + **Example output** + + ```json + + [{"workerId":"c-standalone-fw-localhost-6750","workerHostname":"localhost","port":6750}] + + ``` + +4. Make sure a public tenant and a default namespace exist. + + ```bash + + curl -s http://localhost:8080/admin/v2/namespaces/public + + ``` + + **Example output** + + ```json + + ["public/default","public/functions"] + + ``` + +5. All built-in connectors should be listed as available. + + ```bash + + curl -s http://localhost:8080/admin/v2/functions/connectors + + ``` + + **Example output** + + ```json + + [{"name":"aerospike","description":"Aerospike database sink","sinkClass":"org.apache.pulsar.io.aerospike.AerospikeStringSink"},{"name":"cassandra","description":"Writes data into Cassandra","sinkClass":"org.apache.pulsar.io.cassandra.CassandraStringSink"},{"name":"kafka","description":"Kafka source and sink connector","sourceClass":"org.apache.pulsar.io.kafka.KafkaStringSource","sinkClass":"org.apache.pulsar.io.kafka.KafkaBytesSink"},{"name":"kinesis","description":"Kinesis sink connector","sinkClass":"org.apache.pulsar.io.kinesis.KinesisSink"},{"name":"rabbitmq","description":"RabbitMQ source connector","sourceClass":"org.apache.pulsar.io.rabbitmq.RabbitMQSource"},{"name":"twitter","description":"Ingest data from Twitter firehose","sourceClass":"org.apache.pulsar.io.twitter.TwitterFireHose"}] + + ``` + + If an error occurs when starting Pulsar service, you may see an exception at the terminal running `pulsar/standalone`, + or you can navigate to the `logs` directory under the Pulsar directory to view the logs. + +## Connect Pulsar to Cassandra + +This section demonstrates how to connect Pulsar to Cassandra. + +:::tip + +* Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). 
+* The Cassandra sink connector reads messages from Pulsar topics and writes the messages into Cassandra tables. For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +::: + +### Setup a Cassandra cluster + +This example uses `cassandra` Docker image to start a single-node Cassandra cluster in Docker. + +1. Start a Cassandra cluster. + + ```bash + + docker run -d --rm --name=cassandra -p 9042:9042 cassandra + + ``` + + :::note + + Before moving to the next steps, make sure the Cassandra cluster is running. + + ::: + +2. Make sure the Docker process is running. + + ```bash + + docker ps + + ``` + +3. Check the Cassandra logs to make sure the Cassandra process is running as expected. + + ```bash + + docker logs cassandra + + ``` + +4. Check the status of the Cassandra cluster. + + ```bash + + docker exec cassandra nodetool status + + ``` + + **Example output** + + ``` + + Datacenter: datacenter1 + ======================= + Status=Up/Down + |/ State=Normal/Leaving/Joining/Moving + -- Address Load Tokens Owns (effective) Host ID Rack + UN 172.17.0.2 103.67 KiB 256 100.0% af0e4b2f-84e0-4f0b-bb14-bd5f9070ff26 rack1 + + ``` + +5. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + + $ docker exec -ti cassandra cqlsh localhost + Connected to Test Cluster at localhost:9042. + [cqlsh 5.0.1 | Cassandra 3.11.2 | CQL spec 3.4.4 | Native protocol v4] + Use HELP for help. + cqlsh> + + ``` + +6. Create a keyspace `pulsar_test_keyspace`. + + ```bash + + cqlsh> CREATE KEYSPACE pulsar_test_keyspace WITH replication = {'class':'SimpleStrategy', 'replication_factor':1}; + + ``` + +7. Create a table `pulsar_test_table`. + + ```bash + + cqlsh> USE pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> CREATE TABLE pulsar_test_table (key text PRIMARY KEY, col text); + + ``` + +### Configure a Cassandra sink + +Now that we have a Cassandra cluster running locally. + +In this section, you need to configure a Cassandra sink connector. 
+ +To run a Cassandra sink connector, you need to prepare a configuration file including the information that Pulsar connector runtime needs to know. + +For example, how Pulsar connector can find the Cassandra cluster, what is the keyspace and the table that Pulsar connector uses for writing Pulsar messages to, and so on. + +You can create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + + ``` + +* YAML + + ```yaml + + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + + ``` + +For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +### Create a Cassandra sink + +You can use the [Connector Admin CLI](io-cli.md) +to create a sink connector and perform other operations on them. + +Run the following command to create a Cassandra sink connector with sink type _cassandra_ and the config file _examples/cassandra-sink.yml_ created previously. + +#### Note +> The `sink-type` parameter of the currently built-in connectors is determined by the setting of the `name` parameter specified in the pulsar-io.yaml file. + +```bash + +bin/pulsar-admin sinks create \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink \ + --sink-type cassandra \ + --sink-config-file examples/cassandra-sink.yml \ + --inputs test_cassandra + +``` + +Once the command is executed, Pulsar creates the sink connector _cassandra-test-sink_. + +This sink connector runs +as a Pulsar Function and writes the messages produced in the topic _test_cassandra_ to the Cassandra table _pulsar_test_table_. + +### Inspect a Cassandra sink + +You can use the [Connector Admin CLI](io-cli.md) to monitor a connector and perform other operations on it. + +* Get the information of a Cassandra sink. 
+ + ```bash + + bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + **Example output** + + ```json + + { + "tenant": "public", + "namespace": "default", + "name": "cassandra-test-sink", + "className": "org.apache.pulsar.io.cassandra.CassandraStringSink", + "inputSpecs": { + "test_cassandra": { + "isRegexPattern": false + } + }, + "configs": { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true, + "archive": "builtin://cassandra" + } + + ``` + +* Check the status of a Cassandra sink. + + ```bash + + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + **Example output** + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +### Verify a Cassandra sink + +1. Produce some messages to the input topic of the Cassandra sink _test_cassandra_. + + ```bash + + for i in {0..9}; do bin/pulsar-client produce -m "key-$i" -n 1 test_cassandra; done + + ``` + +2. Inspect the status of the Cassandra sink _test_cassandra_. + + ```bash + + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + + ``` + + You can see 10 messages are processed by the Cassandra sink _test_cassandra_. 
+ + **Example output** + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 10, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 10, + "lastReceivedTime" : 1551685489136, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +3. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + + docker exec -ti cassandra cqlsh localhost + + ``` + +4. Check the data of the Cassandra table _pulsar_test_table_. + + ```bash + + cqlsh> use pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> select * from pulsar_test_table; + + key | col + --------+-------- + key-5 | key-5 + key-0 | key-0 + key-9 | key-9 + key-2 | key-2 + key-1 | key-1 + key-3 | key-3 + key-6 | key-6 + key-7 | key-7 + key-4 | key-4 + key-8 | key-8 + + ``` + +### Delete a Cassandra Sink + +You can use the [Connector Admin CLI](io-cli.md) to delete a connector and perform other operations on it. + +```bash + +bin/pulsar-admin sinks delete \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + +``` + +## Connect Pulsar to PostgreSQL + +This section demonstrates how to connect Pulsar to PostgreSQL. + +:::tip + +* Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). +* The JDBC sink connector pulls messages from Pulsar topics and persists the messages to ClickHouse, MariaDB, PostgreSQL, or SQlite. + +For more information, see [JDBC sink connector](io-jdbc-sink.md). + +::: + + +### Setup a PostgreSQL cluster + +This example uses the PostgreSQL 12 docker image to start a single-node PostgreSQL cluster in Docker. + +1. Pull the PostgreSQL 12 image from Docker. + + ```bash + + $ docker pull postgres:12 + + ``` + +2. Start PostgreSQL. 
+ + ```bash + + $ docker run -d -it --rm \ + --name pulsar-postgres \ + -p 5432:5432 \ + -e POSTGRES_PASSWORD=password \ + -e POSTGRES_USER=postgres \ + postgres:12 + + ``` + + #### Tip + + Flag | Description | This example + ---|---|---| + `-d` | To start a container in detached mode. | / + `-it` | Keep STDIN open even if not attached and allocate a terminal. | / + `--rm` | Remove the container automatically when it exits. | / + `--name` | Assign a name to the container. | This example specifies _pulsar-postgres_ for the container. + `-p` | Publish the port of the container to the host. | This example publishes the port _5432_ of the container to the host. + `-e` | Set environment variables. | This example sets the following variables:
    - The password for the user is _password_.
    - The name for the user is _postgres_. + + :::tip + + For more information about Docker commands, see [Docker CLI](https://docs.docker.com/engine/reference/commandline/run/). + + ::: + +3. Check if PostgreSQL has been started successfully. + + ```bash + + $ docker logs -f pulsar-postgres + + ``` + + PostgreSQL has been started successfully if the following message appears. + + ```text + + 2020-05-11 20:09:24.492 UTC [1] LOG: starting PostgreSQL 12.2 (Debian 12.2-2.pgdg100+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 8.3.0-6) 8.3.0, 64-bit + 2020-05-11 20:09:24.492 UTC [1] LOG: listening on IPv4 address "0.0.0.0", port 5432 + 2020-05-11 20:09:24.492 UTC [1] LOG: listening on IPv6 address "::", port 5432 + 2020-05-11 20:09:24.499 UTC [1] LOG: listening on Unix socket "/var/run/postgresql/.s.PGSQL.5432" + 2020-05-11 20:09:24.523 UTC [55] LOG: database system was shut down at 2020-05-11 20:09:24 UTC + 2020-05-11 20:09:24.533 UTC [1] LOG: database system is ready to accept connections + + ``` + +4. Access to PostgreSQL. + + ```bash + + $ docker exec -it pulsar-postgres /bin/bash + + ``` + +5. Create a PostgreSQL table _pulsar_postgres_jdbc_sink_. + + ```bash + + $ psql -U postgres postgres + + postgres=# create table if not exists pulsar_postgres_jdbc_sink + ( + id serial PRIMARY KEY, + name VARCHAR(255) NOT NULL + ); + + ``` + +### Configure a JDBC sink + +Now we have a PostgreSQL running locally. + +In this section, you need to configure a JDBC sink connector. + +1. Add a configuration file. + + To run a JDBC sink connector, you need to prepare a YAML configuration file including the information that Pulsar connector runtime needs to know. + + For example, how Pulsar connector can find the PostgreSQL cluster, what is the JDBC URL and the table that Pulsar connector uses for writing messages to. + + Create a _pulsar-postgres-jdbc-sink.yaml_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. 
+ + ```yaml + + configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/postgres" + tableName: "pulsar_postgres_jdbc_sink" + + ``` + +2. Create a schema. + + Create a _avro-schema_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. + + ```json + + { + "type": "AVRO", + "schema": "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}", + "properties": {} + } + + ``` + + :::tip + + For more information about AVRO, see [Apache Avro](https://avro.apache.org/docs/1.9.1/). + + ::: + +3. Upload a schema to a topic. + + This example uploads the _avro-schema_ schema to the _pulsar-postgres-jdbc-sink-topic_ topic. + + ```bash + + $ bin/pulsar-admin schemas upload pulsar-postgres-jdbc-sink-topic -f ./connectors/avro-schema + + ``` + +4. Check if the schema has been uploaded successfully. + + ```bash + + $ bin/pulsar-admin schemas get pulsar-postgres-jdbc-sink-topic + + ``` + + The schema has been uploaded successfully if the following message appears. + + ```json + + {"name":"pulsar-postgres-jdbc-sink-topic","schema":"{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}","type":"AVRO","properties":{}} + + ``` + +### Create a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) to create a sink connector and perform other operations on it. + +This example creates a sink connector and specifies the desired information. 
+ +```bash + +$ bin/pulsar-admin sinks create \ +--archive ./connectors/pulsar-io-jdbc-postgres-@pulsar:version@.nar \ +--inputs pulsar-postgres-jdbc-sink-topic \ +--name pulsar-postgres-jdbc-sink \ +--sink-config-file ./connectors/pulsar-postgres-jdbc-sink.yaml \ +--parallelism 1 + +``` + +Once the command is executed, Pulsar creates a sink connector _pulsar-postgres-jdbc-sink_. + +This sink connector runs as a Pulsar Function and writes the messages produced in the topic _pulsar-postgres-jdbc-sink-topic_ to the PostgreSQL table _pulsar_postgres_jdbc_sink_. + + #### Tip + + Flag | Description | This example + ---|---|---| + `--archive` | The path to the archive file for the sink. | _pulsar-io-jdbc-postgres-@pulsar:version@.nar_ | + `--inputs` | The input topic(s) of the sink.

    Multiple topics can be specified as a comma-separated list.|| + `--name` | The name of the sink. | _pulsar-postgres-jdbc-sink_ | + `--sink-config-file` | The path to a YAML config file specifying the configuration of the sink. | _pulsar-postgres-jdbc-sink.yaml_ | + `--parallelism` | The parallelism factor of the sink.

    For example, the number of sink instances to run. | _1_ | + +:::tip + +For more information about `pulsar-admin sinks create options`, see [here](io-cli.md#sinks). + +::: + +The sink has been created successfully if the following message appears. + +```bash + +"Created successfully" + +``` + +### Inspect a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) to monitor a connector and perform other operations on it. + +* List all running JDBC sink(s). + + ```bash + + $ bin/pulsar-admin sinks list \ + --tenant public \ + --namespace default + + ``` + + :::tip + + For more information about `pulsar-admin sinks list options`, see [here](io-cli.md/#list-1). + + ::: + + The result shows that only the _postgres-jdbc-sink_ sink is running. + + ```json + + [ + "pulsar-postgres-jdbc-sink" + ] + + ``` + +* Get the information of a JDBC sink. + + ```bash + + $ bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name pulsar-postgres-jdbc-sink + + ``` + + :::tip + + For more information about `pulsar-admin sinks get options`, see [here](io-cli.md/#get-1). + + ::: + + The result shows the information of the sink connector, including tenant, namespace, topic and so on. 
+ + ```json + + { + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true + } + + ``` + +* Get the status of a JDBC sink. + + ```bash + + $ bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name pulsar-postgres-jdbc-sink + + ``` + + :::tip + + For more information about `pulsar-admin sinks status options`, see [here](io-cli.md/#status-1). + + ::: + + The result shows the current status of the sink connector, including the number of instances, running status, worker ID and so on. + + ```json + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-192.168.2.52-8080" + } + } ] + } + + ``` + +### Stop a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) to stop a connector and perform other operations on it. + +```bash + +$ bin/pulsar-admin sinks stop \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks stop options`, see [here](io-cli.md/#stop-1). + +::: + +The sink instance has been stopped successfully if the following message appears.
+ +```bash + +"Stopped successfully" + +``` + +### Restart a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) to restart a connector and perform other operations on it. + +```bash + +$ bin/pulsar-admin sinks restart \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks restart options`, see [here](io-cli.md/#restart-1). + +::: + +The sink instance has been started successfully if the following message appears. + +```bash + +"Started successfully" + +``` + +:::tip + +* Optionally, you can run a standalone sink connector using `pulsar-admin sinks localrun options`. +Note that `pulsar-admin sinks localrun options` **runs a sink connector locally**, while `pulsar-admin sinks start options` **starts a sink connector in a cluster**. +* For more information about `pulsar-admin sinks localrun options`, see [here](io-cli.md#localrun-1). + +::: + +### Update a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) to update a connector and perform other operations on it. + +This example updates the parallelism of the _pulsar-postgres-jdbc-sink_ sink connector to 2. + +```bash + +$ bin/pulsar-admin sinks update \ +--name pulsar-postgres-jdbc-sink \ +--parallelism 2 + +``` + +:::tip + +For more information about `pulsar-admin sinks update options`, see [here](io-cli.md/#update-1). + +::: + +The sink connector has been updated successfully if the following message appears. + +```bash + +"Updated successfully" + +``` + +This example double-checks the information. + +```bash + +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +The result shows that the parallelism is 2.
+ +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 2, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +### Delete a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) to delete a connector and perform other operations on it. + +This example deletes the _pulsar-postgres-jdbc-sink_ sink connector. + +```bash + +$ bin/pulsar-admin sinks delete \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +:::tip + +For more information about `pulsar-admin sinks delete options`, see [here](io-cli.md/#delete-1). + +::: + +The sink connector has been deleted successfully if the following message appears. + +```text + +"Deleted successfully" + +``` + +This example double-checks the status of the sink connector. + +```bash + +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-postgres-jdbc-sink + +``` + +The result shows that the sink connector does not exist. 
+ +```text + +HTTP 404 Not Found + +Reason: Sink pulsar-postgres-jdbc-sink doesn't exist + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-rabbitmq-sink.md b/site2/website/versioned_docs/version-2.9.x/io-rabbitmq-sink.md new file mode 100644 index 0000000000000..d7fda99460dc9 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-rabbitmq-sink.md @@ -0,0 +1,85 @@ +--- +id: io-rabbitmq-sink +title: RabbitMQ sink connector +sidebar_label: "RabbitMQ sink connector" +original_id: io-rabbitmq-sink +--- + +The RabbitMQ sink connector pulls messages from Pulsar topics +and persist the messages to RabbitMQ queues. + + +## Configuration + +The configuration of the RabbitMQ sink connector has the following properties. + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name. | +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `exchangeName` | String|true | " " (empty string) | The exchange to publish messages. | +| `routingKey` |String|true | " " (empty string) |The routing key used to publish messages. | + + +### Example + +Before using the RabbitMQ sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "exchangeName": "test-exchange", + "routingKey": "test-key" + } + + ``` + +* YAML + + ```yaml + + configs: + host: "localhost" + port: 5672 + virtualHost: "/" + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + exchangeName: "test-exchange" + routingKey: "test-key" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-rabbitmq-source.md b/site2/website/versioned_docs/version-2.9.x/io-rabbitmq-source.md new file mode 100644 index 0000000000000..c2c31cc97d10d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-rabbitmq-source.md @@ -0,0 +1,85 @@ +--- +id: io-rabbitmq-source +title: RabbitMQ source connector +sidebar_label: "RabbitMQ source connector" +original_id: io-rabbitmq-source +--- + +The RabbitMQ source connector receives messages from RabbitMQ clusters +and writes messages to Pulsar topics. + +## Configuration + +The configuration of the RabbitMQ source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name. 
| +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `prefetchCount` | int|false | 0 | The maximum number of messages that the server delivers.

    0 means unlimited. | +| `prefetchGlobal` | boolean|false | false |Whether the setting should be applied to the entire channel rather than each consumer. | +| `passive` | boolean|false | false | Whether the rabbitmq consumer should create its own queue or bind to an existing one. | + +### Example + +Before using the RabbitMQ source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "prefetchCount": "0", + "prefetchGlobal": "false", + "passive": "false" + } + + ``` + +* YAML + + ```yaml + + configs: + host: "localhost" + port: 5672 + virtualHost: "/" + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + prefetchCount: 0 + prefetchGlobal: "false" + passive: "false" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-redis-sink.md b/site2/website/versioned_docs/version-2.9.x/io-redis-sink.md new file mode 100644 index 0000000000000..0caf21bcf62e8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-redis-sink.md @@ -0,0 +1,156 @@ +--- +id: io-redis-sink +title: Redis sink connector +sidebar_label: "Redis sink connector" +original_id: io-redis-sink +--- + +The Redis sink connector pulls messages from Pulsar topics +and persists the messages to a Redis database. + + + +## Configuration + +The configuration of the Redis sink connector has the following properties. 
+ + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `redisHosts` |String|true|" " (empty string) | A comma-separated list of Redis hosts to connect to. | +| `redisPassword` |String|false|" " (empty string) | The password used to connect to Redis. | +| `redisDatabase` | int|true|0 | The Redis database to connect to. | +| `clientMode` |String| false|Standalone | The client mode when interacting with Redis cluster.

    Below are the available options:
  • Standalone
  • Cluster
  • | +| `autoReconnect` | boolean|false|true | Whether the Redis client automatically reconnect or not. | +| `requestQueue` | int|false|2147483647 | The maximum number of queued requests to Redis. | +| `tcpNoDelay` |boolean| false| false | Whether to enable TCP with no delay or not. | +| `keepAlive` | boolean|false | false |Whether to enable a keepalive to Redis or not. | +| `connectTimeout` |long| false|10000 | The time to wait before timing out when connecting in milliseconds. | +| `operationTimeout` | long|false|10000 | The time before an operation is marked as timed out in milliseconds . | +| `batchTimeMs` | int|false|1000 | The Redis operation time in milliseconds. | +| `batchSize` | int|false|200 | The batch size of writing to Redis database. | + + +### Example + +Before using the Redis sink connector, you need to create a configuration file in the path you will start Pulsar service (i.e. `PULSAR_HOME`) through one of the following methods. + +* JSON + + ```json + + { + "redisHosts": "localhost:6379", + "redisPassword": "mypassword", + "redisDatabase": "0", + "clientMode": "Standalone", + "operationTimeout": "2000", + "batchSize": "1", + "batchTimeMs": "1000", + "connectTimeout": "3000" + } + + ``` + +* YAML + + ```yaml + + configs: + redisHosts: "localhost:6379" + redisPassword: "mypassword" + redisDatabase: 0 + clientMode: "Standalone" + operationTimeout: 2000 + batchSize: 1 + batchTimeMs: 1000 + connectTimeout: 3000 + + ``` + +### Usage + +This example shows how to write records to a Redis database using the Pulsar Redis connector. + +1. Start a Redis server. + + ```bash + + $ docker pull redis:5.0.5 + $ docker run -d -p 6379:6379 --name my-redis redis:5.0.5 --requirepass "mypassword" + + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + + $ bin/pulsar standalone + + ``` + + Make sure the NAR file is available at `connectors/pulsar-io-redis-@pulsar:version@.nar`. + +3. 
Start the Pulsar Redis connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-redis-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name my-redis-sink \ + --sink-config '{"redisHosts": "localhost:6379","redisPassword": "mypassword","redisDatabase": "0","clientMode": "Standalone","operationTimeout": "3000","batchSize": "1"}' \ + --inputs my-redis-topic + + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + + $ bin/pulsar-admin sinks localrun \ + --archive connectors/pulsar-io-redis-@pulsar:version@.nar \ + --tenant public \ + --namespace default \ + --name my-redis-sink \ + --sink-config-file redis-sink-config.yaml \ + --inputs my-redis-topic + + ``` + +4. Publish records to the topic. + + ```bash + + $ bin/pulsar-client produce \ + persistent://public/default/my-redis-topic \ + -k "streaming" \ + -m "Pulsar" + + ``` + +5. Start a Redis client in Docker. + + ```bash + + $ docker exec -it my-redis redis-cli -a "mypassword" + + ``` + +6. Check the key/value in Redis. + + ``` + + 127.0.0.1:6379> keys * + 1) "streaming" + 127.0.0.1:6379> get "streaming" + "Pulsar" + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-solr-sink.md b/site2/website/versioned_docs/version-2.9.x/io-solr-sink.md new file mode 100644 index 0000000000000..df2c3612c38eb --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-solr-sink.md @@ -0,0 +1,65 @@ +--- +id: io-solr-sink +title: Solr sink connector +sidebar_label: "Solr sink connector" +original_id: io-solr-sink +--- + +The Solr sink connector pulls messages from Pulsar topics +and persists the messages to Solr collections. + + + +## Configuration + +The configuration of the Solr sink connector has the following properties. 
+ + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `solrUrl` | String|true|" " (empty string) |
  • Comma-separated zookeeper hosts with chroot used in the SolrCloud mode.
    **Example**
    `localhost:2181,localhost:2182/chroot`

  • URL to connect to Solr used in standalone mode.
    **Example**
    `localhost:8983/solr`
  • | +| `solrMode` | String|true|SolrCloud| The client mode when interacting with the Solr cluster.

    Below are the available options:
  • Standalone
  • SolrCloud
  • | +| `solrCollection` |String|true| " " (empty string) | Solr collection name to which records need to be written. | +| `solrCommitWithinMs` |int| false|10 | The time in milliseconds within which Solr commits updates.| +| `username` |String|false| " " (empty string) | The username for basic authentication.

    **Note: `username` is case-sensitive.** | +| `password` | String|false| " " (empty string) | The password for basic authentication.

    **Note: `password` is case-sensitive.** | + + + +### Example + +Before using the Solr sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + + { + "solrUrl": "localhost:2181,localhost:2182/chroot", + "solrMode": "SolrCloud", + "solrCollection": "techproducts", + "solrCommitWithinMs": 100, + "username": "fakeuser", + "password": "fake@123" + } + + ``` + +* YAML + + ```yaml + + { + solrUrl: "localhost:2181,localhost:2182/chroot" + solrMode: "SolrCloud" + solrCollection: "techproducts" + solrCommitWithinMs: 100 + username: "fakeuser" + password: "fake@123" + } + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/io-twitter-source.md b/site2/website/versioned_docs/version-2.9.x/io-twitter-source.md new file mode 100644 index 0000000000000..8de3504dd0fef --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-twitter-source.md @@ -0,0 +1,28 @@ +--- +id: io-twitter-source +title: Twitter Firehose source connector +sidebar_label: "Twitter Firehose source connector" +original_id: io-twitter-source +--- + +The Twitter Firehose source connector receives tweets from Twitter Firehose and +writes the tweets to Pulsar topics. + +## Configuration + +The configuration of the Twitter Firehose source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `consumerKey` | String|true | " " (empty string) | The twitter OAuth consumer key.

    For more information, see [Access tokens](https://developer.twitter.com/en/docs/basics/authentication/guides/access-tokens). | +| `consumerSecret` | String |true | " " (empty string) | The twitter OAuth consumer secret. | +| `token` | String|true | " " (empty string) | The twitter OAuth token. | +| `tokenSecret` | String|true | " " (empty string) | The twitter OAuth secret. | +| `guestimateTweetTime`|Boolean|false|false|Most firehose events have null createdAt time.

    If `guestimateTweetTime` is set to true, the connector estimates the createdTime of each firehose event to be the current time. +| `clientName` | String |false | openconnector-twitter-source| The twitter firehose client name. | +| `clientHosts` |String| false | Constants.STREAM_HOST | The twitter firehose hosts to which client connects. | +| `clientBufferSize` | int|false | 50000 | The buffer size for buffering tweets fetched from twitter firehose. | + +> For more information about OAuth credentials, see [Twitter developers portal](https://developer.twitter.com/en.html). diff --git a/site2/website/versioned_docs/version-2.9.x/io-twitter.md b/site2/website/versioned_docs/version-2.9.x/io-twitter.md new file mode 100644 index 0000000000000..3b2f6325453c3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-twitter.md @@ -0,0 +1,7 @@ +--- +id: io-twitter +title: Twitter Firehose Connector +sidebar_label: "Twitter Firehose Connector" +original_id: io-twitter +--- + diff --git a/site2/website/versioned_docs/version-2.9.x/io-use.md b/site2/website/versioned_docs/version-2.9.x/io-use.md new file mode 100644 index 0000000000000..cae89c6723ea3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/io-use.md @@ -0,0 +1,1787 @@ +--- +id: io-use +title: How to use Pulsar connectors +sidebar_label: "Use" +original_id: io-use +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide describes how to use Pulsar connectors. + +## Install a connector + +Pulsar bundles several [builtin connectors](io-connectors.md) used to move data in and out of commonly used systems (such as databases and messaging systems). Optionally, you can create and use your desired non-builtin connectors. + +:::note + +When using a non-builtin connector, you need to specify the path of an archive file for the connector. 
+ +::: + +To set up a builtin connector, follow +the instructions [here](getting-started-standalone.md#installing-builtin-connectors). + +After the setup, the builtin connector is automatically discovered by Pulsar brokers (or function-workers), so no additional installation steps are required. + +## Configure a connector + +You can configure the following information: + +* [Configure a default storage location for a connector](#configure-a-default-storage-location-for-a-connector) + +* [Configure a connector with a YAML file](#configure-a-connector-with-yaml-file) + +### Configure a default storage location for a connector + +To configure a default folder for builtin connectors, set the `connectorsDirectory` parameter in the `./conf/functions_worker.yml` configuration file. + +**Example** + +Set the `./connectors` folder as the default storage location for builtin connectors. + +``` + +######################## +# Connectors +######################## + +connectorsDirectory: ./connectors + +``` + +### Configure a connector with a YAML file + +To configure a connector, you need to provide a YAML configuration file when creating a connector. + +The YAML configuration file tells Pulsar where to locate connectors and how to connect connectors with Pulsar topics. + +**Example 1** + +Below is a YAML configuration file of a Cassandra sink, which tells Pulsar: + +* Which Cassandra cluster to connect + +* What is the `keyspace` and `columnFamily` to be used in Cassandra for collecting data + +* How to map Pulsar messages into Cassandra table key and columns + +```shell + +tenant: public +namespace: default +name: cassandra-test-sink +... +# cassandra specific config +configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + +``` + +**Example 2** + +Below is a YAML configuration file of a Kafka source. 
+ +```shell + +configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: "false" + +``` + +**Example 3** + +Below is a YAML configuration file of a PostgreSQL JDBC sink. + +```shell + +configs: + userName: "postgres" + password: "password" + jdbcUrl: "jdbc:postgresql://localhost:5432/test_jdbc" + tableName: "test_jdbc" + +``` + +## Get available connectors + +Before starting using connectors, you can perform the following operations: + +* [Reload connectors](#reload) + +* [Get a list of available connectors](#get-available-connectors) + +### `reload` + +If you add or delete a nar file in a connector folder, reload the available builtin connector before using it. + +#### Source + +Use the `reload` subcommand. + +```shell + +$ pulsar-admin sources reload + +``` + +For more information, see [`here`](io-cli.md#reload). + +#### Sink + +Use the `reload` subcommand. + +```shell + +$ pulsar-admin sinks reload + +``` + +For more information, see [`here`](io-cli.md#reload-1). + +### `available` + +After reloading connectors (optional), you can get a list of available connectors. + +#### Source + +Use the `available-sources` subcommand. + +```shell + +$ pulsar-admin sources available-sources + +``` + +#### Sink + +Use the `available-sinks` subcommand. + +```shell + +$ pulsar-admin sinks available-sinks + +``` + +## Run a connector + +To run a connector, you can perform the following operations: + +* [Create a connector](#create) + +* [Start a connector](#start) + +* [Run a connector locally](#localrun) + +### `create` + +You can create a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Create a source connector. + +````mdx-code-block + + + + +Use the `create` subcommand. + +``` + +$ pulsar-admin sources create options + +``` + +For more information, see [here](io-cli.md#create). 
+ + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/registerSource?version=@pulsar:version_number@} + + + + +* Create a source connector with a **local file**. + + ```java + + void createSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + |Name|Description + |---|--- + `sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#createSource-SourceConfig-java.lang.String-). + +* Create a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void createSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sourceConfig` | The source configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSourceWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#createSourceWithUrl-SourceConfig-java.lang.String-). + + + + +```` + +#### Sink + +Create a sink connector. + +````mdx-code-block + + + + +Use the `create` subcommand. + +``` + +$ pulsar-admin sinks create options + +``` + +For more information, see [here](io-cli.md#create-1). + + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/registerSink?version=@pulsar:version_number@} + + + + +* Create a sink connector with a **local file**. 
+ + ```java + + void createSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + |Name|Description + |---|--- + `sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#createSink-SinkConfig-java.lang.String-). + +* Create a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void createSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sinkConfig` | The sink configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSinkWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#createSinkWithUrl-SinkConfig-java.lang.String-). + + + + +```` + +### `start` + +You can start a connector using **Admin CLI** or **REST API**. + +#### Source + +Start a source connector. + +````mdx-code-block + + + + +Use the `start` subcommand. + +``` + +$ pulsar-admin sources start options + +``` + +For more information, see [here](io-cli.md#start). + + + + +* Start **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/start|operation/startSource?version=@pulsar:version_number@} + +* Start a **specified** source connector. 
+ + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/start|operation/startSource?version=@pulsar:version_number@} + + + + +```` + +#### Sink + +Start a sink connector. + +````mdx-code-block + + + + +Use the `start` subcommand. + +``` + +$ pulsar-admin sinks start options + +``` + +For more information, see [here](io-cli.md#start-1). + + + + +* Start **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/start|operation/startSink?version=@pulsar:version_number@} + +* Start a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/start|operation/startSink?version=@pulsar:version_number@} + + + + +```` + +### `localrun` + +You can run a connector locally rather than deploying it on a Pulsar cluster using **Admin CLI**. + +#### Source + +Run a source connector locally. + +````mdx-code-block + + + + +Use the `localrun` subcommand. + +``` + +$ pulsar-admin sources localrun options + +``` + +For more information, see [here](io-cli.md#localrun). + + + + +```` + +#### Sink + +Run a sink connector locally. + +````mdx-code-block + + + + +Use the `localrun` subcommand. + +``` + +$ pulsar-admin sinks localrun options + +``` + +For more information, see [here](io-cli.md#localrun-1). + + + + +```` + +## Monitor a connector + +To monitor a connector, you can perform the following operations: + +* [Get the information of a connector](#get) + +* [Get the list of all running connectors](#list) + +* [Get the current status of a connector](#status) + +### `get` + +You can get the information of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the information of a source connector. + +````mdx-code-block + + + + +Use the `get` subcommand. 
+ +``` + +$ pulsar-admin sources get options + +``` + +For more information, see [here](io-cli.md#get). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/getSourceInfo?version=@pulsar:version_number@} + + + + +```java + +SourceConfig getSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + +``` + +**Example** + +This is a sourceConfig. + +```java + +{ + "tenant": "tenantName", + "namespace": "namespaceName", + "name": "sourceName", + "className": "className", + "topicName": "topicName", + "configs": {}, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} + +``` + +This is a sourceConfig example. + +``` + +{ + "tenant": "public", + "namespace": "default", + "name": "debezium-mysql-source", + "className": "org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource", + "topicName": "debezium-mysql-topic", + "configs": { + "database.user": "debezium", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.port": "3306", + "database.hostname": "localhost", + "database.password": "dbz", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.whitelist": "inventory", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "database.history.pulsar.topic": "history-topic2" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException.NotFoundException` | 
Cluster doesn't exist +`PulsarAdminException` | Unexpected error + +For more information, see [`getSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSource-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +#### Sink + +Get the information of a sink connector. + +````mdx-code-block + + + + +Use the `get` subcommand. + +``` + +$ pulsar-admin sinks get options + +``` + +For more information, see [here](io-cli.md#get-1). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/getSinkInfo?version=@pulsar:version_number@} + + + + +```java + +SinkConfig getSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + +``` + +**Example** + +This is a sinkConfig. + +```json + +{ +"tenant": "tenantName", +"namespace": "namespaceName", +"name": "sinkName", +"className": "className", +"inputSpecs": { +"topicName": { + "isRegexPattern": false +} +}, +"configs": {}, +"parallelism": 1, +"processingGuarantees": "ATLEAST_ONCE", +"retainOrdering": false, +"autoAck": true +} + +``` + +This is a sinkConfig example. 
+ +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-postgres-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.PostgresJdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-postgres-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "password", + "jdbcUrl": "jdbc:postgresql://localhost:5432/pulsar_postgres_jdbc_sink", + "userName": "postgres", + "tableName": "pulsar_postgres_jdbc_sink" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} + +``` + +**Parameter description** + +Name| Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +For more information, see [`getSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSink-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +### `list` + +You can get the list of all running connectors using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the list of all running source connectors. + +````mdx-code-block + + + + +Use the `list` subcommand. + +``` + +$ pulsar-admin sources list options + +``` + +For more information, see [here](io-cli.md#list). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/|operation/listSources?version=@pulsar:version_number@} + + + + +```java + +List listSources(String tenant, + String namespace) + throws PulsarAdminException + +``` + +**Response example** + +```java + +["f1", "f2", "f3"] + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#listSources-java.lang.String-java.lang.String-). 
+ + + + +```` + +#### Sink + +Get the list of all running sink connectors. + +````mdx-code-block + + + + +Use the `list` subcommand. + +``` + +$ pulsar-admin sinks list options + +``` + +For more information, see [here](io-cli.md#list-1). + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/|operation/listSinks?version=@pulsar:version_number@} + + + + +```java + +List listSinks(String tenant, + String namespace) + throws PulsarAdminException + +``` + +**Response example** + +```java + +["f1", "f2", "f3"] + +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#listSinks-java.lang.String-java.lang.String-). + + + + +```` + +### `status` + +You can get the current status of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the current status of a source connector. + +````mdx-code-block + + + + +Use the `status` subcommand. + +``` + +$ pulsar-admin sources status options + +``` + +For more information, see [here](io-cli.md#status). + + + + +* Get the current status of **all** source connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/status|operation/getSourceStatus?version=@pulsar:version_number@} + +* Gets the current status of a **specified** source connector. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/status|operation/getSourceStatus?version=@pulsar:version_number@} + + + + +* Get the current status of **all** source connectors. 
+ + ```java + + SourceStatus getSourceStatus(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Source name + + **Exception** + + Name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSource-java.lang.String-java.lang.String-java.lang.String-). + +* Gets the current status of a **specified** source connector. + + ```java + + SourceStatus.SourceInstanceStatus.SourceInstanceStatusData getSourceStatus(String tenant, + String namespace, + String source, + int id) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Source name + `id` | Source instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSourceStatus-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Get the current status of a Pulsar sink connector. + +````mdx-code-block + + + + +Use the `status` subcommand. + +``` + +$ pulsar-admin sinks status options + +``` + +For more information, see [here](io-cli.md#status-1). + + + + +* Get the current status of **all** sink connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName/status|operation/getSinkStatus?version=@pulsar:version_number@} + +* Gets the current status of a **specified** sink connector. 
+ + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sourceName/:instanceId/status|operation/getSinkInstanceStatus?version=@pulsar:version_number@} + + + + +* Get the current status of **all** sink connectors. + + ```java + + SinkStatus getSinkStatus(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Source name + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-). + +* Gets the current status of a **specified** source connector. + + ```java + + SinkStatus.SinkInstanceStatus.SinkInstanceStatusData getSinkStatus(String tenant, + String namespace, + String sink, + int id) + throws PulsarAdminException + + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Source name + `id` | Sink instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatusWithInstanceID`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Update a connector + +### `update` + +You can update a running connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Update a running Pulsar source connector. + +````mdx-code-block + + + + +Use the `update` subcommand. + +``` + +$ pulsar-admin sources update options + +``` + +For more information, see [here](io-cli.md#update). 
+ + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/updateSource?version=@pulsar:version_number@} + + + + +* Update a running source connector with a **local file**. + + ```java + + void updateSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#updateSource-SourceConfig-java.lang.String-). + +* Update a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void updateSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sourceConfig` | The source configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + +For more information, see [`createSourceWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#updateSourceWithUrl-SourceConfig-java.lang.String-). + + + + +```` + +#### Sink + +Update a running Pulsar sink connector. + +````mdx-code-block + + + + +Use the `update` subcommand. 
+ +``` + +$ pulsar-admin sinks update options + +``` + +For more information, see [here](io-cli.md#update-1). + + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/updateSink?version=@pulsar:version_number@} + + + + +* Update a running sink connector with a **local file**. + + ```java + + void updateSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSink-SinkConfig-java.lang.String-). + +* Update a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + + void updateSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sinkConfig` | The sink configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + |`PulsarAdminException.NotFoundException` | Cluster doesn't exist + |`PulsarAdminException` | Unexpected error + +For more information, see [`updateSinkWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSinkWithUrl-SinkConfig-java.lang.String-). 
+ + + + +```` + +## Stop a connector + +### `stop` + +You can stop a connector using **Admin CLI**, **REST API** or **Java admin API**. + +#### Source + +Stop a source connector. + +````mdx-code-block + + + + +Use the `stop` subcommand. + +``` + +$ pulsar-admin sources stop options + +``` + +For more information, see [here](io-cli.md#stop). + + + + +* Stop **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/stop|operation/stopSource?version=@pulsar:version_number@} + +* Stop a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/stop|operation/stopSource?version=@pulsar:version_number@} + + + + +* Stop **all** source connectors. + + ```java + + void stopSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** source connector.
+ + ```java + + void stopSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Stop a sink connector. + +````mdx-code-block + + + + +Use the `stop` subcommand. + +``` + +$ pulsar-admin sinks stop options + +``` + +For more information, see [here](io-cli.md#stop-1). + + + + +* Stop **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/stop|operation/stopSink?version=@pulsar:version_number@} + +* Stop a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/stop|operation/stopSink?version=@pulsar:version_number@} + + + + +* Stop **all** sink connectors. + + ```java + + void stopSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** sink connector.
+ + ```java + + void stopSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `instanceId` | Sink instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Restart a connector + +### `restart` + +You can restart a connector using **Admin CLI**, **REST API** or **Java admin API**. + +#### Source + +Restart a source connector. + +````mdx-code-block + + + + +Use the `restart` subcommand. + +``` + +$ pulsar-admin sources restart options + +``` + +For more information, see [here](io-cli.md#restart). + + + + +* Restart **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/restart|operation/restartSource?version=@pulsar:version_number@} + +* Restart a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/restart|operation/restartSource?version=@pulsar:version_number@} + + + + +* Restart **all** source connectors.
+ + ```java + + void restartSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** source connector. + + ```java + + void restartSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +#### Sink + +Restart a sink connector. + +````mdx-code-block + + + + +Use the `restart` subcommand. + +``` + +$ pulsar-admin sinks restart options + +``` + +For more information, see [here](io-cli.md#restart-1). + + + + +* Restart **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/restart|operation/restartSink?version=@pulsar:version_number@} + +* Restart a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/restart|operation/restartSink?version=@pulsar:version_number@} + + + + +* Restart all Pulsar sink connectors.
+ + ```java + + void restartSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** sink connector. + + ```java + + void restartSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `instanceId` | Sink instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + + +```` + +## Delete a connector + +### `delete` + +You can delete a connector using **Admin CLI**, **REST API** or **Java admin API**. + +#### Source + +Delete a source connector. + +````mdx-code-block + + + + +Use the `delete` subcommand. + +``` + +$ pulsar-admin sources delete options + +``` + +For more information, see [here](io-cli.md#delete). + + + + +Delete a Pulsar source connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/deregisterSource?version=@pulsar:version_number@} + + + + +Delete a source connector.
+ +```java + +void deleteSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`source` | Source name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#deleteSource-java.lang.String-java.lang.String-java.lang.String-). + + + + +```` + +#### Sink + +Delete a sink connector. + +````mdx-code-block + + + + +Use the `delete` subcommand. + +``` + +$ pulsar-admin sinks delete options + +``` + +For more information, see [here](io-cli.md#delete-1). + + + + +Delete a sink connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/deregisterSink?version=@pulsar:version_number@} + + + + +Delete a Pulsar sink connector. + +```java + +void deleteSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#deleteSink-java.lang.String-java.lang.String-java.lang.String-).
+ + + + +```` diff --git a/site2/website/versioned_docs/version-2.9.x/kubernetes-helm.md b/site2/website/versioned_docs/version-2.9.x/kubernetes-helm.md new file mode 100644 index 0000000000000..ea92a0968cd7d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/kubernetes-helm.md @@ -0,0 +1,441 @@ +--- +id: kubernetes-helm +title: Get started in Kubernetes +sidebar_label: "Run Pulsar in Kubernetes" +original_id: kubernetes-helm +--- + +This section guides you through every step of installing and running Apache Pulsar with Helm on Kubernetes quickly, including the following sections: + +- Install Apache Pulsar on Kubernetes using Helm +- Start and stop Apache Pulsar +- Create topics using `pulsar-admin` +- Produce and consume messages using Pulsar clients +- Monitor Apache Pulsar status with Prometheus and Grafana + +For deploying a Pulsar cluster for production usage, read the documentation on [how to configure and install a Pulsar Helm chart](helm-deploy.md). + +## Prerequisite + +- Kubernetes server 1.14.0+ +- kubectl 1.14.0+ +- Helm 3.0+ + +:::tip + +For the following steps, step 2 and step 3 are for **developers** and step 4 and step 5 are for **administrators**. + +::: + +## Step 0: Prepare a Kubernetes cluster + +Before installing a Pulsar Helm chart, you have to create a Kubernetes cluster. You can follow [the instructions](helm-prepare.md) to prepare a Kubernetes cluster. + +We use [Minikube](https://minikube.sigs.k8s.io/docs/start/) in this quick start guide. To prepare a Kubernetes cluster, follow these steps: + +1. Create a Kubernetes cluster on Minikube. + + ```bash + + minikube start --memory=8192 --cpus=4 --kubernetes-version=<k8s-version> + + ``` + + The `<k8s-version>` can be any [Kubernetes version supported by your Minikube installation](https://minikube.sigs.k8s.io/docs/reference/configuration/kubernetes/), such as `v1.16.1`. + +2. Set `kubectl` to use Minikube. + + ```bash + + kubectl config use-context minikube + + ``` + +3.
To use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) with the local Kubernetes cluster on Minikube, enter the command below: + + ```bash + + minikube dashboard + + ``` + + The command automatically triggers opening a webpage in your browser. + +## Step 1: Install Pulsar Helm chart + +1. Add Pulsar charts repo. + + ```bash + + helm repo add apache https://pulsar.apache.org/charts + + ``` + + ```bash + + helm repo update + + ``` + +2. Clone the Pulsar Helm chart repository. + + ```bash + + git clone https://github.com/apache/pulsar-helm-chart + cd pulsar-helm-chart + + ``` + +3. Run the script `prepare_helm_release.sh` to create secrets required for installing the Apache Pulsar Helm chart. The username `pulsar` and password `pulsar` are used for logging into the Grafana dashboard and Pulsar Manager. + + ```bash + + ./scripts/pulsar/prepare_helm_release.sh \ + -n pulsar \ + -k pulsar-mini \ + -c + + ``` + +4. Use the Pulsar Helm chart to install a Pulsar cluster to Kubernetes. + + :::note + + You need to specify `--set initialize=true` when installing Pulsar the first time. This command installs and starts Apache Pulsar. + + ::: + + ```bash + + helm install \ + --values examples/values-minikube.yaml \ + --set initialize=true \ + --namespace pulsar \ + pulsar-mini apache/pulsar + + ``` + +5. Check the status of all pods. + + ```bash + + kubectl get pods -n pulsar + + ``` + + If all pods start up successfully, you can see that the `STATUS` is changed to `Running` or `Completed`. 
+ + **Output** + + ```bash + + NAME READY STATUS RESTARTS AGE + pulsar-mini-bookie-0 1/1 Running 0 9m27s + pulsar-mini-bookie-init-5gphs 0/1 Completed 0 9m27s + pulsar-mini-broker-0 1/1 Running 0 9m27s + pulsar-mini-grafana-6b7bcc64c7-4tkxd 1/1 Running 0 9m27s + pulsar-mini-prometheus-5fcf5dd84c-w8mgz 1/1 Running 0 9m27s + pulsar-mini-proxy-0 1/1 Running 0 9m27s + pulsar-mini-pulsar-init-t7cqt 0/1 Completed 0 9m27s + pulsar-mini-pulsar-manager-9bcbb4d9f-htpcs 1/1 Running 0 9m27s + pulsar-mini-toolset-0 1/1 Running 0 9m27s + pulsar-mini-zookeeper-0 1/1 Running 0 9m27s + + ``` + +6. Check the status of all services in the namespace `pulsar`. + + ```bash + + kubectl get services -n pulsar + + ``` + + **Output** + + ```bash + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + pulsar-mini-bookie ClusterIP None 3181/TCP,8000/TCP 11m + pulsar-mini-broker ClusterIP None 8080/TCP,6650/TCP 11m + pulsar-mini-grafana LoadBalancer 10.106.141.246 3000:31905/TCP 11m + pulsar-mini-prometheus ClusterIP None 9090/TCP 11m + pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 11m + pulsar-mini-pulsar-manager LoadBalancer 10.103.192.175 9527:30190/TCP 11m + pulsar-mini-toolset ClusterIP None 11m + pulsar-mini-zookeeper ClusterIP None 2888/TCP,3888/TCP,2181/TCP 11m + + ``` + +## Step 2: Use pulsar-admin to create Pulsar tenants/namespaces/topics + +`pulsar-admin` is the CLI (command-Line Interface) tool for Pulsar. In this step, you can use `pulsar-admin` to create resources, including tenants, namespaces, and topics. + +1. Enter the `toolset` container. + + ```bash + + kubectl exec -it -n pulsar pulsar-mini-toolset-0 -- /bin/bash + + ``` + +2. In the `toolset` container, create a tenant named `apache`. + + ```bash + + bin/pulsar-admin tenants create apache + + ``` + + Then you can list the tenants to see if the tenant is created successfully. + + ```bash + + bin/pulsar-admin tenants list + + ``` + + You should see a similar output as below. 
The tenant `apache` has been successfully created. + + ```bash + + "apache" + "public" + "pulsar" + + ``` + +3. In the `toolset` container, create a namespace named `pulsar` in the tenant `apache`. + + ```bash + + bin/pulsar-admin namespaces create apache/pulsar + + ``` + + Then you can list the namespaces of tenant `apache` to see if the namespace is created successfully. + + ```bash + + bin/pulsar-admin namespaces list apache + + ``` + + You should see output similar to the following. The namespace `apache/pulsar` has been successfully created. + + ```bash + + "apache/pulsar" + + ``` + +4. In the `toolset` container, create a topic `test-topic` with `4` partitions in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics create-partitioned-topic apache/pulsar/test-topic -p 4 + + ``` + +5. In the `toolset` container, list all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + bin/pulsar-admin topics list-partitioned-topics apache/pulsar + + ``` + + Then you can see all the partitioned topics in the namespace `apache/pulsar`. + + ```bash + + "persistent://apache/pulsar/test-topic" + + ``` + +## Step 3: Use Pulsar client to produce and consume messages + +You can use the Pulsar client to create producers and consumers to produce and consume messages. + +By default, the Pulsar Helm chart exposes the Pulsar cluster through a Kubernetes `LoadBalancer`. In Minikube, you can use the following command to check the proxy service. + +```bash + +kubectl get services -n pulsar | grep pulsar-mini-proxy + +``` + +You will see output similar to the following. + +```bash + +pulsar-mini-proxy LoadBalancer 10.97.240.109 80:32305/TCP,6650:31816/TCP 28m + +``` + +This output shows the node ports to which the Pulsar cluster's binary port and HTTP port are mapped. The port after `80:` is the HTTP port while the port after `6650:` is the binary port.
+ +Then you can find the IP address and exposed ports of your Minikube server by running the following command. + +```bash + +minikube service pulsar-mini-proxy -n pulsar + +``` + +**Output** + +```bash + +|-----------|-------------------|-------------|-------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|-------------------------| +| pulsar | pulsar-mini-proxy | http/80 | http://172.17.0.4:32305 | +| | | pulsar/6650 | http://172.17.0.4:31816 | +|-----------|-------------------|-------------|-------------------------| +🏃 Starting tunnel for service pulsar-mini-proxy. +|-----------|-------------------|-------------|------------------------| +| NAMESPACE | NAME | TARGET PORT | URL | +|-----------|-------------------|-------------|------------------------| +| pulsar | pulsar-mini-proxy | | http://127.0.0.1:61853 | +| | | | http://127.0.0.1:61854 | +|-----------|-------------------|-------------|------------------------| + +``` + +At this point, you have the service URLs that your Pulsar client needs to connect to the cluster. Here are URL examples: + +``` + +webServiceUrl=http://127.0.0.1:61853/ +brokerServiceUrl=pulsar://127.0.0.1:61854/ + +``` + +Then you can proceed with the following steps: + +1. Download the Apache Pulsar tarball from the [downloads page](https://pulsar.apache.org/download/). + +2. Decompress the tarball based on your download file. + + ```bash + + tar -xf <file-name>.tar.gz + + ``` + +3. Expose `PULSAR_HOME`. + + (1) Enter the directory of the decompressed download file. + + (2) Expose `PULSAR_HOME` as the environment variable. + + ```bash + + export PULSAR_HOME=$(pwd) + + ``` + +4. Configure the Pulsar client. + + In the `${PULSAR_HOME}/conf/client.conf` file, replace `webServiceUrl` and `brokerServiceUrl` with the service URLs you get from the above steps. + +5. Create a subscription to consume messages from `apache/pulsar/test-topic`.
+ + ```bash + + bin/pulsar-client consume -s sub apache/pulsar/test-topic -n 0 + + ``` + +6. Open a new terminal. In the new terminal, create a producer and send 10 messages to the `test-topic` topic. + + ```bash + + bin/pulsar-client produce apache/pulsar/test-topic -m "---------hello apache pulsar-------" -n 10 + + ``` + +7. Verify the results. + + - From the producer side + + **Output** + + The messages have been produced successfully. + + ```bash + + 18:15:15.489 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 10 messages successfully produced + + ``` + + - From the consumer side + + **Output** + + At the same time, you can receive the messages as below. + + ```bash + + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + ----- got message ----- + ---------hello apache pulsar------- + + ``` + +## Step 4: Use Pulsar Manager to manage the cluster + +[Pulsar Manager](administration-pulsar-manager.md) is a web-based GUI management tool for managing and monitoring Pulsar. + +1. By default, the `Pulsar Manager` is exposed as a separate `LoadBalancer`. You can open the Pulsar Manager UI using the following command: + + ```bash + + minikube service -n pulsar pulsar-mini-pulsar-manager + + ``` + +2. The Pulsar Manager UI will be open in your browser. You can use the username `pulsar` and password `pulsar` to log into Pulsar Manager. + +3. In Pulsar Manager UI, you can create an environment. 
+ + - Click `New Environment` button in the top-left corner. + - Type `pulsar-mini` for the field `Environment Name` in the popup window. + - Type `http://pulsar-mini-broker:8080` for the field `Service URL` in the popup window. + - Click `Confirm` button in the popup window. + +4. After successfully creating an environment, you are redirected to the `tenants` page of that environment. Then you can create `tenants`, `namespaces` and `topics` using the Pulsar Manager. + +## Step 5: Use Prometheus and Grafana to monitor cluster + +Grafana is an open-source visualization tool, which can be used for visualizing time series data into dashboards. + +1. By default, the Grafana is exposed as a separate `LoadBalancer`. You can open the Grafana UI using the following command: + + ```bash + + minikube service pulsar-mini-grafana -n pulsar + + ``` + +2. The Grafana UI is open in your browser. You can use the username `pulsar` and password `pulsar` to log into the Grafana Dashboard. + +3. You can view dashboards for different components of a Pulsar cluster. diff --git a/site2/website/versioned_docs/version-2.9.x/performance-pulsar-perf.md b/site2/website/versioned_docs/version-2.9.x/performance-pulsar-perf.md new file mode 100644 index 0000000000000..7f45498604536 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/performance-pulsar-perf.md @@ -0,0 +1,229 @@ +--- +id: performance-pulsar-perf +title: Pulsar Perf +sidebar_label: "Pulsar Perf" +original_id: performance-pulsar-perf +--- + +The Pulsar Perf is a built-in performance test tool for Apache Pulsar. You can use the Pulsar Perf to test message writing or reading performance. For detailed information about performance tuning, see [here](https://streamnative.io/en/blog/tech/2021-01-14-pulsar-architecture-performance-tuning). + +## Produce messages + +This example shows how the Pulsar Perf produces messages with default options. 
For all configuration options available for the `pulsar-perf produce` command, see [configuration options](#configuration-options-for-pulsar-perf-produce). + +``` + +bin/pulsar-perf produce my-topic + +``` + +After the command is executed, the test data is continuously output on the Console. + +**Output** + +``` + +19:53:31.459 [pulsar-perf-producer-exec-1-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Created 1 producers +19:53:31.482 [pulsar-timer-5-1] WARN com.scurrilous.circe.checksum.Crc32cIntChecksum - Failed to load Circe JNI library. Falling back to Java based CRC32c provider +19:53:40.861 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 93.7 msg/s --- 0.7 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.575 ms - med: 3.460 - 95pct: 4.790 - 99pct: 5.308 - 99.9pct: 5.834 - 99.99pct: 6.609 - Max: 6.609 +19:53:50.909 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.437 ms - med: 3.328 - 95pct: 4.656 - 99pct: 5.071 - 99.9pct: 5.519 - 99.99pct: 5.588 - Max: 5.588 +19:54:00.926 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.376 ms - med: 3.276 - 95pct: 4.520 - 99pct: 4.939 - 99.9pct: 5.440 - 99.99pct: 5.490 - Max: 5.490 +19:54:10.940 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.298 ms - med: 3.220 - 95pct: 4.474 - 99pct: 4.926 - 99.9pct: 5.645 - 99.99pct: 5.654 - Max: 5.654 +19:54:20.956 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.1 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.308 ms - med: 3.199 - 95pct: 4.532 - 99pct: 4.871 - 99.9pct: 5.291 - 99.99pct: 5.323 - Max: 5.323 +19:54:30.972 [main] INFO 
org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.249 ms - med: 3.144 - 95pct: 4.437 - 99pct: 4.970 - 99.9pct: 5.329 - 99.99pct: 5.414 - Max: 5.414 +19:54:40.987 [main] INFO org.apache.pulsar.testclient.PerformanceProducer - Throughput produced: 100.0 msg/s --- 0.8 Mbit/s --- failure 0.0 msg/s --- Latency: mean: 3.435 ms - med: 3.361 - 95pct: 4.772 - 99pct: 5.150 - 99.9pct: 5.373 - 99.99pct: 5.837 - Max: 5.837 +^C19:54:44.325 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Aggregated throughput stats --- 7286 records sent --- 99.140 msg/s --- 0.775 Mbit/s +19:54:44.336 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceProducer - Aggregated latency stats --- Latency: mean: 3.383 ms - med: 3.293 - 95pct: 4.610 - 99pct: 5.059 - 99.9pct: 5.588 - 99.99pct: 5.837 - 99.999pct: 6.609 - Max: 6.609 + +``` + +From the above test data, you can get the throughput statistics and the write latency statistics. The aggregated statistics is printed when the Pulsar Perf is stopped. You can press **Ctrl**+**C** to stop the Pulsar Perf. If you specify a filename with the `--histogram-file` parameter, a file with the [HdrHistogram](http://hdrhistogram.github.io/HdrHistogram/) formatted test result appears under your directory after Pulsar Perf is stopped. You can also check the test result through [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html). For details about how to check the test result through [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html), see [HdrHistogram Plotter](#hdrhistogram-plotter). + +### Configuration options for `pulsar-perf produce` + +You can get all options by executing the `bin/pulsar-perf produce -h` command. Therefore, you can modify these options as required. + +The following table lists configuration options available for the `pulsar-perf produce` command. 
+ +| Option | Description | Default value| +|----|----|----| +| access-mode | Set the producer access mode. Valid values are `Shared`, `Exclusive` and `WaitForExclusive`. | Shared | +| admin-url | Set the Pulsar admin URL. | N/A | +| auth-params | Set the authentication parameters, whose format is determined by the implementation of the `configure` method in the authentication plugin class, such as "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}". | N/A | +| auth-plugin | Set the authentication plugin class name. | N/A | +| listener-name | Set the listener name for the broker. | N/A | +| batch-max-bytes | Set the maximum number of bytes for each batch. | 4194304 | +| batch-max-messages | Set the maximum number of messages for each batch. | 1000 | +| batch-time-window | Set a window for a batch of messages. | 1 ms | +| busy-wait | Enable or disable Busy-Wait on the Pulsar client. | false | +| chunking | Configure whether to split the message and publish in chunks if message size is larger than allowed max size. | false | +| compression | Compress the message payload. | N/A | +| conf-file | Set the configuration file. | N/A | +| delay | Mark messages with a given delay. | 0s | +| encryption-key-name | Set the name of the public key used to encrypt the payload. | N/A | +| encryption-key-value-file | Set the file which contains the public key used to encrypt the payload. | N/A | +| exit-on-failure | Configure whether to exit from the process on publish failure. | false | +| format-class | Set the custom formatter class name. | org.apache.pulsar.testclient.DefaultMessageFormatter | +| format-payload | Configure whether to format %i as a message index in the stream from producer and/or %t as the timestamp nanoseconds. | false | +| help | Configure the help message. | false | +| histogram-file | HdrHistogram output file | N/A | +| max-connections | Set the maximum number of TCP connections to a single broker. 
| 100 | +| max-outstanding | Set the maximum number of outstanding messages. | 1000 | +| max-outstanding-across-partitions | Set the maximum number of outstanding messages across partitions. | 50000 | +| message-key-generation-mode | Set the generation mode of message key. Valid options are `autoIncrement`, `random`. | N/A | +| num-io-threads | Set the number of threads to be used for handling connections to brokers. | 1 | +| num-messages | Set the number of messages to be published in total. If it is set to 0, it keeps publishing messages. | 0 | +| num-producers | Set the number of producers for each topic. | 1 | +| num-test-threads | Set the number of test threads. | 1 | +| num-topic | Set the number of topics. | 1 | +| partitions | Configure whether to create partitioned topics with the given number of partitions. | N/A | +| payload-delimiter | Set the delimiter used to split lines when using payload from a file. | \n | +| payload-file | Use the payload from a UTF-8 encoded text file and a payload is randomly selected when messages are published. | N/A | +| producer-name | Set the producer name. | N/A | +| rate | Set the publish rate of messages across topics. | 100 | +| send-timeout | Set the sendTimeout. | 0 | +| separator | Set the separator between the topic and topic number. | - | +| service-url | Set the Pulsar service URL. | | +| size | Set the message size. | 1024 bytes | +| stats-interval-seconds | Set the statistics interval. If it is set to 0, statistics is disabled. | 0 | +| test-duration | Set the test duration. If it is set to 0, it keeps publishing tests. | 0s | +| trust-cert-file | Set the path for the trusted TLS certificate file. | | +| warmup-time | Set the warm-up time. | 1s | +| tls-allow-insecure | Configure whether to allow insecure TLS connections. | N/A | + +## Consume messages + +This example shows how the Pulsar Perf consumes messages with default options.
+ +``` + +bin/pulsar-perf consume my-topic + +``` + +After the command is executed, the test data is continuously output on the Console. + +**Output** + +``` + +20:35:37.071 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Start receiving from 1 consumers on 1 topics +20:35:41.150 [pulsar-client-io-1-9] WARN com.scurrilous.circe.checksum.Crc32cIntChecksum - Failed to load Circe JNI library. Falling back to Java based CRC32c provider +20:35:47.092 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 59.572 msg/s -- 0.465 Mbit/s --- Latency: mean: 11.298 ms - med: 10 - 95pct: 15 - 99pct: 98 - 99.9pct: 137 - 99.99pct: 152 - Max: 152 +20:35:57.104 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.958 msg/s -- 0.781 Mbit/s --- Latency: mean: 9.176 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 18 - Max: 18 +20:36:07.115 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 100.006 msg/s -- 0.781 Mbit/s --- Latency: mean: 9.316 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +20:36:17.125 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 100.085 msg/s -- 0.782 Mbit/s --- Latency: mean: 9.327 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +20:36:27.136 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.900 msg/s -- 0.780 Mbit/s --- Latency: mean: 9.404 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +20:36:37.147 [main] INFO org.apache.pulsar.testclient.PerformanceConsumer - Throughput received: 99.985 msg/s -- 0.781 Mbit/s --- Latency: mean: 8.998 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 17 - 99.99pct: 17 - Max: 17 +^C20:36:42.755 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceConsumer - Aggregated throughput stats --- 6051 records received --- 92.125 msg/s --- 0.720 Mbit/s 
+20:36:42.759 [Thread-1] INFO org.apache.pulsar.testclient.PerformanceConsumer - Aggregated latency stats --- Latency: mean: 9.422 ms - med: 9 - 95pct: 15 - 99pct: 16 - 99.9pct: 98 - 99.99pct: 137 - 99.999pct: 152 - Max: 152 + +``` + +From the output test data, you can get the throughput statistics and the end-to-end latency statistics. The aggregated statistics is printed after the Pulsar Perf is stopped. You can press **Ctrl**+**C** to stop the Pulsar Perf. + +### Configuration options for `pulsar-perf consume` + +You can get all options by executing the `bin/pulsar-perf consume -h` command. Therefore, you can modify these options as required. + +The following table lists configuration options available for the `pulsar-perf consume` command. + +| Option | Description | Default value | +|----|----|----| +| acks-delay-millis | Set the acknowledgment grouping delay in milliseconds. | 100 ms | +| auth-params | Set the authentication parameters, whose format is determined by the implementation of the `configure` method in the authentication plugin class, such as "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}". | N/A | +| auth-plugin | Set the authentication plugin class name. | N/A | +| auto_ack_chunk_q_full | Configure whether to automatically ack for the oldest message in receiver queue if the queue is full. | false | +| listener-name | Set the listener name for the broker. | N/A | +| batch-index-ack | Enable or disable the batch index acknowledgment. | false | +| busy-wait | Enable or disable Busy-Wait on the Pulsar client. | false | +| conf-file | Set the configuration file. | N/A | +| encryption-key-name | Set the name of the public key used to encrypt the payload. | N/A | +| encryption-key-value-file | Set the file which contains the public key used to encrypt the payload. | N/A | +| help | Configure the help message. 
| false | +| histogram-file | HdrHistogram output file | N/A | +| expire_time_incomplete_chunked_messages | Set the expiration time for incomplete chunk messages (in milliseconds). | 0 | +| max-connections | Set the maximum number of TCP connections to a single broker. | 100 | +| max_chunked_msg | Set the max pending chunk messages. | 0 | +| num-consumers | Set the number of consumers for each topic. | 1 | +| num-io-threads |Set the number of threads to be used for handling connections to brokers. | 1 | +| num-subscriptions | Set the number of subscriptions (per topic). | 1 | +| num-topic | Set the number of topics. | 1 | +| pool-messages | Configure whether to use the pooled message. | true | +| rate | Simulate a slow message consumer (rate in msg/s). | 0.0 | +| receiver-queue-size | Set the size of the receiver queue. | 1000 | +| receiver-queue-size-across-partitions | Set the max total size of the receiver queue across partitions. | 50000 | +| replicated | Configure whether the subscription status should be replicated. | false | +| service-url | Set the Pulsar service URL. | | +| stats-interval-seconds | Set the statistics interval. If it is set to 0, statistics is disabled. | 0 | +| subscriber-name | Set the subscriber name prefix. | | +| subscription-position | Set the subscription position. Valid values are `Latest`, `Earliest`.| Latest | +| subscription-type | Set the subscription type.
  • Exclusive
  • Shared
  • Failover
  • Key_Shared
  • | Exclusive | +| test-duration | Set the test duration (in seconds). If the value is 0 or smaller than 0, it keeps consuming messages. | 0 | +| tls-allow-insecure | Set the allowed insecure TLS connection. | N/A | +| trust-cert-file | Set the path for the trusted TLS certificate file. | | + +## Configurations + +By default, the Pulsar Perf uses `conf/client.conf` as the default configuration and uses `conf/log4j2.yaml` as the default Log4j configuration. If you want to connect to other Pulsar clusters, you can update the `brokerServiceUrl` in the client configuration. + +You can use the following commands to change the configuration file and the Log4j configuration file. + +``` + +export PULSAR_CLIENT_CONF= +export PULSAR_LOG_CONF= + +``` + +In addition, you can use the following command to configure the JVM configuration through environment variables: + +``` + +export PULSAR_EXTRA_OPTS='-Xms4g -Xmx4g -XX:MaxDirectMemorySize=4g' + +``` + +## HdrHistogram Plotter + +The [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html) is a visualization tool for checking Pulsar Perf test results, which makes it easier to observe the test results. + +To check test results through the HdrHistogram Plotter, follow these steps: + +1. Clone the HdrHistogram repository from GitHub to the local. + + ``` + + git clone https://github.com/HdrHistogram/HdrHistogram.git + + ``` + +2. Switch to the HdrHistogram folder. + + ``` + + cd HdrHistogram + + ``` + +3. Install the HdrHistogram Plotter. + + ``` + + mvn clean install -DskipTests + + ``` + +4. Transform the file generated by the Pulsar Perf. + + ``` + + ./HistogramLogProcessor -i -o + + ``` + +5. You will get two output files. Upload the output file with the filename extension of .hgrm to the [HdrHistogram Plotter](https://hdrhistogram.github.io/HdrHistogram/plotFiles.html). + +6. Check the test result through the Graphical User Interface of the HdrHistogram Plotter, as shown below. 
+ + ![](/assets/perf-produce.png) diff --git a/site2/website/versioned_docs/version-2.9.x/reference-cli-tools.md b/site2/website/versioned_docs/version-2.9.x/reference-cli-tools.md new file mode 100644 index 0000000000000..3bcff3c4bede8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/reference-cli-tools.md @@ -0,0 +1,942 @@ +--- +id: reference-cli-tools +title: Pulsar command-line tools +sidebar_label: "Pulsar CLI tools" +original_id: reference-cli-tools +--- + +Pulsar offers several command-line tools that you can use for managing Pulsar installations, performance testing, using command-line producers and consumers, and more. + +All Pulsar command-line tools can be run from the `bin` directory of your [installed Pulsar package](getting-started-standalone.md). The following tools are currently documented: + +* [`pulsar`](#pulsar) +* [`pulsar-client`](#pulsar-client) +* [`pulsar-daemon`](#pulsar-daemon) +* [`pulsar-perf`](#pulsar-perf) +* [`bookkeeper`](#bookkeeper) +* [`broker-tool`](#broker-tool) + +> ### Getting help +> You can get help for any CLI tool, command, or subcommand using the `--help` flag, or `-h` for short. Here's an example: + +> ```shell +> +> $ bin/pulsar broker --help +> +> +> ``` + + +## `pulsar` + +The pulsar tool is used to start Pulsar components, such as bookies and ZooKeeper, in the foreground. + +These processes can also be started in the background, using nohup, using the pulsar-daemon tool, which has the same command interface as pulsar. + +Usage: + +```bash + +$ pulsar command + +``` + +Commands: +* `bookie` +* `broker` +* `compact-topic` +* `configuration-store` +* `initialize-cluster-metadata` +* `proxy` +* `standalone` +* `websocket` +* `zookeeper` +* `zookeeper-shell` + +Example: + +```bash + +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar broker + +``` + +The table below lists the environment variables that you can use to configure the `pulsar` tool. 
+ +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|`conf/log4j2.yaml`| +|`PULSAR_BROKER_CONF`|Configuration file for broker|`conf/broker.conf`| +|`PULSAR_BOOKKEEPER_CONF`|Configuration file for bookie|`conf/bookkeeper.conf`| +|`PULSAR_ZK_CONF`|Configuration file for zookeeper|`conf/zookeeper.conf`| +|`PULSAR_CONFIGURATION_STORE_CONF`|Configuration file for the configuration store|`conf/global_zookeeper.conf`| +|`PULSAR_WEBSOCKET_CONF`|Configuration file for websocket proxy|`conf/websocket.conf`| +|`PULSAR_STANDALONE_CONF`|Configuration file for standalone|`conf/standalone.conf`| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the jvm|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| +|`PULSAR_PID_DIR`|Folder where the pulsar server PID file should be stored|| +|`PULSAR_STOP_TIMEOUT`|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| + + + +### `bookie` + +Starts up a bookie server + +Usage: + +```bash + +$ pulsar bookie options + +``` + +Options + +|Option|Description|Default| +|---|---|---| +|`-readOnly`|Force start a read-only bookie server|false| +|`-withAutoRecovery`|Start auto-recovery service bookie server|false| + + +Example + +```bash + +$ PULSAR_BOOKKEEPER_CONF=/path/to/bookkeeper.conf pulsar bookie \ + -readOnly \ + -withAutoRecovery + +``` + +### `broker` + +Starts up a Pulsar broker + +Usage + +```bash + +$ pulsar broker options + +``` + +Options + +|Option|Description|Default| +|---|---|---| +|`-bc` , `--bookie-conf`|Configuration file for BookKeeper|| +|`-rb` , `--run-bookie`|Run a BookKeeper bookie on the same host as the Pulsar broker|false| +|`-ra` , `--run-bookie-autorecovery`|Run a BookKeeper autorecovery daemon on the same host as the Pulsar broker|false| + +Example + +```bash + +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar broker + +``` + +### `compact-topic` + +Run compaction against a Pulsar topic (in a new 
process) + +Usage + +```bash + +$ pulsar compact-topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-t` , `--topic`|The Pulsar topic that you would like to compact|| + +Example + +```bash + +$ pulsar compact-topic --topic topic-to-compact + +``` + +### `configuration-store` + +Starts up the Pulsar configuration store + +Usage + +```bash + +$ pulsar configuration-store + +``` + +Example + +```bash + +$ PULSAR_CONFIGURATION_STORE_CONF=/path/to/configuration_store.conf pulsar configuration-store + +``` + +### `initialize-cluster-metadata` + +One-time cluster metadata initialization + +Usage + +```bash + +$ pulsar initialize-cluster-metadata options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-ub` , `--broker-service-url`|The broker service URL for the new cluster|| +|`-tb` , `--broker-service-url-tls`|The broker service URL for the new cluster with TLS encryption|| +|`-c` , `--cluster`|Cluster name|| +|`-cs` , `--configuration-store`|The configuration store quorum connection string|| +|`--existing-bk-metadata-service-uri`|The metadata service URI of the existing BookKeeper cluster that you want to use|| +|`-h` , `--help`|Help message|false| +|`--initial-num-stream-storage-containers`|The number of storage containers of BookKeeper stream storage|16| +|`--initial-num-transaction-coordinators`|The number of transaction coordinators assigned in a cluster|16| +|`-uw` , `--web-service-url`|The web service URL for the new cluster|| +|`-tw` , `--web-service-url-tls`|The web service URL for the new cluster with TLS encryption|| +|`-zk` , `--zookeeper`|The local ZooKeeper quorum connection string|| +|`--zookeeper-session-timeout-ms`|The local ZooKeeper session timeout. 
The time unit is milliseconds (ms)|30000| + + +### `proxy` + +Manages the Pulsar proxy + +Usage + +```bash + +$ pulsar proxy options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--configuration-store`|Configuration store connection string|| +|`-zk` , `--zookeeper-servers`|Local ZooKeeper connection string|| + +Example + +```bash + +$ PULSAR_PROXY_CONF=/path/to/proxy.conf pulsar proxy \ + --zookeeper-servers zk-0,zk-1,zk-2 \ + --configuration-store zk-0,zk-1,zk-2 + +``` + +### `standalone` + +Run a broker service with local bookies and local ZooKeeper + +Usage + +```bash + +$ pulsar standalone options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-a` , `--advertised-address`|The standalone broker advertised address|| +|`--bookkeeper-dir`|Local bookies’ base data directory|data/standalone/bookkeeper| +|`--bookkeeper-port`|Local bookies’ base port|3181| +|`--no-broker`|Only start ZooKeeper and BookKeeper services, not the broker|false| +|`--num-bookies`|The number of local bookies|1| +|`--only-broker`|Only start the Pulsar broker service (not ZooKeeper or BookKeeper)|| +|`--wipe-data`|Clean up previous ZooKeeper/BookKeeper data|| +|`--zookeeper-dir`|Local ZooKeeper’s data directory|data/standalone/zookeeper| +|`--zookeeper-port` |Local ZooKeeper’s port|2181| + +Example + +```bash + +$ PULSAR_STANDALONE_CONF=/path/to/standalone.conf pulsar standalone + +``` + +### `websocket` + +Usage + +```bash + +$ pulsar websocket + +``` + +Example + +```bash + +$ PULSAR_WEBSOCKET_CONF=/path/to/websocket.conf pulsar websocket + +``` + +### `zookeeper` + +Starts up a ZooKeeper cluster + +Usage + +```bash + +$ pulsar zookeeper + +``` + +Example + +```bash + +$ PULSAR_ZK_CONF=/path/to/zookeeper.conf pulsar zookeeper + +``` + +### `zookeeper-shell` + +Connects to a running ZooKeeper cluster using the ZooKeeper shell + +Usage + +```bash + +$ pulsar zookeeper-shell options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, 
`--conf`|Configuration file for ZooKeeper|| +|`-server`|Configuration zk address, eg: `127.0.0.1:2181`|| + + + +## `pulsar-client` + +The pulsar-client tool + +Usage + +```bash + +$ pulsar-client command + +``` + +Commands +* `produce` +* `consume` + + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{\"key1\":\"val1\",\"key2\":\"val2\"}"|{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}| +|`--auth-plugin`|Authentication plugin class name|org.apache.pulsar.client.impl.auth.AuthenticationSasl| +|`--listener-name`|Listener name for the broker|| +|`--proxy-protocol`|Proxy protocol to select type of routing at proxy|| +|`--proxy-url`|Proxy-server URL to which to connect|| +|`--url`|Broker URL to which to connect|pulsar://localhost:6650/
    ws://localhost:8080 | +| `-v`, `--version` | Get the version of the Pulsar client | | +|`-h`, `--help`|Show this help|| + + +### `produce` +Send a message or messages to a specific broker and topic + +Usage + +```bash + +$ pulsar-client produce topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-f`, `--files`|Comma-separated file paths to send; either -m or -f must be specified|[]| +|`-m`, `--messages`|Comma-separated string of messages to send; either -m or -f must be specified|[]| +|`-n`, `--num-produce`|The number of times to send the message(s); the count of messages/files * num-produce should be below 1000|1| +|`-r`, `--rate`|Rate (in messages per second) at which to produce; a value 0 means to produce messages as fast as possible|0.0| +|`-db`, `--disable-batching`|Disable batch sending of messages.
    **Note:** This flag is only available in 2.9.2 and later versions.|false| +|`-c`, `--chunking`|Split the message and publish in chunks if the message size is larger than the allowed max size|false| +|`-s`, `--separator`|Character to split messages string with.|","| +|`-k`, `--key`|Message key to add|key=value string, like k1=v1,k2=v2.| +|`-p`, `--properties`|Properties to add. If you want to add multiple properties, use the comma as the separator, e.g. `k1=v1,k2=v2`.| | +|`-ekn`, `--encryption-key-name`|The public key name to encrypt payload.| | +|`-ekv`, `--encryption-key-value`|The URI of public key to encrypt payload. For example, `file:///path/to/public.key` or `data:application/x-pem-file;base64,*****`.| | + + +### `consume` +Consume messages from a specific broker and topic + +Usage + +```bash + +$ pulsar-client consume topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--hex`|Display binary messages in hexadecimal format.|false| +|`-n`, `--num-messages`|Number of messages to consume, 0 means to consume forever.|1| +|`-r`, `--rate`|Rate (in messages per second) at which to consume; a value 0 means to consume messages as fast as possible|0.0| +|`--regex`|Indicate the topic name is a regex pattern|false| +|`-s`, `--subscription-name`|Subscription name|| +|`-t`, `--subscription-type`|The type of the subscription. Possible values: Exclusive, Shared, Failover, Key_Shared.|Exclusive| +|`-p`, `--subscription-position`|The position of the subscription. Possible values: Latest, Earliest.|Latest| +|`-m`, `--subscription-mode`|Subscription mode.|Durable| +|`-q`, `--queue-size`|The size of consumer's receiver queue.|0| +|`-mc`, `--max_chunked_msg`|Max pending chunk messages.|0| +|`-ac`, `--auto_ack_chunk_q_full`|Auto ack for the oldest message in consumer's receiver queue if the queue full.|false| +|`--hide-content`|Do not print the message to the console.|false| +|`-st`, `--schema-type`|Set the schema type. 
Use `auto_consume` to dump AVRO and other structured data types. Possible values: bytes, auto_consume.|bytes| +|`-ekv`, `--encryption-key-value`|The URI of public key to encrypt payload. For example, `file:///path/to/public.key` or `data:application/x-pem-file;base64,*****`.| | +|`-pm`, `--pool-messages`|Use the pooled message.|true| + +## `pulsar-daemon` +A wrapper around the pulsar tool that’s used to start and stop processes, such as ZooKeeper, bookies, and Pulsar brokers, in the background using nohup. + +pulsar-daemon has a similar interface to the pulsar command but adds start and stop commands for various services. For a listing of those services, run pulsar-daemon to see the help output or see the documentation for the pulsar command. + +Usage + +```bash + +$ pulsar-daemon command + +``` + +Commands +* `start` +* `stop` + + +### `start` +Start a service in the background using nohup. + +Usage + +```bash + +$ pulsar-daemon start service + +``` + +### `stop` +Stop a service that’s already been started using start. + +Usage + +```bash + +$ pulsar-daemon stop service options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|-force|Stop the service forcefully if not stopped by normal shutdown.|false| + + + +## `pulsar-perf` +A tool for performance testing a Pulsar broker. + +Usage + +```bash + +$ pulsar-perf command + +``` + +Commands +* `consume` +* `produce` +* `read` +* `websocket-producer` +* `managed-ledger` +* `monitor-brokers` +* `simulation-client` +* `simulation-controller` +* `help` + +Environment variables + +The table below lists the environment variables that you can use to configure the pulsar-perf tool. 
+ +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|conf/log4j2.yaml| +|`PULSAR_CLIENT_CONF`|Configuration file for the client|conf/client.conf| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the JVM|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| + + +### `consume` +Run a consumer + +Usage + +``` + +$ pulsar-perf consume options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`-ac`, `--auto_ack_chunk_q_full`|Auto ack for the oldest message in consumer's receiver queue if the queue full|false| +|`--listener-name`|Listener name for the broker|| +|`--acks-delay-millis`|Acknowledgements grouping delay in millis|100| +|`--batch-index-ack`|Enable or disable the batch index acknowledgment|false| +|`-bw`, `--busy-wait`|Enable or disable Busy-Wait on the Pulsar client|false| +|`-v`, `--encryption-key-value-file`|The file which contains the private key to decrypt payload|| +|`-h`, `--help`|Help message|false| +|`--conf-file`|Configuration file|| +|`-m`, `--num-messages`|Number of messages to consume in total. 
If the value is equal to or smaller than 0, it keeps consuming messages.|0| +|`-e`, `--expire_time_incomplete_chunked_messages`|The expiration time for incomplete chunk messages (in milliseconds)|0| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-mc`, `--max_chunked_msg`|Max pending chunk messages|0| +|`-n`, `--num-consumers`|Number of consumers (per topic)|1| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-ns`, `--num-subscriptions`|Number of subscriptions (per topic)|1| +|`-t`, `--num-topics`|The number of topics|1| +|`-pm`, `--pool-messages`|Use the pooled message|true| +|`-r`, `--rate`|Simulate a slow message consumer (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-p`, `--receiver-queue-size-across-partitions`|Max total size of the receiver queue across partitions|50000| +|`--replicated`|Whether the subscription status should be replicated|false| +|`-u`, `--service-url`|Pulsar service URL|| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled|0| +|`-s`, `--subscriber-name`|Subscriber name prefix|| +|`-ss`, `--subscriptions`|A list of subscriptions to consume on (e.g. sub1,sub2)|sub| +|`-st`, `--subscription-type`|Subscriber type. Possible values are Exclusive, Shared, Failover, Key_Shared.|Exclusive| +|`-sp`, `--subscription-position`|Subscriber position. Possible values are Latest, Earliest.|Latest| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps consuming messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + + +### `produce` +Run a producer + +Usage + +```bash + +$ pulsar-perf produce options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-am`, `--access-mode`|Producer access mode. 
Valid values are `Shared`, `Exclusive` and `WaitForExclusive`|Shared| +|`-au`, `--admin-url`|Pulsar admin URL|| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`--listener-name`|Listener name for the broker|| +|`-b`, `--batch-time-window`|Batch messages in a window of the specified number of milliseconds|1| +|`-bb`, `--batch-max-bytes`|Maximum number of bytes per batch|4194304| +|`-bm`, `--batch-max-messages`|Maximum number of messages per batch|1000| +|`-bw`, `--busy-wait`|Enable or disable Busy-Wait on the Pulsar client|false| +|`-ch`, `--chunking`|Split the message and publish in chunks if the message size is larger than allowed max size|false| +|`-d`, `--delay`|Mark messages with a given delay in seconds|0s| +|`-z`, `--compression`|Compress messages’ payload. Possible values are NONE, LZ4, ZLIB, ZSTD or SNAPPY.|| +|`--conf-file`|Configuration file|| +|`-k`, `--encryption-key-name`|The public key name to encrypt payload|| +|`-v`, `--encryption-key-value-file`|The file which contains the public key to encrypt payload|| +|`-ef`, `--exit-on-failure`|Exit from the process on publish failure|false| +|`-fc`, `--format-class`|Custom Formatter class name|org.apache.pulsar.testclient.DefaultMessageFormatter| +|`-fp`, `--format-payload`|Format %i as a message index in the stream from producer and/or %t as the timestamp nanoseconds|false| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-o`, `--max-outstanding`|Max number of outstanding messages|1000| +|`-p`, `--max-outstanding-across-partitions`|Max number of outstanding messages across partitions|50000| +|`-m`, `--num-messages`|Number of messages to publish in total. 
If this value is less than or equal to 0, it keeps publishing messages.|0| +|`-mk`, `--message-key-generation-mode`|The generation mode of message key. Valid options are `autoIncrement`, `random`|| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-n`, `--num-producers`|The number of producers (per topic)|1| +|`-threads`, `--num-test-threads`|Number of test threads|1| +|`-t`, `--num-topic`|The number of topics|1| +|`-np`, `--partitions`|Create partitioned topics with the given number of partitions. Setting this value to 0 means not trying to create a topic|| +|`-f`, `--payload-file`|Use payload from an UTF-8 encoded text file and a payload will be randomly selected when publishing messages|| +|`-e`, `--payload-delimiter`|The delimiter used to split lines when using payload from a file|\n| +|`-pn`, `--producer-name`|Producer Name|| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`--send-timeout`|Set the sendTimeout|0| +|`--separator`|Separator between the topic and topic number|-| +|`-u`, `--service-url`|Pulsar service URL|| +|`-s`, `--size`|Message size (in bytes)|1024| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps publishing messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--warmup-time`|Warm-up time in seconds|1| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + + +### `read` +Run a topic reader + +Usage + +```bash + +$ pulsar-perf read options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. 
For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`--listener-name`|Listener name for the broker|| +|`--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-n`, `--num-messages`|Number of messages to consume in total. If the value is equal to or smaller than 0, it keeps consuming messages.|0| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-ioThreads`, `--num-io-threads`|Set the number of threads to be used for handling connections to brokers|1| +|`-t`, `--num-topics`|The number of topics|1| +|`-r`, `--rate`|Simulate a slow message reader (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-u`, `--service-url`|Pulsar service URL|| +|`-m`, `--start-message-id`|Start message id. This can be either 'earliest', 'latest' or a specific message id by using 'lid:eid'|earliest| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps consuming messages.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--use-tls`|Use TLS encryption on the connection|false| +|`--tls-allow-insecure`|Allow insecure TLS connection|| + +### `websocket-producer` +Run a websocket producer + +Usage + +```bash + +$ pulsar-perf websocket-producer options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class. For example, `key1:val1,key2:val2` or `{"key1":"val1","key2":"val2"}`.|| +|`--auth-plugin`|Authentication plugin class name|| +|`--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-m`, `--num-messages`|Number of messages to publish in total. 
If this value is less than or equal to 0, it keeps publishing messages.|0| +|`-t`, `--num-topic`|The number of topics|1| +|`-f`, `--payload-file`|Use payload from a file instead of empty buffer|| +|`-e`, `--payload-delimiter`|The delimiter used to split lines when using payload from a file|\n| +|`-fp`, `--format-payload`|Format %i as a message index in the stream from producer and/or %t as the timestamp nanoseconds|false| +|`-fc`, `--format-class`|Custom formatter class name|`org.apache.pulsar.testclient.DefaultMessageFormatter`| +|`-u`, `--proxy-url`|Pulsar Proxy URL, e.g., "ws://localhost:8080/"|| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration (in seconds). If this value is less than or equal to 0, it keeps publishing messages.|0| + + +### `managed-ledger` +Write directly on managed-ledgers + +Usage + +```bash + +$ pulsar-perf managed-ledger options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-a`, `--ack-quorum`|Ledger ack quorum|1| +|`-dt`, `--digest-type`|BookKeeper digest type. Possible Values: [CRC32, MAC, CRC32C, DUMMY]|CRC32C| +|`-e`, `--ensemble-size`|Ledger ensemble size|1| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single bookie|1| +|`-o`, `--max-outstanding`|Max number of outstanding requests|1000| +|`-m`, `--num-messages`|Number of messages to publish in total. If this value is less than or equal to 0, it keeps publishing messages.|0| +|`-t`, `--num-topic`|Number of managed ledgers|1| +|`-r`, `--rate`|Write rate msg/s across managed ledgers|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration (in seconds). 
If this value is less than or equal to 0, it keeps publishing messages.|0| +|`--threads`|Number of threads writing|1| +|`-w`, `--write-quorum`|Ledger write quorum|1| +|`-zk`, `--zookeeperServers`|ZooKeeper connection string|| + + +### `monitor-brokers` +Continuously receive broker data and/or load reports + +Usage + +```bash + +$ pulsar-perf monitor-brokers options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--connect-string`|A connection string for one or more ZooKeeper servers|| +|`-h`, `--help`|Help message|false| + + +### `simulation-client` +Run a simulation server acting as a Pulsar client. Uses the client configuration specified in `conf/client.conf`. + +Usage + +```bash + +$ pulsar-perf simulation-client options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--port`|Port to listen on for controller|0| +|`--service-url`|Pulsar Service URL|| +|`-h`, `--help`|Help message|false| + +### `simulation-controller` +Run a simulation controller to give commands to servers + +Usage + +```bash + +$ pulsar-perf simulation-controller options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--client-port`|The port that the clients are listening on|0| +|`--clients`|Comma-separated list of client hostnames|| +|`--cluster`|The cluster to test on|| +|`-h`, `--help`|Help message|false| + + +### `help` +This help message + +Usage + +```bash + +$ pulsar-perf help + +``` + +## `bookkeeper` +A tool for managing BookKeeper. + +Usage + +```bash + +$ bookkeeper command + +``` + +Commands +* `auto-recovery` +* `bookie` +* `localbookie` +* `upgrade` +* `shell` + + +Environment variables + +The table below lists the environment variables that you can use to configure the bookkeeper tool. 
+ +|Variable|Description|Default| +|---|---|---| +|BOOKIE_LOG_CONF|Log4j configuration file|conf/log4j2.yaml| +|BOOKIE_CONF|BookKeeper configuration file|conf/bk_server.conf| +|BOOKIE_EXTRA_OPTS|Extra options to be passed to the JVM|| +|BOOKIE_EXTRA_CLASSPATH|Extra paths for BookKeeper's classpath|| +|ENTRY_FORMATTER_CLASS|The Java class used to format entries|| +|BOOKIE_PID_DIR|Folder where the BookKeeper server PID file should be stored|| +|BOOKIE_STOP_TIMEOUT|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| + + +### `auto-recovery` +Runs an auto-recovery service daemon + +Usage + +```bash + +$ bookkeeper auto-recovery options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| + + +### `bookie` +Starts up a BookKeeper server (aka bookie) + +Usage + +```bash + +$ bookkeeper bookie options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| +|-readOnly|Force start a read-only bookie server|false| +|-withAutoRecovery|Start auto-recovery service bookie server|false| + + +### `localbookie` +Runs a test ensemble of N bookies locally + +Usage + +```bash + +$ bookkeeper localbookie N + +``` + +### `upgrade` +Upgrade the bookie’s filesystem + +Usage + +```bash + +$ bookkeeper upgrade options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| +|`-u`, `--upgrade`|Upgrade the bookie’s directories|| + + +### `shell` +Run shell for admin commands. To see a full listing of those commands, run bookkeeper shell without an argument. + +Usage + +```bash + +$ bookkeeper shell + +``` + +Example + +```bash + +$ bookkeeper shell bookiesanity + +``` + +## `broker-tool` + +The `broker-tool` is used for operations on a specific broker. 
+ +Usage + +```bash + +$ broker-tool command + +``` + +Commands +* `load-report` +* `help` + +Example +Two ways to get more information about a command as below: + +```bash + +$ broker-tool help command +$ broker-tool command --help + +``` + +### `load-report` + +Collect the load report of a specific broker. +The command is run on a broker, and used for troubleshooting why broker can’t collect right load report. + +Options + +|Flag|Description|Default| +|---|---|---| +|`-i`, `--interval`| Interval to collect load report, in milliseconds || +|`-h`, `--help`| Display help information || + diff --git a/site2/website/versioned_docs/version-2.9.x/reference-configuration.md b/site2/website/versioned_docs/version-2.9.x/reference-configuration.md new file mode 100644 index 0000000000000..dd0ee0274facd --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/reference-configuration.md @@ -0,0 +1,813 @@ +--- +id: reference-configuration +title: Pulsar configuration +sidebar_label: "Pulsar configuration" +original_id: reference-configuration +--- + + + + +You can manage Pulsar configuration by configuration files in the [`conf`](https://github.com/apache/pulsar/tree/master/conf) directory of a Pulsar [installation](getting-started-standalone.md). + +- [BookKeeper](#bookkeeper) +- [Broker](#broker) +- [Client](#client) +- [Log4j](#log4j) +- [Log4j shell](#log4j-shell) +- [Standalone](#standalone) +- [WebSocket](#websocket) +- [Pulsar proxy](#pulsar-proxy) +- [ZooKeeper](#zookeeper) + +## BookKeeper + +BookKeeper is a replicated log storage system that Pulsar uses for durable storage of all messages. + + +|Name|Description|Default| +|---|---|---| +|bookiePort|The port on which the bookie server listens.|3181| +|allowLoopback|Whether the bookie is allowed to use a loopback interface as its primary interface (that is the interface used to establish its identity). By default, loopback interfaces are not allowed to work as the primary interface. 
Using a loopback interface as the primary interface usually indicates a configuration error. For example, it’s fairly common in some VPS setups to not configure a hostname or to have the hostname resolve to `127.0.0.1`. If this is the case, then all bookies in the cluster will establish their identities as `127.0.0.1:3181` and only one will be able to join the cluster. For VPSs configured like this, you should explicitly set the listening interface.|false| +|listeningInterface|The network interface on which the bookie listens. By default, the bookie listens on all interfaces.|eth0| +|advertisedAddress|Configure a specific hostname or IP address that the bookie should use to advertise itself to clients. By default, the bookie advertises either its own IP address or hostname according to the `listeningInterface` and `useHostNameAsBookieID` settings.|N/A| +|allowMultipleDirsUnderSameDiskPartition|Configure the bookie to enable/disable multiple ledger/index/journal directories in the same filesystem disk partition.|false| +|minUsableSizeForIndexFileCreation|The minimum safe usable size available in index directory for bookie to create index files while replaying journal at the time of bookie starts in Readonly Mode (in bytes).|1073741824| +|journalDirectory|The directory where BookKeeper outputs its write-ahead log (WAL).|data/bookkeeper/journal| +|journalDirectories|Directories that BookKeeper outputs its write ahead log. Multiple directories are available, being separated by `,`. For example: `journalDirectories=/tmp/bk-journal1,/tmp/bk-journal2`. If `journalDirectories` is set, the bookies skip `journalDirectory` and use this setting directory.|/tmp/bk-journal| +|ledgerDirectories|The directory where BookKeeper outputs ledger snapshots. This could define multiple directories to store snapshots separated by `,`, for example `ledgerDirectories=/tmp/bk1-data,/tmp/bk2-data`. 
Ideally, ledger dirs and the journal dir are each in a different device, which reduces the contention between random I/O and sequential write. It is possible to run with a single disk, but performance will be significantly lower.|data/bookkeeper/ledgers| +|ledgerManagerType|The type of ledger manager used to manage how ledgers are stored, managed, and garbage collected. See [BookKeeper Internals](http://bookkeeper.apache.org/docs/latest/getting-started/concepts) for more info.|hierarchical| +|zkLedgersRootPath|The root ZooKeeper path used to store ledger metadata. This parameter is used by the ZooKeeper-based ledger manager as a root znode to store all ledgers.|/ledgers| +|ledgerStorageClass|Ledger storage implementation class|org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage| +|entryLogFilePreallocationEnabled|Enable or disable entry logger preallocation|true| +|logSizeLimit|Max file size of the entry logger, in bytes. A new entry log file will be created when the old one reaches the file size limitation.|2147483648| +|minorCompactionThreshold|Threshold of minor compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a minor compaction. If set to less than zero, the minor compaction is disabled.|0.2| +|minorCompactionInterval|Time interval to run minor compaction, in seconds. If set to less than zero, the minor compaction is disabled. Note: should be greater than gcWaitTime. |3600| +|majorCompactionThreshold|The threshold of major compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a major compaction. Those entry log files whose remaining size percentage is still higher than the threshold will never be compacted. If set to less than zero, the minor compaction is disabled.|0.5| +|majorCompactionInterval|The time interval to run major compaction, in seconds. If set to less than zero, the major compaction is disabled. 
Note: should be greater than gcWaitTime. |86400| +|readOnlyModeEnabled|If `readOnlyModeEnabled=true`, then on all full ledger disks, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shutdown.|true| +|forceReadOnlyBookie|Whether the bookie is force started in read only mode.|false| +|persistBookieStatusEnabled|Persist the bookie status locally on the disks. So the bookies can keep their status upon restarts.|false| +|compactionMaxOutstandingRequests|Sets the maximum number of entries that can be compacted without flushing. When compacting, the entries are written to the entrylog and the new offsets are cached in memory. Once the entrylog is flushed the index is updated with the new offsets. This parameter controls the number of entries added to the entrylog before a flush is forced. A higher value for this parameter means more memory will be used for offsets. Each offset consists of 3 longs. This parameter should not be modified unless you’re fully aware of the consequences.|100000| +|compactionRate|The rate at which compaction will read entries, in adds per second.|1000| +|isThrottleByBytes|Throttle compaction by bytes or by entries.|false| +|compactionRateByEntries|The rate at which compaction will read entries, in adds per second.|1000| +|compactionRateByBytes|Set the rate at which compaction reads entries. The unit is bytes added per second.|1000000| +|journalMaxSizeMB|Max file size of journal file, in megabytes. A new journal file will be created when the old one reaches the file size limitation.|2048| +|journalMaxBackups|The max number of old journal files to keep. 
Keeping a number of old journal files would help data recovery in special cases.|5| +|journalPreAllocSizeMB|How much space to pre-allocate at a time in the journal.|16| +|journalWriteBufferSizeKB|The size of the write buffers used for the journal.|64| +|journalRemoveFromPageCache|Whether pages should be removed from the page cache after force write.|true| +|journalAdaptiveGroupWrites|Whether to group journal force writes, which optimizes group commit for higher throughput.|true| +|journalMaxGroupWaitMSec|The maximum latency to impose on a journal write to achieve grouping.|1| +|journalAlignmentSize|All the journal writes and commits should be aligned to given size|4096| +|journalBufferedWritesThreshold|Maximum writes to buffer to achieve grouping|524288| +|journalFlushWhenQueueEmpty|If we should flush the journal when journal queue is empty|false| +|numJournalCallbackThreads|The number of threads that should handle journal callbacks|8| +|openLedgerRereplicationGracePeriod | The grace period, in milliseconds, that the replication worker waits before fencing and replicating a ledger fragment that's still being written to upon bookie failure. | 30000 | +|rereplicationEntryBatchSize|The number of max entries to keep in fragment for re-replication|100| +|autoRecoveryDaemonEnabled|Whether the bookie itself can start auto-recovery service.|true| +|lostBookieRecoveryDelay|How long to wait, in seconds, before starting auto recovery of a lost bookie.|0| +|gcWaitTime|How long the interval to trigger next garbage collection, in milliseconds. Since garbage collection is running in background, too frequent gc will hurt performance. It is better to give a higher number of gc interval if there is enough disk capacity.|900000| +|gcOverreplicatedLedgerWaitTime|How long the interval to trigger next garbage collection of overreplicated ledgers, in milliseconds.
This should not be run very frequently since we read the metadata for all the ledgers on the bookie from zk.|86400000| +|flushInterval|How long the interval to flush ledger index pages to disk, in milliseconds. Flushing index files will introduce much random disk I/O. If separating journal dir and ledger dirs each on different devices, flushing would not affect performance. But if putting journal dir and ledger dirs on same device, performance degrade significantly on too frequent flushing. You can consider increment flush interval to get better performance, but you need to pay more time on bookie server restart after failure.|60000| +|bookieDeathWatchInterval|Interval to watch whether bookie is dead or not, in milliseconds|1000| +|allowStorageExpansion|Allow the bookie storage to expand. Newly added ledger and index dirs must be empty.|false| +|zkServers|A list of one of more servers on which zookeeper is running. The server list can be comma separated values, for example: zkServers=zk1:2181,zk2:2181,zk3:2181.|localhost:2181| +|zkTimeout|ZooKeeper client session timeout in milliseconds Bookie server will exit if it received SESSION_EXPIRED because it was partitioned off from ZooKeeper for more than the session timeout JVM garbage collection, disk I/O will cause SESSION_EXPIRED. Increment this value could help avoiding this issue|30000| +|zkRetryBackoffStartMs|The start time that the Zookeeper client backoff retries in milliseconds.|1000| +|zkRetryBackoffMaxMs|The maximum time that the Zookeeper client backoff retries in milliseconds.|10000| +|zkEnableSecurity|Set ACLs on every node written on ZooKeeper, allowing users to read and write BookKeeper metadata stored on ZooKeeper. In order to make ACLs work you need to setup ZooKeeper JAAS authentication. All the bookies and Client need to share the same user, and this is usually done using Kerberos authentication. 
See ZooKeeper documentation.|false| +|httpServerEnabled|The flag enables/disables starting the admin http server.|false| +|httpServerPort|The HTTP server port to listen on. By default, the value is `8080`. If you want to keep it consistent with the Prometheus stats provider, you can set it to `8000`.|8080 +|httpServerClass|The http server class.|org.apache.bookkeeper.http.vertx.VertxHttpServer| +|serverTcpNoDelay|This settings is used to enabled/disabled Nagle’s algorithm, which is a means of improving the efficiency of TCP/IP networks by reducing the number of packets that need to be sent over the network. If you are sending many small messages, such that more than one can fit in a single IP packet, setting server.tcpnodelay to false to enable Nagle algorithm can provide better performance.|true| +|serverSockKeepalive|This setting is used to send keep-alive messages on connection-oriented sockets.|true| +|serverTcpLinger|The socket linger timeout on close. When enabled, a close or shutdown will not return until all queued messages for the socket have been successfully sent or the linger timeout has been reached. Otherwise, the call returns immediately and the closing is done in the background.|0| +|byteBufAllocatorSizeMax|The maximum buf size of the received ByteBuf allocator.|1048576| +|nettyMaxFrameSizeBytes|The maximum netty frame size in bytes. Any message received larger than this will be rejected.|5253120| +|openFileLimit|Max number of ledger index files could be opened in bookie server If number of ledger index files reaches this limitation, bookie server started to swap some ledgers from memory to disk. Too frequent swap will affect performance. You can tune this number to gain performance according your requirements.|0| +|pageSize|Size of a index page in ledger cache, in bytes A larger index page can improve performance writing page to disk, which is efficient when you have small number of ledgers and these ledgers have similar number of entries. 
If you have large number of ledgers and each ledger has fewer entries, smaller index page would improve memory usage.|8192| +|pageLimit|How many index pages provided in ledger cache If number of index pages reaches this limitation, bookie server starts to swap some ledgers from memory to disk. You can increment this value when you found swap became more frequent. But make sure pageLimit*pageSize should not more than JVM max memory limitation, otherwise you would got OutOfMemoryException. In general, incrementing pageLimit, using smaller index page would gain better performance in lager number of ledgers with fewer entries case If pageLimit is -1, bookie server will use 1/3 of JVM memory to compute the limitation of number of index pages.|0| +|readOnlyModeEnabled|If all ledger directories configured are full, then support only read requests for clients. If “readOnlyModeEnabled=true” then on all ledger disks full, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shutdown. By default this will be disabled.|true| +|diskUsageThreshold|For each ledger dir, maximum disk space which can be used. Default is 0.95f. i.e. 95% of disk can be used at most after which nothing will be written to that partition. If all ledger dir partitions are full, then bookie will turn to readonly mode if ‘readOnlyModeEnabled=true’ is set, else it will shutdown. Valid values should be in between 0 and 1 (exclusive).|0.95| +|diskCheckInterval|Disk check interval in milli seconds, interval to check the ledger dirs usage.|10000| +|auditorPeriodicCheckInterval|Interval at which the auditor will do a check of all ledgers in the cluster. By default this runs once a week. The interval is set in seconds. To disable the periodic check completely, set this to 0. 
Note that periodic checking will put extra load on the cluster, so it should not be run more frequently than once a day.|604800| +|sortedLedgerStorageEnabled|Whether sorted-ledger storage is enabled.|true| +|auditorPeriodicBookieCheckInterval|The interval between auditor bookie checks. The auditor bookie check checks ledger metadata to see which bookies should contain entries for each ledger. If a bookie which should contain entries is unavailable, then the ledger containing that entry is marked for recovery. Setting this to 0 disables the periodic check. Bookie checks will still run when a bookie fails. The interval is specified in seconds.|86400| +|numAddWorkerThreads|The number of threads that should handle write requests. If zero, the writes would be handled by netty threads directly.|0| +|numReadWorkerThreads|The number of threads that should handle read requests. If zero, the reads would be handled by netty threads directly.|8| +|numHighPriorityWorkerThreads|The number of threads that should be used for high priority requests (i.e. recovery reads and adds, and fencing).|8| +|maxPendingReadRequestsPerThread|If read workers threads are enabled, limit the number of pending requests, to avoid the executor queue to grow indefinitely.|2500| +|maxPendingAddRequestsPerThread|The limited number of pending requests, which is used to avoid the executor queue to grow indefinitely when add workers threads are enabled.|10000| +|isForceGCAllowWhenNoSpace|Whether force compaction is allowed when the disk is full or almost full. Forcing GC could get some space back, but could also fill up the disk space more quickly. This is because new log files are created before GC, while old garbage log files are deleted after GC.|false| +|verifyMetadataOnGC|True if the bookie should double check `readMetadata` prior to GC.|false| +|flushEntrylogBytes|Entry log flush interval in bytes. Flushing in smaller chunks but more frequently reduces spikes in disk I/O.
Flushing too frequently may also affect performance negatively.|268435456| +|readBufferSizeBytes|The number of bytes we should use as capacity for BufferedReadChannel.|4096| +|writeBufferSizeBytes|The number of bytes used as capacity for the write buffer|65536| +|useHostNameAsBookieID|Whether the bookie should use its hostname to register with the coordination service (e.g.: zookeeper service). When false, bookie will use its ip address for the registration.|false| +|bookieId | If you want to custom a bookie ID or use a dynamic network address for the bookie, you can set the `bookieId`.

    Bookie advertises itself using the `bookieId` rather than the `BookieSocketAddress` (`hostname:port` or `IP:port`). If you set the `bookieId`, then the `useHostNameAsBookieID` does not take effect.

    The `bookieId` is a non-empty string that can contain ASCII digits and letters ([a-zA-Z0-9]), colons, dashes, and dots.

    For more information about `bookieId`, see [here](http://bookkeeper.apache.org/bps/BP-41-bookieid/).|N/A| +|allowEphemeralPorts|Whether the bookie is allowed to use an ephemeral port (port 0) as its server port. By default, an ephemeral port is not allowed. Using an ephemeral port as the service port usually indicates a configuration error. However, in unit tests, using an ephemeral port will address port conflict problems and allow running tests in parallel.|false| +|enableLocalTransport|Whether the bookie is allowed to listen for the BookKeeper clients executed on the local JVM.|false| +|disableServerSocketBind|Whether the bookie is allowed to disable bind on network interfaces. This bookie will be available only to BookKeeper clients executed on the local JVM.|false| +|skipListArenaChunkSize|The number of bytes that we should use as chunk allocation for `org.apache.bookkeeper.bookie.SkipListArena`.|4194304| +|skipListArenaMaxAllocSize|The maximum size that we should allocate from the skiplist arena. Allocations larger than this should be allocated directly by the VM to avoid fragmentation.|131072| +|bookieAuthProviderFactoryClass|The factory class name of the bookie authentication provider. If this is null, then there is no authentication.|null| +|statsProviderClass||org.apache.bookkeeper.stats.prometheus.PrometheusMetricsProvider| +|prometheusStatsHttpPort||8000| +|dbStorage_writeCacheMaxSizeMb|Size of Write Cache. Memory is allocated from JVM direct memory. Write cache is used to buffer entries before flushing into the entry log. For good performance, it should be big enough to hold a substantial amount of entries in the flush interval.|25% of direct memory| +|dbStorage_readAheadCacheMaxSizeMb|Size of Read cache. Memory is allocated from JVM direct memory. This read cache is pre-filled doing read-ahead whenever a cache miss happens. 
By default, it is allocated to 25% of the available direct memory.|N/A| +|dbStorage_readAheadCacheBatchSize|How many entries to pre-fill in cache after a read cache miss|1000| +|dbStorage_rocksDB_blockCacheSize|Size of RocksDB block-cache. For best performance, this cache should be big enough to hold a significant portion of the index database which can reach ~2GB in some cases. By default, it uses 10% of direct memory.|N/A| +|dbStorage_rocksDB_writeBufferSizeMB||64| +|dbStorage_rocksDB_sstSizeInMB||64| +|dbStorage_rocksDB_blockSize||65536| +|dbStorage_rocksDB_bloomFilterBitsPerKey||10| +|dbStorage_rocksDB_numLevels||-1| +|dbStorage_rocksDB_numFilesInLevel0||4| +|dbStorage_rocksDB_maxSizeInLevel1MB||256| + +## Broker + +Pulsar brokers are responsible for handling incoming messages from producers, dispatching messages to consumers, replicating data between clusters, and more. + +|Name|Description|Default| +|---|---|---| +|advertisedListeners|Specify multiple advertised listeners for the broker.

    The format is `<listener_name>:pulsar://<host>:<port>`.

    If there are multiple listeners, separate them with commas.

    **Note**: do not use this configuration with `advertisedAddress` and `brokerServicePort`. If the value of this configuration is empty, the broker uses `advertisedAddress` and `brokerServicePort`|/| +|internalListenerName|Specify the internal listener name for the broker.

    **Note**: the listener name must be contained in `advertisedListeners`.

    If the value of this configuration is empty, the broker uses the first listener as the internal listener.|/| +|authenticateOriginalAuthData| If this flag is set to `true`, the broker authenticates the original Auth data; else it just accepts the originalPrincipal and authorizes it (if required). |false| +|enablePersistentTopics| Whether persistent topics are enabled on the broker |true| +|enableNonPersistentTopics| Whether non-persistent topics are enabled on the broker |true| +|functionsWorkerEnabled| Whether the Pulsar Functions worker service is enabled in the broker |false| +|exposePublisherStats|Whether to enable topic level metrics.|true| +|statsUpdateFrequencyInSecs||60| +|statsUpdateInitialDelayInSecs||60| +|zookeeperServers| Zookeeper quorum connection string || +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| Broker data port |6650| +|brokerServicePortTls| Broker data port for TLS |6651| +|webServicePort| Port to use to server HTTP request |8080| +|webServicePortTls| Port to use to server HTTPS request |8443| +|webSocketServiceEnabled| Enable the WebSocket API service in broker |false| +|webSocketNumIoThreads|The number of IO threads in Pulsar Client used in WebSocket proxy.|8| +|webSocketConnectionsPerBroker|The number of connections per Broker in Pulsar Client used in WebSocket proxy.|8| +|webSocketSessionIdleTimeoutMillis|Time in milliseconds that idle WebSocket session times out.|300000| +|webSocketMaxTextFrameSize|The maximum size of a text message during parsing in WebSocket proxy.|1048576| +|exposeTopicLevelMetricsInPrometheus|Whether to enable topic level metrics.|true| +|exposeConsumerLevelMetricsInPrometheus|Whether to enable consumer level metrics.|false| +|jvmGCMetricsLoggerClassName|Classname of Pluggable JVM GC metrics logger that can log GC specific metrics.|N/A| +|bindAddress| Hostname or IP 
address the service binds on, default is 0.0.0.0. |0.0.0.0| +|bindAddresses| Additional Hostname or IP addresses the service binds on: `listener_name:scheme://host:port,...`. || +|advertisedAddress| Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. || +|clusterName| Name of the cluster to which this broker belongs to || +|maxTenants|The maximum number of tenants that can be created in each Pulsar cluster. When the number of tenants reaches the threshold, the broker rejects the request of creating a new tenant. The default value 0 disables the check. |0| +| maxNamespacesPerTenant | The maximum number of namespaces that can be created in each tenant. When the number of namespaces reaches this threshold, the broker rejects the request of creating a new tenant. The default value 0 disables the check. |0| +|brokerDeduplicationEnabled| Sets the default behavior for message deduplication in the broker. If enabled, the broker will reject messages that were already stored in the topic. This setting can be overridden on a per-namespace basis. |false| +|brokerDeduplicationMaxNumberOfProducers| The maximum number of producers for which information will be stored for deduplication purposes. |10000| +|brokerDeduplicationEntriesInterval| The number of entries after which a deduplication informational snapshot is taken. A larger interval will lead to fewer snapshots being taken, though this would also lengthen the topic recovery time (the time required for entries published after the snapshot to be replayed). |1000| +|brokerDeduplicationSnapshotIntervalSeconds| The time period after which a deduplication informational snapshot is taken. It runs simultaneously with `brokerDeduplicationEntriesInterval`. 
|120| +|brokerDeduplicationProducerInactivityTimeoutMinutes| The time of inactivity (in minutes) after which the broker will discard deduplication information related to a disconnected producer. |360| +|dispatchThrottlingRatePerReplicatorInMsg| The default messages per second dispatch throttling-limit for every replicator in replication. The value of `0` means disabling replication message dispatch-throttling| 0 | +|dispatchThrottlingRatePerReplicatorInByte| The default bytes per second dispatch throttling-limit for every replicator in replication. The value of `0` means disabling replication message-byte dispatch-throttling| 0 | +|zooKeeperSessionTimeoutMillis| Zookeeper session timeout in milliseconds |30000| +|brokerShutdownTimeoutMs| Time to wait for broker graceful shutdown. After this time elapses, the process will be killed |60000| +|skipBrokerShutdownOnOOM| Flag to skip broker shutdown when broker handles Out of memory error. |false| +|backlogQuotaCheckEnabled| Enable backlog quota check. Enforces action on topic when the quota is reached |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the quota |60| +|backlogQuotaDefaultLimitBytes| The default per-topic backlog quota limit. Being less than 0 means no limitation. By default, it is -1. | -1 | +|backlogQuotaDefaultRetentionPolicy|The defaulted backlog quota retention policy. By Default, it is `producer_request_hold`.
  • 'producer_request_hold' Policy which holds producer's send request until the resource becomes available (or holding times out)
  • 'producer_exception' Policy which throws `javax.jms.ResourceAllocationException` to the producer
  • 'consumer_backlog_eviction' Policy which evicts the oldest message from the slowest consumer's backlog
  • |producer_request_hold| +|allowAutoTopicCreation| Enable topic auto creation if a new producer or consumer connected |true| +|allowAutoTopicCreationType| The type of topic that is allowed to be automatically created.(partitioned/non-partitioned) |non-partitioned| +|allowAutoSubscriptionCreation| Enable subscription auto creation if a new consumer connected |true| +|defaultNumPartitions| The number of partitioned topics that is allowed to be automatically created if `allowAutoTopicCreationType` is partitioned |1| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. If topics are not consumed for some while, these inactive topics might be cleaned up. Deleting inactive topics is enabled by default. The default period is 1 minute.
    **Note:** When `brokerDeleteInactiveTopicsEnabled` is set to `true`, you need to ensure that `allowAutoTopicCreation` is also set to `true`. |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics |60| +| brokerDeleteInactiveTopicsMode | Set the mode to delete inactive topics.
  • `delete_when_no_subscriptions`: delete the topic which has no subscriptions or active producers.
  • `delete_when_subscriptions_caught_up`: delete the topic whose subscriptions have no backlogs and which has no active producers or consumers.
  • | `delete_when_no_subscriptions` | +| brokerDeleteInactiveTopicsMaxInactiveDurationSeconds | Set the maximum duration for inactive topics. If it is not specified, the `brokerDeleteInactiveTopicsFrequencySeconds` parameter is adopted. | N/A | +|forceDeleteTenantAllowed| Enable you to delete a tenant forcefully. |false| +|forceDeleteNamespaceAllowed| Enable you to delete a namespace forcefully. |false| +|messageExpiryCheckIntervalInMinutes| The frequency of proactively checking and purging expired messages. |5| +|brokerServiceCompactionMonitorIntervalInSeconds| Interval between checks to determine whether topics with compaction policies need compaction. |60| +|brokerServiceCompactionThresholdInBytes|If the estimated backlog size is greater than this threshold, compression is triggered.

    Setting this threshold to 0 disables the compression check.|N/A| +|delayedDeliveryEnabled| Whether to enable the delayed delivery for messages. If disabled, messages will be immediately delivered and there will be no tracking overhead.|true| +|delayedDeliveryTickTimeMillis|Control the tick time for retrying on delayed delivery, which affects the accuracy of the delivery time compared to the scheduled time. By default, it is 1 second.|1000| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +|clientLibraryVersionCheckEnabled| Enable check for minimum allowed client library version |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| Path for the file used to determine the rotation status for the broker when responding to service discovery health checks || +|preferLaterVersions| If true, (and ModularLoadManagerImpl is being used), the load manager will attempt to use only brokers running the latest software version (to minimize impact to bundles) |false| +|maxNumPartitionsPerPartitionedTopic|Max number of partitions per partitioned topic. Use 0 or negative number to disable the check|0| +| maxSubscriptionsPerTopic | Maximum number of subscriptions allowed to subscribe to a topic. Once this limit reaches, the broker rejects new subscriptions until the number of subscriptions decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxProducersPerTopic | Maximum number of producers allowed to connect to a topic. Once this limit reaches, the broker rejects new producers until the number of connected producers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerTopic | Maximum number of consumers allowed to connect to a topic. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases.
When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerSubscription | Maximum number of consumers allowed to connect to a subscription. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +|tlsEnabled|Deprecated - Use `webServicePortTls` and `brokerServicePortTls` instead. |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate file. This cert is used to verify that any certs presented by connecting clients are signed by a certificate authority. If this verification fails, then the certs are untrusted and the connections are dropped. || +|tlsAllowInsecureConnection| Accept untrusted TLS certificate from client. If it is set to `true`, a client with a cert which cannot be verified with the 'tlsTrustCertsFilePath' cert will be allowed to connect to the server, though the cert will not be used for client authentication. |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.3```, ```TLSv1.2``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. 
Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +|tlsEnabledWithKeyStore| Enable TLS with KeyStore type configuration in broker |false| +|tlsProvider| TLS Provider for KeyStore type || +|tlsKeyStoreType| TLS KeyStore type configuration in broker: JKS, PKCS12 |JKS| +|tlsKeyStore| TLS KeyStore path in broker || +|tlsKeyStorePassword| TLS KeyStore password for broker || +|brokerClientTlsEnabledWithKeyStore| Whether internal client use KeyStore type to authenticate with Pulsar brokers |false| +|brokerClientSslProvider| The TLS Provider used by internal client to authenticate with other Pulsar brokers || +|brokerClientTlsTrustStoreType| TLS TrustStore type configuration for internal client: JKS, PKCS12, used by the internal client to authenticate with Pulsar brokers |JKS| +|brokerClientTlsTrustStore| TLS TrustStore path for internal client, used by the internal client to authenticate with Pulsar brokers || +|brokerClientTlsTrustStorePassword| TLS TrustStore password for internal client, used by the internal client to authenticate with Pulsar brokers || +|brokerClientTlsCiphers| Specify the tls cipher the internal client will use to negotiate during TLS Handshake. (a comma-separated list of ciphers) e.g. [TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256]|| +|brokerClientTlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS handshake. (a comma-separated list of protocol names). e.g. `TLSv1.3`, `TLSv1.2` || +|ttlDurationDefaultInSeconds|The default Time to Live (TTL) for namespaces if the TTL is not configured at namespace policies. When the value is set to `0`, TTL is disabled. By default, TTL is disabled. |0| +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. 
The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicAlg| Configure the algorithm to be used to validate auth tokens. This can be any of the asymettric algorithms supported by Java JWT (https://github.com/jwtk/jjwt#signature-algorithms-keys) |RS256| +|tokenAuthClaim| Specify which of the token's claims will be used as the authentication "principal" or "role". The default "sub" claim will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud", that will be used to get the audience from token. If not set, audience will not be verified. || +|tokenAudience| The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token, need contains this. || +|maxUnackedMessagesPerConsumer| Max number of unacknowledged messages allowed to receive messages by a consumer on a shared subscription. Broker will stop sending messages to consumer once, this limit reaches until consumer starts acknowledging messages back. Using a value of 0, is disabling unackeMessage limit check and consumer can receive messages without any restriction |50000| +|maxUnackedMessagesPerSubscription| Max number of unacknowledged messages allowed per shared subscription. Broker will stop dispatching messages to all consumers of the subscription once this limit reaches until consumer starts acknowledging messages back and unack count reaches to limit/2. Using a value of 0, is disabling unackedMessage-limit check and dispatcher can dispatch messages without any restriction |200000| +|subscriptionRedeliveryTrackerEnabled| Enable subscription message redelivery tracker |true| +|subscriptionExpirationTimeMinutes | How long to delete inactive subscriptions from last consuming.

    Setting this configuration to a value **greater than 0** deletes inactive subscriptions automatically.
    Setting this configuration to **0** does not delete inactive subscriptions automatically.

    Since this configuration takes effect on all topics, if there is even one topic whose subscriptions should not be deleted automatically, you need to set it to 0.
    Instead, you can set a subscription expiration time for each **namespace** using the [`pulsar-admin namespaces set-subscription-expiration-time options` command](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-subscription-expiration-time-em-). | 0 | +|maxConcurrentLookupRequest| Max number of concurrent lookup request broker allows to throttle heavy incoming lookup traffic |50000| +|maxConcurrentTopicLoadRequest| Max number of concurrent topic loading request broker allows to control number of zk-operations |5000| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is comma separated list of class names || +| authenticationRefreshCheckSeconds | Interval of time for checking for expired authentication credentials | 60 | +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics || +|brokerClientAuthenticationPlugin| Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters || +|brokerClientAuthenticationParameters||| +|athenzDomainNames| Supported Athenz provider domain names(comma separated) for authentication || +|exposePreciseBacklogInPrometheus| Enable expose the precise backlog stats, set false to use published counter and consumed counter to calculate, this would be more efficient but may be inaccurate. |false| +|schemaRegistryStorageClassName|The schema storage implementation used by this broker.|org.apache.pulsar.broker.service.schema.BookkeeperSchemaStorageFactory| +|isSchemaValidationEnforced|Enforce schema validation on following cases: if a producer without a schema attempts to produce to a topic with schema, the producer will be failed to connect. PLEASE be carefully on using this, since non-java clients don't support schema. 
If this setting is enabled, then non-java clients fail to produce.|false| +| isAllowAutoUpdateSchemaEnabled | Allow schema to be auto-updated at broker level. You can override this by using 'is_allow_auto_update_schema' of namespace policy. Note that this configuration is only available in 2.9.2 and later versions. |true| +| topicFencingTimeoutSeconds | If a topic remains fenced for a certain time period (in seconds), it is closed forcefully. If set to 0 or a negative number, the fenced topic is not closed. | 0 | +|offloadersDirectory|The directory for all the offloader implementations.|./offloaders| +|bookkeeperMetadataServiceUri| Metadata service uri that bookkeeper is used for loading corresponding metadata driver and resolving its metadata service location. This value can be fetched using `bookkeeper shell whatisinstanceid` command in BookKeeper cluster. For example: zk+hierarchical://localhost:2181/ledgers. The metadata service uri list can also be semicolon separated values like below: zk+hierarchical://zk1:2181;zk2:2181;zk3:2181/ledgers || +|bookkeeperClientAuthenticationPlugin| Authentication plugin to use when connecting to bookies || +|bookkeeperClientAuthenticationParametersName| BookKeeper auth plugin implementation specifics parameters name and values || +|bookkeeperClientAuthenticationParameters||| +|bookkeeperClientNumWorkerThreads| Number of BookKeeper client worker threads. Default is Runtime.getRuntime().availableProcessors() || +|bookkeeperClientTimeoutInSeconds| Timeout for BK add / read operations |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time Using a value of 0, is disabling the speculative reads |0| +|bookkeeperNumberOfChannelsPerBookie| Number of channels per bookie |16| +|bookkeeperClientHealthCheckEnabled| Enable bookies health check. 
Bookies that have more than the configured number of failure within the interval will be quarantined for some time. During this period, new ledgers won’t be created on these bookies |true| +|bookkeeperClientHealthCheckIntervalSeconds||60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval||5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds ||1800| +|bookkeeperClientRackawarePolicyEnabled| Enable rack-aware bookie selection policy. BK will chose bookies from different racks when forming a new bookie ensemble |true| +|bookkeeperClientRegionawarePolicyEnabled| Enable region-aware bookie selection policy. BK will chose bookies from different regions and racks when forming a new bookie ensemble. If enabled, the value of bookkeeperClientRackawarePolicyEnabled is ignored |false| +|bookkeeperClientMinNumRacksPerWriteQuorum| Minimum number of racks per write quorum. BK rack-aware bookie selection policy will try to get bookies from at least 'bookkeeperClientMinNumRacksPerWriteQuorum' racks for a write quorum. |2| +|bookkeeperClientEnforceMinNumRacksPerWriteQuorum| Enforces rack-aware bookie selection policy to pick bookies from 'bookkeeperClientMinNumRacksPerWriteQuorum' racks for a writeQuorum. If BK can't find bookie then it would throw BKNotEnoughBookiesException instead of picking random one. |false| +|bookkeeperClientReorderReadSequenceEnabled| Enable/disable reordering read sequence on reading entries. |false| +|bookkeeperClientIsolationGroups| Enable bookie isolation by specifying a list of bookie groups to choose from. Any bookie outside the specified groups will not be used by the broker || +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. 
|| +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. || +|bookkeeperClientGetBookieInfoIntervalSeconds| Set the interval to periodically check bookie info |86400| +|bookkeeperClientGetBookieInfoRetryIntervalSeconds| Set the interval to retry a failed bookie info lookup |60| +|bookkeeperEnableStickyReads | Enable/disable having read operations for a ledger to be sticky to a single bookie. If this flag is enabled, the client will use one single bookie (by preference) to read all entries for a ledger. | true | +|managedLedgerDefaultEnsembleSize| Number of bookies to use when creating a ledger |2| +|managedLedgerDefaultWriteQuorum| Number of copies to store for each message |2| +|managedLedgerDefaultAckQuorum| Number of guaranteed copies (acks to wait before write is complete) |2| +|managedLedgerCacheSizeMB| Amount of memory to use for caching data payload in managed ledger. This memory is allocated from JVM direct memory and it’s shared across all the topics running in the same broker. By default, uses 1/5th of available direct memory || +|managedLedgerCacheCopyEntries| Whether we should make a copy of the entry payloads when inserting in cache| false| +|managedLedgerCacheEvictionWatermark| Threshold to which bring down the cache level when eviction is triggered |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. 
| 1000| +|managedLedgerDefaultMarkDeleteRateLimit| Rate limit the amount of writes per second generated by consumer acking the messages |1.0| +|managedLedgerMaxEntriesPerLedger| The max number of entries to append to a ledger before triggering a rollover. A ledger rollover is triggered after the min rollover time has passed and one of the following conditions is true:
    • The max rollover time has been reached
    • The max entries have been written to the ledger
    • The max ledger size has been written to the ledger
    |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| Minimum time between ledger rollover for a topic |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| Maximum time before forcing a ledger rollover for a topic |240| +|managedLedgerCursorMaxEntriesPerLedger| Max number of entries to append to a cursor ledger |50000| +|managedLedgerCursorRolloverTimeInSeconds| Max time before triggering a rollover on a cursor ledger |14400| +|managedLedgerMaxUnackedRangesToPersist| Max number of “acknowledgment holes” that are going to be persistently stored. When acknowledging out of order, a consumer will leave holes that are supposed to be quickly filled by acking all the messages. The information of which messages are acknowledged is persisted by compressing in “ranges” of messages that were acknowledged. After the max number of ranges is reached, the information will only be tracked in memory and messages will be redelivered in case of crashes. |1000| +|autoSkipNonRecoverableData| Skip reading non-recoverable/unreadable data-ledger under managed-ledger’s list.It helps when data-ledgers gets corrupted at bookkeeper and managed-cursor is stuck at that ledger. |false| +|loadBalancerEnabled| Enable load balancer |true| +|loadBalancerPlacementStrategy| Strategy to assign a new bundle weightedRandomSelection || +|loadBalancerReportUpdateThresholdPercentage| Percentage of change to trigger load report update |10| +|loadBalancerReportUpdateMaxIntervalMinutes| maximum interval to update load report |15| +|loadBalancerHostUsageCheckIntervalMinutes| Frequency of report to collect |1| +|loadBalancerSheddingIntervalMinutes| Load shedding interval. 
Broker periodically checks whether some traffic should be offload from some over-loaded broker to other under-loaded brokers |30| +|loadBalancerSheddingGracePeriodMinutes| Prevent the same topics to be shed and moved to other broker more than once within this timeframe |30| +|loadBalancerBrokerMaxTopics| Usage threshold to allocate max number of topics to broker |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| Usage threshold to determine a broker as under-loaded |1| +|loadBalancerBrokerOverloadedThresholdPercentage| Usage threshold to determine a broker as over-loaded |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| Interval to update namespace bundle resource quota |15| +|loadBalancerBrokerComfortLoadLevelPercentage| Usage threshold to determine a broker is having just right level of load |65| +|loadBalancerAutoBundleSplitEnabled| enable/disable namespace bundle auto split |false| +|loadBalancerNamespaceBundleMaxTopics| maximum topics in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxSessions| maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxMsgRate| maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered |100| +|loadBalancerNamespaceMaximumBundles| maximum number of bundles in a namespace |128| +|replicationMetricsEnabled| Enable replication metrics |true| +|replicationConnectionsPerBroker| Max number of connections to open for each broker in a remote cluster More connections host-to-host lead to better throughput over high-latency links. 
|16| +|replicationProducerQueueSize| Replicator producer queue size |1000| +|replicatorPrefix| Replicator prefix used for replicator producer name and cursor name pulsar.repl|| +|replicationTlsEnabled| Enable TLS when talking with other clusters to replicate messages |false| +|brokerServicePurgeInactiveFrequencyInSeconds|Deprecated. Use `brokerDeleteInactiveTopicsFrequencySeconds`.|60| +|transactionCoordinatorEnabled|Whether to enable transaction coordinator in broker.|true| +|transactionMetadataStoreProviderClassName| |org.apache.pulsar.transaction.coordinator.impl.InMemTransactionMetadataStoreProvider| +|defaultRetentionTimeInMinutes| Default message retention time |0| +|defaultRetentionSizeInMB| Default retention size |0| +|keepAliveIntervalSeconds| How often to check whether the connections are still alive |30| +|bootstrapNamespaces| The bootstrap name. | N/A | +|loadManagerClassName| Name of load manager to use |org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl| +|supportedNamespaceBundleSplitAlgorithms| Supported algorithms name for namespace bundle split |[range_equally_divide,topic_count_equally_divide]| +|defaultNamespaceBundleSplitAlgorithm| Default algorithm name for namespace bundle split |range_equally_divide| +|managedLedgerOffloadDriver| The directory for all the offloader implementations `offloadersDirectory=./offloaders`. Driver to use to offload old data to long term storage (Possible values: S3, aws-s3, google-cloud-storage). When using google-cloud-storage, Make sure both Google Cloud Storage and Google Cloud Storage JSON API are enabled for the project (check from Developers Console -> Api&auth -> APIs). 
|| +|managedLedgerOffloadMaxThreads| Maximum number of thread pool threads for ledger offloading |2| +|managedLedgerOffloadPrefetchRounds|The maximum prefetch rounds for ledger reading for offloading.|1| +|managedLedgerUnackedRangesOpenCacheSetEnabled| Use Open Range-Set to cache unacknowledged messages |true| +|managedLedgerOffloadDeletionLagMs|Delay between a ledger being successfully offloaded to long term storage and the ledger being deleted from bookkeeper | 14400000| +|managedLedgerOffloadAutoTriggerSizeThresholdBytes|The number of bytes before triggering automatic offload to long term storage |-1 (disabled)| +|s3ManagedLedgerOffloadRegion| For Amazon S3 ledger offload, AWS region || +|s3ManagedLedgerOffloadBucket| For Amazon S3 ledger offload, Bucket to place offloaded ledger into || +|s3ManagedLedgerOffloadServiceEndpoint| For Amazon S3 ledger offload, Alternative endpoint to connect to (useful for testing) || +|s3ManagedLedgerOffloadMaxBlockSizeInBytes| For Amazon S3 ledger offload, Max block size in bytes. (64MB by default, 5MB minimum) |67108864| +|s3ManagedLedgerOffloadReadBufferSizeInBytes| For Amazon S3 ledger offload, Read buffer size in bytes (1MB by default) |1048576| +|gcsManagedLedgerOffloadRegion|For Google Cloud Storage ledger offload, region where offload bucket is located. Go to this page for more details: https://cloud.google.com/storage/docs/bucket-locations .|N/A| +|gcsManagedLedgerOffloadBucket|For Google Cloud Storage ledger offload, Bucket to place offloaded ledger into.|N/A| +|gcsManagedLedgerOffloadMaxBlockSizeInBytes|For Google Cloud Storage ledger offload, the maximum block size in bytes. (64MB by default, 5MB minimum)|67108864| +|gcsManagedLedgerOffloadReadBufferSizeInBytes|For Google Cloud Storage ledger offload, Read buffer size in bytes. (1MB by default)|1048576| +|gcsManagedLedgerOffloadServiceAccountKeyFile|For Google Cloud Storage, path to json file containing service account credentials. 
For more details, see the "Service Accounts" section of https://support.google.com/googleapi/answer/6158849 .|N/A| +|fileSystemProfilePath|For File System Storage, file system profile path.|../conf/filesystem_offload_core_site.xml| +|fileSystemURI|For File System Storage, file system uri.|N/A| +|s3ManagedLedgerOffloadRole| For Amazon S3 ledger offload, provide a role to assume before writing to s3 || +|s3ManagedLedgerOffloadRoleSessionName| For Amazon S3 ledger offload, provide a role session name when using a role |pulsar-s3-offload| +| acknowledgmentAtBatchIndexLevelEnabled | Enable or disable the batch index acknowledgement. | false | +|enableReplicatedSubscriptions|Whether to enable tracking of replicated subscriptions state across clusters.|true| +|replicatedSubscriptionsSnapshotFrequencyMillis|The frequency of snapshots for replicated subscriptions tracking.|1000| +|replicatedSubscriptionsSnapshotTimeoutSeconds|The timeout for building a consistent snapshot for tracking replicated subscriptions state.|30| +|replicatedSubscriptionsSnapshotMaxCachedPerSubscription|The maximum number of snapshot to be cached per subscription.|10| +|maxMessagePublishBufferSizeInMB|The maximum memory size for a broker to handle messages that are sent by producers. If the processing message size exceeds this value, the broker stops reading data from the connection. The processing messages refer to the messages that are sent to the broker but the broker has not sent response to the client. Usually the messages are waiting to be written to bookies. It is shared across all the topics running in the same broker. The value `-1` disables the memory limitation. By default, it is 50% of direct memory.|N/A| +|messagePublishBufferCheckIntervalInMillis|Interval between checks to see if message publish buffer size exceeds the maximum. 
Use `0` or negative number to disable the max publish buffer limiting.|100| +|retentionCheckIntervalInSeconds|Check between intervals to see if consumed ledgers need to be trimmed. Use 0 or negative number to disable the check.|120| +| maxMessageSize | Set the maximum size of a message. | 5242880 | +| preciseTopicPublishRateLimiterEnable | Enable precise topic publish rate limiting. | false | +| lazyCursorRecovery | Whether to recover cursors lazily when trying to recover a managed ledger backing a persistent topic. It can improve write availability of topics. The caveat is now when recovered ledger is ready to write we're not sure if all old consumers' last mark delete position(ack position) can be recovered or not. So user can make the trade off or have custom logic in application to checkpoint consumer state.| false | +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +| maxTopicsPerNamespace | The maximum number of persistent topics that can be created in the namespace. When the number of topics reaches this threshold, the broker rejects the request of creating a new topic, including the auto-created topics by the producer or consumer, until the number of connected consumers decreases. The default value 0 disables the check. | 0 | +|subscriptionTypesEnabled| Enable all subscription types, which are exclusive, shared, failover, and key_shared. | Exclusive, Shared, Failover, Key_Shared | +| managedLedgerInfoCompressionType | Compression type of managed ledger information.

    Available options are `NONE`, `LZ4`, `ZLIB`, `ZSTD`, and `SNAPPY`.

    If this value is `NONE` or invalid, the `managedLedgerInfo` is not compressed.

    **Note** that after enabling this configuration, if you want to downgrade a broker, you need to change the value to `NONE` and make sure all ledger metadata is saved without compression. | None | +| additionalServlets | Additional servlet name.

    If you have multiple additional servlets, separate them by commas.

    For example, additionalServlet_1, additionalServlet_2 | N/A | +| additionalServletDirectory | Location of broker additional servlet NAR directory | ./brokerAdditionalServlet | +|narExtractionDirectory | The extraction directory of the nar package.
    Available for Protocol Handler, Additional Servlets, Offloaders, Broker Interceptor. | System.getProperty("java.io.tmpdir") | + +#### Configuration override for clients internal to broker + +In 2.9.3 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the broker's Pulsar Clients and Pulsar Admin Clients. These configurations are applied after hard coded configuration and before the above broker client configurations named above.| +|bookkeeper_| Configure the broker's BookKeeper clients used by managed ledgers and the BookkeeperPackagesStorage bookkeeper client. Takes precedence over most other configuration values.| + +:::note + +When running the function worker within the broker, these prefixed configurations do not apply to any of those clients. You must configure those clients using the `functions_worker.yml` file. + +::: + +## Client + +You can use the [`pulsar-client`](reference-cli-tools.md#pulsar-client) CLI tool to publish messages to and consume messages from Pulsar topics. You can use this tool in place of a client library. + +|Name|Description|Default| +|---|---|---| +|webServiceUrl| The web URL for the cluster. |http://localhost:8080/| +|brokerServiceUrl| The Pulsar protocol URL for the cluster. |pulsar://localhost:6650/| +|authPlugin| The authentication plugin. || +|authParams| The authentication parameters for the cluster, as a comma-separated string. || +|useTls| Whether to enforce the TLS authentication in the cluster. |false| +| tlsAllowInsecureConnection | Allow TLS connections to servers whose certificate cannot be verified to have been signed by a trusted certificate authority. | false | +| tlsEnableHostnameVerification | Whether the server hostname must match the common name of the certificate that is used by the server. | false | +|tlsTrustCertsFilePath||| +| useKeyStoreTls | Enable TLS with KeyStore type configuration in the broker. 
| false | +| tlsTrustStoreType | TLS TrustStore type configuration.
  • JKS
  • PKCS12
  • |JKS| +| tlsTrustStore | TLS TrustStore path. | | +| tlsTrustStorePassword | TLS TrustStore password. | | + + + + + + +## Log4j + +You can set the log level and configuration in the [log4j2.yaml](https://github.com/apache/pulsar/blob/d557e0aa286866363bc6261dec87790c055db1b0/conf/log4j2.yaml#L155) file. The following logging configuration parameters are available. + +|Name|Default| +|---|---| +|pulsar.root.logger| WARN,CONSOLE| +|pulsar.log.dir| logs| +|pulsar.log.file| pulsar.log| +|log4j.rootLogger| ${pulsar.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ISO8601} - %-5p - [%t:%C{1}@%L] - %m%n| +|log4j.appender.ROLLINGFILE| org.apache.log4j.DailyRollingFileAppender| +|log4j.appender.ROLLINGFILE.Threshold| DEBUG| +|log4j.appender.ROLLINGFILE.File| ${pulsar.log.dir}/${pulsar.log.file}| +|log4j.appender.ROLLINGFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.ROLLINGFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n| +|log4j.appender.TRACEFILE| org.apache.log4j.FileAppender| +|log4j.appender.TRACEFILE.Threshold| TRACE| +|log4j.appender.TRACEFILE.File| pulsar-trace.log| +|log4j.appender.TRACEFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.TRACEFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L][%x] - %m%n| + +:::note + +'topic' in log4j2.appender is configurable. +- If you want to append all logs to a single topic, set the same topic name. +- If you want to append logs to different topics, you can set different topic names. 
+ +::: + +## Log4j shell + +|Name|Default| +|---|---| +|bookkeeper.root.logger| ERROR,CONSOLE| +|log4j.rootLogger| ${bookkeeper.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ABSOLUTE} %-5p %m%n| +|log4j.logger.org.apache.zookeeper| ERROR| +|log4j.logger.org.apache.bookkeeper| ERROR| +|log4j.logger.org.apache.bookkeeper.bookie.BookieShell| INFO| + + +## Standalone + +|Name|Description|Default| +|---|---|---| +|authenticateOriginalAuthData| If this flag is set to `true`, the broker authenticates the original Auth data; else it just accepts the originalPrincipal and authorizes it (if required). |false| +|zookeeperServers| The quorum connection string for local ZooKeeper || +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| The port on which the standalone broker listens for connections |6650| +|webServicePort| The port used by the standalone broker for HTTP requests |8080| +|bindAddress| The hostname or IP address on which the standalone service binds |0.0.0.0| +|bindAddresses| Additional Hostname or IP addresses the service binds on: `listener_name:scheme://host:port,...`. || +|advertisedAddress| The hostname or IP address that the standalone service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. 
|| +| numAcceptorThreads | Number of threads to use for Netty Acceptor | 1 | +| numIOThreads | Number of threads to use for Netty IO | 2 * Runtime.getRuntime().availableProcessors() | +| numHttpServerThreads | Number of threads to use for HTTP requests processing | 2 * Runtime.getRuntime().availableProcessors()| +|isRunningStandalone|This flag controls features that are meant to be used when running in standalone mode.|N/A| +|clusterName| The name of the cluster that this broker belongs to. |standalone| +| failureDomainsEnabled | Enable cluster's failure-domain which can distribute brokers into logical region. | false | +|zooKeeperSessionTimeoutMillis| The ZooKeeper session timeout, in milliseconds. |30000| +|zooKeeperOperationTimeoutSeconds|ZooKeeper operation timeout in seconds.|30| +|brokerShutdownTimeoutMs| The time to wait for graceful broker shutdown. After this time elapses, the process will be killed. |60000| +|skipBrokerShutdownOnOOM| Flag to skip broker shutdown when broker handles Out of memory error. |false| +|backlogQuotaCheckEnabled| Enable the backlog quota check, which enforces a specified action when the quota is reached. |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the backlog quota. |60| +|backlogQuotaDefaultLimitBytes| The default per-topic backlog quota limit. Being less than 0 means no limitation. By default, it is -1. |-1| +|ttlDurationDefaultInSeconds|The default Time to Live (TTL) for namespaces if the TTL is not configured at namespace policies. When the value is set to `0`, TTL is disabled. By default, TTL is disabled. |0| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. If topics are not consumed for some while, these inactive topics might be cleaned up. Deleting inactive topics is enabled by default. The default period is 1 minute.
    **Note:** When `brokerDeleteInactiveTopicsEnabled` is set to `true`, you need to ensure that `allowAutoTopicCreation` is also set to `true`. |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics, in seconds. |60| +| maxPendingPublishRequestsPerConnection | Maximum pending publish requests per connection to avoid keeping large number of pending requests in memory | 1000| +|messageExpiryCheckIntervalInMinutes| How often to proactively check and purged expired messages. |5| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +| subscriptionExpirationTimeMinutes | How long to delete inactive subscriptions from last consumption. When it is set to 0, inactive subscriptions are not deleted automatically | 0 | +| subscriptionRedeliveryTrackerEnabled | Enable subscription message redelivery tracker to send redelivery count to consumer. | true | +|subscriptionKeySharedEnable|Whether to enable the Key_Shared subscription.|true| +| subscriptionKeySharedUseConsistentHashing | In the Key_Shared subscription mode, with default AUTO_SPLIT mode, use splitting ranges or consistent hashing to reassign keys to new consumers. | false | +| subscriptionKeySharedConsistentHashingReplicaPoints | In the Key_Shared subscription mode, the number of points in the consistent-hashing ring. The greater the number, the more equal the assignment of keys to consumers. | 100 | +| subscriptionExpiryCheckIntervalInMinutes | How frequently to proactively check and purge expired subscription |5 | +| brokerDeduplicationEnabled | Set the default behavior for message deduplication in the broker. This can be overridden per-namespace. If it is enabled, the broker rejects messages that are already stored in the topic. 
| false | +| brokerDeduplicationMaxNumberOfProducers | Maximum number of producer information that it's going to be persisted for deduplication purposes | 10000 | +| brokerDeduplicationEntriesInterval | Number of entries after which a deduplication information snapshot is taken. A greater interval leads to less snapshots being taken though it would increase the topic recovery time, when the entries published after the snapshot need to be replayed. | 1000 | +| brokerDeduplicationProducerInactivityTimeoutMinutes | The time of inactivity (in minutes) after which the broker discards deduplication information related to a disconnected producer. | 360 | +| defaultNumberOfNamespaceBundles | When a namespace is created without specifying the number of bundles, this value is used as the default setting.| 4 | +|clientLibraryVersionCheckEnabled| Enable checks for minimum allowed client library version. |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| The path for the file used to determine the rotation status for the broker when responding to service discovery health checks |/usr/local/apache/htdocs| +|maxUnackedMessagesPerConsumer| The maximum number of unacknowledged messages allowed to be received by consumers on a shared subscription. The broker will stop sending messages to a consumer once this limit is reached or until the consumer begins acknowledging messages. A value of 0 disables the unacked message limit check and thus allows consumers to receive messages without any restrictions. |50000| +|maxUnackedMessagesPerSubscription| The same as above, except per subscription rather than per consumer. |200000| +| maxUnackedMessagesPerBroker | Maximum number of unacknowledged messages allowed per broker. 
Once this limit reaches, the broker stops dispatching messages to all shared subscriptions which has a higher number of unacknowledged messages until subscriptions start acknowledging messages back and unacknowledged messages count reaches to limit/2. When the value is set to 0, unacknowledged message limit check is disabled and broker does not block dispatchers. | 0 | +| maxUnackedMessagesPerSubscriptionOnBrokerBlocked | Once the broker reaches maxUnackedMessagesPerBroker limit, it blocks subscriptions which have higher unacknowledged messages than this percentage limit and subscription does not receive any new messages until that subscription acknowledges messages back. | 0.16 | +| unblockStuckSubscriptionEnabled|Broker periodically checks if subscription is stuck and unblock if flag is enabled.|false| +|zookeeperSessionExpiredPolicy|There are two policies when ZooKeeper session expired happens, "shutdown" and "reconnect". If it is set to "shutdown" policy, when ZooKeeper session expired happens, the broker is shutdown. If it is set to "reconnect" policy, the broker tries to reconnect to ZooKeeper server and re-register metadata to ZooKeeper. Note: the "reconnect" policy is an experiment feature.|shutdown| +| topicPublisherThrottlingTickTimeMillis | Tick time to schedule task that checks topic publish rate limiting across all topics. A lower value can improve accuracy while throttling publish but it uses more CPU to perform frequent check. (Disable publish throttling with value 0) | 10| +| brokerPublisherThrottlingTickTimeMillis | Tick time to schedule task that checks broker publish rate limiting across all topics. A lower value can improve accuracy while throttling publish but it uses more CPU to perform frequent check. When the value is set to 0, publish throttling is disabled. |50 | +| brokerPublisherThrottlingMaxMessageRate | Maximum rate (in 1 second) of messages allowed to publish for a broker if the message rate limiting is enabled. 
When the value is set to 0, message rate limiting is disabled. | 0| +| brokerPublisherThrottlingMaxByteRate | Maximum rate (in 1 second) of bytes allowed to publish for a broker if the byte rate limiting is enabled. When the value is set to 0, the byte rate limiting is disabled. | 0 | +|subscribeThrottlingRatePerConsumer|Too many subscribe requests from a consumer can cause broker rewinding consumer cursors and loading data from bookies, hence causing high network bandwidth usage. When the positive value is set, broker will throttle the subscribe requests for one consumer. Otherwise, the throttling will be disabled. By default, throttling is disabled.|0| +|subscribeRatePeriodPerConsumerInSecond|Rate period for {subscribeThrottlingRatePerConsumer}. By default, it is 30s.|30| +| dispatchThrottlingRatePerTopicInMsg | Default messages (per second) dispatch throttling-limit for every topic. When the value is set to 0, default message dispatch throttling-limit is disabled. |0 | +| dispatchThrottlingRatePerTopicInByte | Default byte (per second) dispatch throttling-limit for every topic. When the value is set to 0, default byte dispatch throttling-limit is disabled. | 0| +| dispatchThrottlingRateRelativeToPublishRate | Enable dispatch rate-limiting relative to publish rate. | false | +|dispatchThrottlingRatePerSubscriptionInMsg|The defaulted number of message dispatching throttling-limit for a subscription. The value of 0 disables message dispatch-throttling.|0| +|dispatchThrottlingRatePerSubscriptionInByte|The default number of message-bytes dispatching throttling-limit for a subscription. The value of 0 disables message-byte dispatch-throttling.|0| +| dispatchThrottlingOnNonBacklogConsumerEnabled | Enable dispatch-throttling for both caught up consumers as well as consumers who have backlogs. | true | +|dispatcherMaxReadBatchSize|The maximum number of entries to read from BookKeeper. 
By default, it is 100 entries.|100| +|dispatcherMaxReadSizeBytes|The maximum size in bytes of entries to read from BookKeeper. By default, it is 5MB.|5242880| +|dispatcherMinReadBatchSize|The minimum number of entries to read from BookKeeper. By default, it is 1 entry. When an error occurs while reading entries from bookkeeper, the broker will backoff the batch size to this minimum number.|1| +|dispatcherMaxRoundRobinBatchSize|The maximum number of entries to dispatch for a shared subscription. By default, it is 20 entries.|20| +| preciseDispatcherFlowControl | Precise dispatcher flow control according to history message number of each entry. | false | +| streamingDispatch | Whether to use streaming read dispatcher. It can be useful when there's a huge backlog to drain and instead of read with micro batch we can streamline the read from bookkeeper to make the most of consumer capacity till we hit bookkeeper read limit or consumer process limit, then we can use consumer flow control to tune the speed. This feature is currently in preview and can be changed in subsequent release. | false | +| maxConcurrentLookupRequest | Maximum number of concurrent lookup request that the broker allows to throttle heavy incoming lookup traffic. | 50000 | +| maxConcurrentTopicLoadRequest | Maximum number of concurrent topic loading request that the broker allows to control the number of zk-operations. | 5000 | +| maxConcurrentNonPersistentMessagePerConnection | Maximum number of concurrent non-persistent message that can be processed per connection. | 1000 | +| numWorkerThreadsForNonPersistentTopic | Number of worker threads to serve non-persistent topic. | 8 | +| enablePersistentTopics | Enable broker to load persistent topics. | true | +| enableNonPersistentTopics | Enable broker to load non-persistent topics. | true | +| maxSubscriptionsPerTopic | Maximum number of subscriptions allowed to subscribe to a topic. 
Once this limit reaches, the broker rejects new subscriptions until the number of subscriptions decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxProducersPerTopic | Maximum number of producers allowed to connect to a topic. Once this limit reaches, the broker rejects new producers until the number of connected producers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerTopic | Maximum number of consumers allowed to connect to a topic. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxConsumersPerSubscription | Maximum number of consumers allowed to connect to a subscription. Once this limit reaches, the broker rejects new consumers until the number of connected consumers decreases. When the value is set to 0, the limit check is disabled. | 0 | +| maxNumPartitionsPerPartitionedTopic | Maximum number of partitions per partitioned topic. When the value is set to a negative number or is set to 0, the check is disabled. | 0 | +| tlsCertRefreshCheckDurationSec | TLS certificate refresh duration in seconds. When the value is set to 0, check the TLS certificate on every new connection. | 300 | +| tlsCertificateFilePath | Path for the TLS certificate file. | | +| tlsKeyFilePath | Path for the TLS private key file. | | +| tlsTrustCertsFilePath | Path for the trusted TLS certificate file.| | +| tlsAllowInsecureConnection | Accept untrusted TLS certificate from the client. If it is set to true, a client with a certificate which cannot be verified with the 'tlsTrustCertsFilePath' certificate is allowed to connect to the server, though the certificate is not be used for client authentication. | false | +| tlsProtocols | Specify the TLS protocols the broker uses to negotiate during TLS handshake. 
| | +| tlsCiphers | Specify the TLS cipher the broker uses to negotiate during TLS Handshake. | | +| tlsRequireTrustedClientCertOnConnect | Trusted client certificates are required for to connect TLS. Reject the Connection if the client certificate is not trusted. In effect, this requires that all connecting clients perform TLS client authentication. | false | +| tlsEnabledWithKeyStore | Enable TLS with KeyStore type configuration in broker. | false | +| tlsProvider | TLS Provider for KeyStore type. | | +| tlsKeyStoreType | TLS KeyStore type configuration in the broker.
  • JKS
  • PKCS12
  • |JKS| +| tlsKeyStore | TLS KeyStore path in the broker. | | +| tlsKeyStorePassword | TLS KeyStore password for the broker. | | +| tlsTrustStoreType | TLS TrustStore type configuration in the broker
  • JKS
  • PKCS12
  • |JKS| +| tlsTrustStore | TLS TrustStore path in the broker. | | +| tlsTrustStorePassword | TLS TrustStore password for the broker. | | +| brokerClientTlsEnabledWithKeyStore | Configure whether the internal client uses the KeyStore type to authenticate with Pulsar brokers. | false | +| brokerClientSslProvider | The TLS Provider used by the internal client to authenticate with other Pulsar brokers. | | +| brokerClientTlsTrustStoreType | TLS TrustStore type configuration for the internal client to authenticate with Pulsar brokers.
  • JKS
  • PKCS12
  • | JKS | +| brokerClientTlsTrustStore | TLS TrustStore path for the internal client to authenticate with Pulsar brokers. | | +| brokerClientTlsTrustStorePassword | TLS TrustStore password for the internal client to authenticate with Pulsar brokers. | | +| brokerClientTlsCiphers | Specify the TLS cipher that the internal client uses to negotiate during TLS Handshake. | | +| brokerClientTlsProtocols | Specify the TLS protocols that the broker uses to negotiate during TLS handshake. | | +| systemTopicEnabled | Enable/Disable system topics. | false | +| topicLevelPoliciesEnabled | Enable or disable topic level policies. Topic level policies depends on the system topic. Please enable the system topic first. | false | +| topicFencingTimeoutSeconds | If a topic remains fenced for a certain time period (in seconds), it is closed forcefully. If set to 0 or a negative number, the fenced topic is not closed. | 0 | +| proxyRoles | Role names that are treated as "proxy roles". If the broker sees a request with role as proxyRoles, it demands to see a valid original principal. | | +|authenticationEnabled| Enable authentication for the broker. |false| +|authenticationProviders| A comma-separated list of class names for authentication providers. |false| +|authorizationEnabled| Enforce authorization in brokers. |false| +| authorizationProvider | Authorization provider fully qualified class-name. | org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider | +| authorizationAllowWildcardsMatching | Allow wildcard matching in authorization. Wildcard matching is applicable only when the wildcard-character (*) presents at the **first** or **last** position. | false | +|superUserRoles| Role names that are treated as “superusers.” Superusers are authorized to perform all admin tasks. | | +|brokerClientAuthenticationPlugin| The authentication settings of the broker itself. Used when the broker connects to other brokers either in the same cluster or from other clusters. 
| | +|brokerClientAuthenticationParameters| The parameters that go along with the plugin specified using brokerClientAuthenticationPlugin. | | +|athenzDomainNames| Supported Athenz authentication provider domain names as a comma-separated list. | | +| anonymousUserRole | When this parameter is not empty, unauthenticated users perform as anonymousUserRole. | | +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud". It is used to get the audience from token. If it is not set, the audience is not verified. || +| tokenAudience | The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token need contains this parameter.| | +|saslJaasClientAllowedIds|This is a regexp, which limits the range of possible ids which can connect to the Broker using SASL. By default, it is set to `SaslConstants.JAAS_CLIENT_ALLOWED_IDS_DEFAULT`, which is ".*pulsar.*", so only clients whose id contains 'pulsar' are allowed to connect.|N/A| +|saslJaasBrokerSectionName|Service Principal, for login context name. 
By default, it is set to `SaslConstants.JAAS_DEFAULT_BROKER_SECTION_NAME`, which is "Broker".|N/A| +|httpMaxRequestSize|If the value is larger than 0, it rejects all HTTP requests with bodies larger than the configured limit.|-1| +|exposePreciseBacklogInPrometheus| Whether to expose the precise backlog stats. Set it to false to calculate the backlog from the published counter and the consumed counter, which is more efficient but may be inaccurate. |false| +|bookkeeperMetadataServiceUri|Metadata service uri is what BookKeeper used for loading corresponding metadata driver and resolving its metadata service location. This value can be fetched using `bookkeeper shell whatisinstanceid` command in BookKeeper cluster. For example: `zk+hierarchical://localhost:2181/ledgers`. The metadata service uri list can also be semicolon separated values like: `zk+hierarchical://zk1:2181;zk2:2181;zk3:2181/ledgers`.|N/A| +|bookkeeperClientAuthenticationPlugin| Authentication plugin to be used when connecting to bookies (BookKeeper servers). || +|bookkeeperClientAuthenticationParametersName| BookKeeper authentication plugin implementation parameters and values. || +|bookkeeperClientAuthenticationParameters| Parameters associated with the bookkeeperClientAuthenticationParametersName || +|bookkeeperClientNumWorkerThreads| Number of BookKeeper client worker threads. Default is Runtime.getRuntime().availableProcessors() || +|bookkeeperClientTimeoutInSeconds| Timeout for BookKeeper add and read operations. |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. A value of 0 disables speculative reads. |0| +|bookkeeperUseV2WireProtocol|Use older Bookkeeper wire protocol with bookie.|true| +|bookkeeperClientHealthCheckEnabled| Enable bookie health checks. |true| +|bookkeeperClientHealthCheckIntervalSeconds| The time interval, in seconds, at which health checks are performed. New ledgers are not created during health checks. 
|60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval| Error threshold for health checks. |5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds| The time, in seconds, for which a bookie is quarantined if it has more than the allowed number of failures within the time interval specified by bookkeeperClientHealthCheckIntervalSeconds. |1800| +|bookkeeperClientGetBookieInfoIntervalSeconds|Specify options for the GetBookieInfo check. This setting helps ensure the list of bookies that are up to date on the brokers.|86400| +|bookkeeperClientGetBookieInfoRetryIntervalSeconds|Specify options for the GetBookieInfo check. This setting helps ensure the list of bookies that are up to date on the brokers.|60| +|bookkeeperClientRackawarePolicyEnabled| |true| +|bookkeeperClientRegionawarePolicyEnabled| |false| +|bookkeeperClientMinNumRacksPerWriteQuorum| |2| +|bookkeeperClientEnforceMinNumRacksPerWriteQuorum| |false| +|bookkeeperClientReorderReadSequenceEnabled| |false| +|bookkeeperClientIsolationGroups||| +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. || +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. || +| bookkeeperTLSProviderFactoryClass | Set the client security provider factory class name. | org.apache.bookkeeper.tls.TLSContextFactory | +| bookkeeperTLSClientAuthentication | Enable TLS authentication with bookie. | false | +| bookkeeperTLSKeyFileType | Supported type: PEM, JKS, PKCS12. | PEM | +| bookkeeperTLSTrustCertTypes | Supported type: PEM, JKS, PKCS12. | PEM | +| bookkeeperTLSKeyStorePasswordPath | Path to file containing keystore password, if the client keystore is password protected. | | +| bookkeeperTLSTrustStorePasswordPath | Path to file containing truststore password, if the client truststore is password protected. 
| | +| bookkeeperTLSKeyFilePath | Path for the TLS private key file. | | +| bookkeeperTLSCertificateFilePath | Path for the TLS certificate file. | | +| bookkeeperTLSTrustCertsFilePath | Path for the trusted TLS certificate file. | | +| bookkeeperTlsCertFilesRefreshDurationSeconds | Tls cert refresh duration at bookKeeper-client in seconds (0 to disable check). | | +| bookkeeperDiskWeightBasedPlacementEnabled | Enable/Disable disk weight based placement. | false | +| bookkeeperExplicitLacIntervalInMills | Set the interval to check the need for sending an explicit LAC. When the value is set to 0, no explicit LAC is sent. | 0 | +| bookkeeperClientExposeStatsToPrometheus | Expose BookKeeper client managed ledger stats to Prometheus. | false | +|managedLedgerDefaultEnsembleSize| |1| +|managedLedgerDefaultWriteQuorum| |1| +|managedLedgerDefaultAckQuorum| |1| +| managedLedgerDigestType | Default type of checksum to use when writing to BookKeeper. | CRC32C | +| managedLedgerNumSchedulerThreads | Number of threads to be used for managed ledger scheduled tasks. | 8 | +|managedLedgerCacheSizeMB| |N/A| +|managedLedgerCacheCopyEntries| Whether to copy the entry payloads when inserting in cache.| false| +|managedLedgerCacheEvictionWatermark| |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. 
| 1000| +|managedLedgerUnackedRangesOpenCacheSetEnabled| Use Open Range-Set to cache unacknowledged messages |true| +|managedLedgerDefaultMarkDeleteRateLimit| |0.1| +|managedLedgerMaxEntriesPerLedger| |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| |240| +|managedLedgerCursorMaxEntriesPerLedger| |50000| +|managedLedgerCursorRolloverTimeInSeconds| |14400| +| managedLedgerMaxSizePerLedgerMbytes | Maximum ledger size before triggering a rollover for a topic. | 2048 | +| managedLedgerMaxUnackedRangesToPersist | Maximum number of "acknowledgment holes" that are going to be persistently stored. When acknowledging out of order, a consumer leaves holes that are supposed to be quickly filled by acknowledging all the messages. The information of which messages are acknowledged is persisted by compressing in "ranges" of messages that were acknowledged. After the max number of ranges is reached, the information is only tracked in memory and messages are redelivered in case of crashes. | 10000 | +| managedLedgerMaxUnackedRangesToPersistInZooKeeper | Maximum number of "acknowledgment holes" that can be stored in Zookeeper. If the number of unacknowledged message range is higher than this limit, the broker persists unacknowledged ranges into bookkeeper to avoid additional data overhead into Zookeeper. | 1000 | +|autoSkipNonRecoverableData| |false| +| managedLedgerMetadataOperationsTimeoutSeconds | Operation timeout while updating managed-ledger metadata. | 60 | +| managedLedgerReadEntryTimeoutSeconds | Read entries timeout when the broker tries to read messages from BookKeeper. | 0 | +| managedLedgerAddEntryTimeoutSeconds | Add entry timeout when the broker tries to publish message to BookKeeper. | 0 | +| managedLedgerNewEntriesCheckDelayInMillis | New entries check delay for the cursor under the managed ledger. If no new messages in the topic, the cursor tries to check again after the delay time. 
For consumption latency sensitive scenario, you can set the value to a smaller value or 0. Of course, a smaller value may degrade consumption throughput.|10 ms| +| managedLedgerPrometheusStatsLatencyRolloverSeconds | Managed ledger prometheus stats latency rollover seconds. | 60 | +| managedLedgerTraceTaskExecution | Whether to trace managed ledger task execution time. | true | +|managedLedgerNewEntriesCheckDelayInMillis|New entries check delay for the cursor under the managed ledger. If no new messages in the topic, the cursor will try to check again after the delay time. For consumption latency sensitive scenario, it can be set to a smaller value or 0. A smaller value degrades consumption throughput. By default, it is 10ms.|10| +|loadBalancerEnabled| |false| +|loadBalancerPlacementStrategy| |weightedRandomSelection| +|loadBalancerReportUpdateThresholdPercentage| |10| +|loadBalancerReportUpdateMaxIntervalMinutes| |15| +|loadBalancerHostUsageCheckIntervalMinutes| |1| +|loadBalancerSheddingIntervalMinutes| |30| +|loadBalancerSheddingGracePeriodMinutes| |30| +|loadBalancerBrokerMaxTopics| |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| |1| +|loadBalancerBrokerOverloadedThresholdPercentage| |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| |15| +|loadBalancerBrokerComfortLoadLevelPercentage| |65| +|loadBalancerAutoBundleSplitEnabled| |false| +| loadBalancerAutoUnloadSplitBundlesEnabled | Enable/Disable automatic unloading of split bundles. | true | +|loadBalancerNamespaceBundleMaxTopics| |1000| +|loadBalancerNamespaceBundleMaxSessions| |1000| +|loadBalancerNamespaceBundleMaxMsgRate| |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| |100| +|loadBalancerNamespaceMaximumBundles| |128| +| loadBalancerBrokerThresholdShedderPercentage | The broker resource usage threshold. When the broker resource usage is greater than the pulsar cluster average resource usage, the threshold shedder is triggered to offload bundles from the broker. 
It only takes effect in the ThresholdShedder strategy. | 10 | +| loadBalancerHistoryResourcePercentage | The history usage when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 0.9 | +| loadBalancerBandwithInResourceWeight | The BandWithIn usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerBandwithOutResourceWeight | The BandWithOut usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerCPUResourceWeight | The CPU usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerMemoryResourceWeight | The heap memory usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerDirectMemoryResourceWeight | The direct memory usage weight when calculating new resource usage. It only takes effect in the ThresholdShedder strategy. | 1.0 | +| loadBalancerBundleUnloadMinThroughputThreshold | Bundle unload minimum throughput threshold. Avoid bundle unload frequently. It only takes effect in the ThresholdShedder strategy. | 10 | +|replicationMetricsEnabled| |true| +|replicationConnectionsPerBroker| |16| +|replicationProducerQueueSize| |1000| +| replicationPolicyCheckDurationSeconds | Duration to check replication policy to avoid replicator inconsistency due to missing ZooKeeper watch. When the value is set to 0, disable checking replication policy. | 600 | +|defaultRetentionTimeInMinutes| |0| +|defaultRetentionSizeInMB| |0| +|keepAliveIntervalSeconds| |30| +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +|bookieId | If you want to custom a bookie ID or use a dynamic network address for a bookie, you can set the `bookieId`.

    Bookie advertises itself using the `bookieId` rather than the `BookieSocketAddress` (`hostname:port` or `IP:port`).

    The `bookieId` is a non-empty string that can contain ASCII digits and letters ([a-zA-Z0-9]), colons, dashes, and dots.

    For more information about `bookieId`, see [here](http://bookkeeper.apache.org/bps/BP-41-bookieid/).|/| +| maxTopicsPerNamespace | The maximum number of persistent topics that can be created in the namespace. When the number of topics reaches this threshold, the broker rejects the request of creating a new topic, including the auto-created topics by the producer or consumer, until the number of connected consumers decreases. The default value 0 disables the check. | 0 | +| isAllowAutoUpdateSchemaEnabled | Allow schema to be auto-updated at broker level. You can override this by using 'is_allow_auto_update_schema' of namespace policy. Note that this configuration is only available in 2.9.2 and later versions. |true| + +## WebSocket + +|Name|Description|Default| +|---|---|---| +|configurationStoreServers ||| +|zooKeeperSessionTimeoutMillis| |30000| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|serviceUrl||| +|serviceUrlTls||| +|brokerServiceUrl||| +|brokerServiceUrlTls||| +|webServicePort||8080| +|webServicePortTls||8443| +|bindAddress||0.0.0.0| +|clusterName ||| +|authenticationEnabled||false| +|authenticationProviders||| +|authorizationEnabled||false| +|superUserRoles ||| +|brokerClientAuthenticationPlugin||| +|brokerClientAuthenticationParameters||| +|tlsEnabled||false| +|tlsAllowInsecureConnection||false| +|tlsCertificateFilePath||| +|tlsKeyFilePath ||| +|tlsTrustCertsFilePath||| + +#### Configuration Override For Clients Internal to WebSocket + +In 2.9.3 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the broker's Pulsar Clients. These configurations are applied after hard coded configuration and before the above brokerClient configurations named above.| + +## Pulsar proxy + +The [Pulsar proxy](concepts-architecture-overview.md#pulsar-proxy) can be configured in the `conf/proxy.conf` file. 
+ + +|Name|Description|Default| +|---|---|---| +|forwardAuthorizationCredentials| Forward client authorization credentials to Broker for re-authorization, and make sure authentication is enabled for this to take effect. |false| +|zookeeperServers| The ZooKeeper quorum connection string (as a comma-separated list) || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +| brokerServiceURL | The service URL pointing to the broker cluster. Must begin with `pulsar://`. | | +| brokerServiceURLTLS | The TLS service URL pointing to the broker cluster. Must begin with `pulsar+ssl://`. | | +| brokerWebServiceURL | The Web service URL pointing to the broker cluster | | +| brokerWebServiceURLTLS | The TLS Web service URL pointing to the broker cluster | | +| functionWorkerWebServiceURL | The Web service URL pointing to the function worker cluster. It is only configured when you setup function workers in a separate cluster. | | +| functionWorkerWebServiceURLTLS | The TLS Web service URL pointing to the function worker cluster. It is only configured when you setup function workers in a separate cluster. | | +|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds) |30000| +|zooKeeperCacheExpirySeconds|ZooKeeper cache expiry time in seconds|300| +|advertisedAddress|Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostname()` is used.|N/A| +|servicePort| The port to use for server binary Protobuf requests |6650| +|servicePortTls| The port to use to server binary Protobuf TLS requests |6651| +|statusFilePath| Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks || +| proxyLogLevel | Proxy log level
  • 0: Do not log any TCP channel information.
  • 1: Parse and log any TCP channel information and command information without message body.
  • 2: Parse and log channel information, command information and message body.
  • | 0 | +|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false| +|authenticateMetricsEndpoint| Whether the '/metrics' endpoint requires authentication. Defaults to true. 'authenticationEnabled' must also be set for this to take effect. |true| +|authenticationProviders| Authentication provider name list (a comma-separated list of class names) || +|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false| +|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider| +| anonymousUserRole | When this parameter is not empty, unauthenticated users perform as anonymousUserRole. | | +|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers || +|superUserRoles| Role names that are treated as “super-users,” meaning that they will be able to perform all admin || +|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy will reject requests beyond that. |10000| +|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy will error out requests beyond that. |50000| +|tlsEnabledInProxy| Deprecated - use `servicePortTls` and `webServicePortTls` instead. |false| +|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers. |false| +| tlsCertRefreshCheckDurationSec | TLS certificate refresh duration in seconds. If the value is set 0, check TLS certificate every new connection. 
| 300 | +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file || +|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false| +|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate isn’t trusted. |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.3```, ```TLSv1.2``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +| httpReverseProxyConfigs | HTTP directs to redirect to non-pulsar services | | +| httpOutputBufferSize | HTTP output buffer size. The amount of data that will be buffered for HTTP requests before it is flushed to the channel. A larger buffer size may result in higher HTTP throughput though it may take longer for the client to see data. If using HTTP streaming via the reverse proxy, this should be set to the minimum value (1) so that clients see the data as soon as possible. | 32768 | +| httpNumThreads | Number of threads to use for HTTP requests processing| 2 * Runtime.getRuntime().availableProcessors() | +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:;base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`. Note: key file must be DER-encoded.|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:;base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`. 
Note: key file must be DER-encoded.|| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || +|tokenAudienceClaim| The token audience "claim" name, e.g. "aud". It is used to get the audience from token. If it is not set, the audience is not verified. || +| tokenAudience | The token audience stands for this broker. The field `tokenAudienceClaim` of a valid token need contains this parameter.| | +|haProxyProtocolEnabled | Enable or disable the [HAProxy](http://www.haproxy.org/) protocol. |false| +| numIOThreads | Number of threads used for Netty IO.
    **Note:** This configuration is only available in 2.9.3 and later versions. | 2 * Runtime.getRuntime().availableProcessors() | +| numAcceptorThreads | Number of threads used for Netty Acceptor.
    **Note:** This configuration is only available in 2.9.3 and later versions.| 1 | + +#### Configuration Override For Clients Internal to Proxy + +In 2.9.3 and later versions, you can configure some clients by using the appropriate prefix. + +|Prefix|Description| +|---|---| +|brokerClient_| Configure **all** the proxy's Pulsar Clients. These configurations are applied after hard coded configuration and before the brokerClient configurations named above.| + +## ZooKeeper + +ZooKeeper handles a broad range of essential configuration- and coordination-related tasks for Pulsar. The default configuration file for ZooKeeper is in the `conf/zookeeper.conf` file in your Pulsar installation. The following parameters are available: + + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5| +|dataDir| The location where ZooKeeper will store in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server will listen for connections. 
|2181| +|admin.enableServer|Whether the ZooKeeper AdminServer is enabled.|true| +|admin.serverPort|The port at which the admin listens.|9990| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest).
|3| +|autopurge.purgeInterval| The time interval, in hours, by which the ZooKeeper database purge task is triggered. Setting to a non-zero number will enable auto purge; setting to 0 will disable. Read this guide before enabling auto purge. |1| +|forceSync|Requires updates to be synced to media of the transaction log before finishing processing the update. If this option is set to 'no', ZooKeeper will not require updates to be synced to the media. WARNING: it's not recommended to run a production ZK cluster with `forceSync` disabled.|yes| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + + + +In addition to the parameters in the table above, configuring ZooKeeper for Pulsar involves adding +a `server.N` line to the `conf/zookeeper.conf` file for each node in the ZooKeeper cluster, where `N` is the number of the ZooKeeper node. Here's an example for a three-node ZooKeeper cluster: + +```properties + +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 + +``` + +> We strongly recommend consulting the [ZooKeeper Administrator's Guide](https://zookeeper.apache.org/doc/current/zookeeperAdmin.html) for a more thorough and comprehensive introduction to ZooKeeper configuration diff --git a/site2/website/versioned_docs/version-2.9.x/reference-connector-admin.md b/site2/website/versioned_docs/version-2.9.x/reference-connector-admin.md new file mode 100644 index 0000000000000..f1240bf8db17d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/reference-connector-admin.md @@ -0,0 +1,12 @@ +--- +id: reference-connector-admin +title: Connector Admin CLI +sidebar_label: "Connector Admin CLI" +original_id: reference-connector-admin +--- + +> **Important** +> +> For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin 
doc](https://pulsar.apache.org/tools/pulsar-admin/). +> + diff --git a/site2/website/versioned_docs/version-2.9.x/reference-metrics.md b/site2/website/versioned_docs/version-2.9.x/reference-metrics.md new file mode 100644 index 0000000000000..96c378c3c5dc4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/reference-metrics.md @@ -0,0 +1,556 @@ +--- +id: reference-metrics +title: Pulsar Metrics +sidebar_label: "Pulsar Metrics" +original_id: reference-metrics +--- + + + +Pulsar exposes the following metrics in Prometheus format. You can monitor your clusters with those metrics. + +* [ZooKeeper](#zookeeper) +* [BookKeeper](#bookkeeper) +* [Broker](#broker) +* [Pulsar Functions](#pulsar-functions) +* [Proxy](#proxy) +* [Pulsar SQL Worker](#pulsar-sql-worker) +* [Pulsar transaction](#pulsar-transaction) + +The following types of metrics are available: + +- [Counter](https://prometheus.io/docs/concepts/metric_types/#counter): a cumulative metric that represents a single monotonically increasing counter. The value increases by default. You can reset the value to zero or restart your cluster. +- [Gauge](https://prometheus.io/docs/concepts/metric_types/#gauge): a metric that represents a single numerical value that can arbitrarily go up and down. +- [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram): a histogram samples observations (usually things like request durations or response sizes) and counts them in configurable buckets. +- [Summary](https://prometheus.io/docs/concepts/metric_types/#summary): similar to a histogram, a summary samples observations (usually things like request durations and response sizes). While it also provides a total count of observations and a sum of all observed values, it calculates configurable quantiles over a sliding time window. + +## ZooKeeper + +The ZooKeeper metrics are exposed under "/metrics" at port `8000`. 
You can use a different port by configuring the `metricsProvider.httpPort` in conf/zookeeper.conf. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| znode_count | Gauge | The number of z-nodes stored. | +| approximate_data_size | Gauge | The approximate size of all of z-nodes stored. | +| num_alive_connections | Gauge | The number of currently lived connections. | +| watch_count | Gauge | The number of watchers registered. | +| ephemerals_count | Gauge | The number of ephemeral z-nodes. | + +### Request metrics + +| Name | Type | Description | +|---|---|---| +| request_commit_queued | Counter | The total number of requests already committed by a particular server. | +| updatelatency | Summary | The update requests latency calculated in milliseconds. | +| readlatency | Summary | The read requests latency calculated in milliseconds. | + +## BookKeeper + +The BookKeeper metrics are exposed under "/metrics" at port `8000`. You can change the port by updating `prometheusStatsHttpPort` +in the `bookkeeper.conf` configuration file. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| bookie_SERVER_STATUS | Gauge | The server status for bookie server.
    • 1: the bookie is running in writable mode.
    • 0: the bookie is running in readonly mode.
    | +| bookkeeper_server_ADD_ENTRY_count | Counter | The total number of ADD_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_count | Counter | The total number of READ_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_WRITE_BYTES | Counter | The total number of bytes written to the bookie. | +| bookie_READ_BYTES | Counter | The total number of bytes read from the bookie. | +| bookkeeper_server_ADD_ENTRY_REQUEST | Summary | The summary of request latency of ADD_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_REQUEST | Summary | The summary of request latency of READ_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | + +### Journal metrics + +| Name | Type | Description | +|---|---|---| +| bookie_journal_JOURNAL_SYNC_count | Counter | The total number of journal fsync operations happening at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_journal_JOURNAL_QUEUE_SIZE | Gauge | The total number of requests pending in the journal queue. | +| bookie_journal_JOURNAL_FORCE_WRITE_QUEUE_SIZE | Gauge | The total number of force write (fsync) requests pending in the force-write queue. | +| bookie_journal_JOURNAL_CB_QUEUE_SIZE | Gauge | The total number of callbacks pending in the callback queue. | +| bookie_journal_JOURNAL_ADD_ENTRY | Summary | The summary of request latency of adding entries to the journal. | +| bookie_journal_JOURNAL_SYNC | Summary | The summary of fsync latency of syncing data to the journal disk. | + +### Storage metrics + +| Name | Type | Description | +|---|---|---| +| bookie_ledgers_count | Gauge | The total number of ledgers stored in the bookie. 
| +| bookie_entries_count | Gauge | The total number of entries stored in the bookie. | +| bookie_write_cache_size | Gauge | The bookie write cache size (in bytes). | +| bookie_read_cache_size | Gauge | The bookie read cache size (in bytes). | +| bookie_DELETED_LEDGER_COUNT | Counter | The total number of ledgers deleted since the bookie has started. | +| bookie_ledger_writable_dirs | Gauge | The number of writable directories in the bookie. | + +## Broker + +The broker metrics are exposed under "/metrics" at port `8080`. You can change the port by updating `webServicePort` to a different port +in the `broker.conf` configuration file. + +All the metrics exposed by a broker are labelled with `cluster=${pulsar_cluster}`. The name of Pulsar cluster is the value of `${pulsar_cluster}`, which you have configured in the `broker.conf` file. + +The following metrics are available for broker: + +- [ZooKeeper](#zookeeper) + - [Server metrics](#server-metrics) + - [Request metrics](#request-metrics) +- [BookKeeper](#bookkeeper) + - [Server metrics](#server-metrics-1) + - [Journal metrics](#journal-metrics) + - [Storage metrics](#storage-metrics) +- [Broker](#broker) + - [Namespace metrics](#namespace-metrics) + - [Replication metrics](#replication-metrics) + - [Topic metrics](#topic-metrics) + - [Replication metrics](#replication-metrics-1) + - [ManagedLedgerCache metrics](#managedledgercache-metrics) + - [ManagedLedger metrics](#managedledger-metrics) + - [LoadBalancing metrics](#loadbalancing-metrics) + - [BundleUnloading metrics](#bundleunloading-metrics) + - [BundleSplit metrics](#bundlesplit-metrics) + - [Subscription metrics](#subscription-metrics) + - [Consumer metrics](#consumer-metrics) + - [Managed ledger bookie client metrics](#managed-ledger-bookie-client-metrics) + - [Token metrics](#token-metrics) + - [Authentication metrics](#authentication-metrics) + - [Connection metrics](#connection-metrics) +- [Pulsar Functions](#pulsar-functions) +- [Proxy](#proxy) +- 
[Pulsar SQL Worker](#pulsar-sql-worker) +- [Pulsar transaction](#pulsar-transaction) + +### Namespace metrics + +> Namespace metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `false`. + +All the namespace metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_topics_count | Gauge | The number of Pulsar topics of the namespace owned by this broker. | +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the namespace served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the namespace connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the namespace connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the namespace coming into this broker (messages/second). | +| pulsar_rate_out | Gauge | The total message rate of the namespace going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the namespace coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the namespace going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this namespace owned by this broker (bytes). | +| pulsar_storage_logical_size | Gauge | The storage size of topics in the namespace owned by the broker without replicas (in bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this namespace owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this namespace offloaded to the tiered storage (bytes). 
| +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this namespace (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this namespace (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a namespace that the storage write latency is smaller with a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a namespace that the entry size is smaller with a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | + +#### Replication metrics + +If a namespace is configured to be replicated among multiple Pulsar clusters, the corresponding replication metrics is also exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics are also labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the namespace replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the namespace replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the namespace replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the namespace replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the namespace replicating to remote cluster (messages). | +| pulsar_replication_rate_expired | Gauge | Total rate of messages expired (messages/second). | +| pulsar_replication_connected_count | Gauge | The count of replication-subscriber up and running to replicate to remote cluster. | +| pulsar_replication_delay_in_seconds | Gauge | Time in seconds from the time a message was produced to the time when it is about to be replicated. | + + +### Topic metrics + +> Topic metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `true`. + +All the topic metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. 
+ +| Name | Type | Description | +|---|---|---| +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the topic served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the topic connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the topic connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the topic coming into this broker (messages/second). | +| pulsar_rate_out | Gauge | The total message rate of the topic going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the topic coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the topic going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this topic owned by this broker (bytes). | +| pulsar_storage_logical_size | Gauge | The storage size of topics in the namespace owned by the broker without replicas (in bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this topic owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this topic offloaded to the tiered storage (bytes). | +| pulsar_storage_backlog_quota_limit | Gauge | The total amount of the data in this topic that limit the backlog quota (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this topic (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this topic (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a topic that the storage write latency is smaller with a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a topic that the entry size is smaller with a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | +| pulsar_in_bytes_total | Counter | The total number of messages in bytes received for this topic. | +| pulsar_in_messages_total | Counter | The total number of messages received for this topic. | +| pulsar_out_bytes_total | Counter | The total number of messages in bytes read from this topic. | +| pulsar_out_messages_total | Counter | The total number of messages read from this topic. | +| pulsar_compaction_removed_event_count | Gauge | The total number of removed events of the compaction. | +| pulsar_compaction_succeed_count | Gauge | The total number of successes of the compaction. | +| pulsar_compaction_failed_count | Gauge | The total number of failures of the compaction. | +| pulsar_compaction_duration_time_in_mills | Gauge | The duration time of the compaction. | +| pulsar_compaction_read_throughput | Gauge | The read throughput of the compaction. | +| pulsar_compaction_write_throughput | Gauge | The write throughput of the compaction. | +| pulsar_compaction_latency_le_* | Histogram | The compaction latency with given quantile.
    Available thresholds:
    • pulsar_compaction_latency_le_0_5: <= 0.5ms
    • pulsar_compaction_latency_le_1: <= 1ms
    • pulsar_compaction_latency_le_5: <= 5ms
    • pulsar_compaction_latency_le_10: <= 10ms
    • pulsar_compaction_latency_le_20: <= 20ms
    • pulsar_compaction_latency_le_50: <= 50ms
    • pulsar_compaction_latency_le_100: <= 100ms
    • pulsar_compaction_latency_le_200: <= 200ms
    • pulsar_compaction_latency_le_1000: <= 1s
    • pulsar_compaction_latency_le_overflow: > 1s
    | +| pulsar_compaction_compacted_entries_count | Gauge | The total number of the compacted entries. | +| pulsar_compaction_compacted_entries_size |Gauge | The total size of the compacted entries. | + +#### Replication metrics + +If a namespace that a topic belongs to is configured to be replicated among multiple Pulsar clusters, the corresponding replication metrics is also exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics are labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the topic replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the topic replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the topic replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the topic replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the topic replicating to remote cluster (messages). | + +### ManagedLedgerCache metrics +All the ManagedLedgerCache metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_ml_cache_evictions | Gauge | The number of cache evictions during the last minute. | +| pulsar_ml_cache_hits_rate | Gauge | The number of cache hits per second. 
| +| pulsar_ml_cache_hits_throughput | Gauge | The amount of data retrieved from the cache in bytes/s | +| pulsar_ml_cache_misses_rate | Gauge | The number of cache misses per second | +| pulsar_ml_cache_misses_throughput | Gauge | The amount of data not retrieved from the cache (cache misses) in bytes/s | +| pulsar_ml_cache_pool_active_allocations | Gauge | The number of currently active allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_huge | Gauge | The number of currently active huge allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_normal | Gauge | The number of currently active normal allocations in direct arena | +| pulsar_ml_cache_pool_active_allocations_small | Gauge | The number of currently active small allocations in direct arena | +| pulsar_ml_cache_pool_allocated | Gauge | The total allocated memory of chunk lists in direct arena | +| pulsar_ml_cache_pool_used | Gauge | The total used memory of chunk lists in direct arena | +| pulsar_ml_cache_used_size | Gauge | The size in bytes used to store the entries payloads | +| pulsar_ml_count | Gauge | The number of currently opened managed ledgers | + +### ManagedLedger metrics +All the managedLedger metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- namespace: namespace=${pulsar_namespace}. ${pulsar_namespace} is the namespace name. +- quantile: quantile=${quantile}. Quantile is only for `Histogram` type metric, and represents the threshold for given Buckets.
+ +| Name | Type | Description | +| --- | --- | --- | +| pulsar_ml_AddEntryBytesRate | Gauge | The bytes/s rate of messages added | +| pulsar_ml_AddEntryWithReplicasBytesRate | Gauge | The bytes/s rate of messages added with replicas | +| pulsar_ml_AddEntryErrors | Gauge | The number of addEntry requests that failed | +| pulsar_ml_AddEntryLatencyBuckets | Histogram | The latency of adding a ledger entry with a given quantile (threshold), including time spent on waiting in queue on the broker side
    Available quantile:
    • quantile="0.0_0.5" is AddEntryLatency between (0.0ms, 0.5ms]
    • quantile="0.5_1.0" is AddEntryLatency between (0.5ms, 1.0ms]
    • quantile="1.0_5.0" is AddEntryLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is AddEntryLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is AddEntryLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is AddEntryLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is AddEntryLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is AddEntryLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is AddEntryLatency between (200ms, 1s]
    | +| pulsar_ml_AddEntryLatencyBuckets_OVERFLOW | Gauge | The number of times the AddEntryLatency is longer than 1 second | +| pulsar_ml_AddEntryMessagesRate | Gauge | The msg/s rate of messages added | +| pulsar_ml_AddEntrySucceed | Gauge | The number of addEntry requests that succeeded | +| pulsar_ml_EntrySizeBuckets | Histogram | The added entry size of a ledger with a given quantile.
    Available quantile:
    • quantile="0.0_128.0" is EntrySize between (0byte, 128byte]
    • quantile="128.0_512.0" is EntrySize between (128byte, 512byte]
    • quantile="512.0_1024.0" is EntrySize between (512byte, 1KB]
    • quantile="1024.0_2048.0" is EntrySize between (1KB, 2KB]
    • quantile="2048.0_4096.0" is EntrySize between (2KB, 4KB]
    • quantile="4096.0_16384.0" is EntrySize between (4KB, 16KB]
    • quantile="16384.0_102400.0" is EntrySize between (16KB, 100KB]
    • quantile="102400.0_1232896.0" is EntrySize between (100KB, 1MB]
    | +| pulsar_ml_EntrySizeBuckets_OVERFLOW |Gauge | The number of times the EntrySize is larger than 1MB | +| pulsar_ml_LedgerSwitchLatencyBuckets | Histogram | The ledger switch latency with a given quantile.
    Available quantile:
    • quantile="0.0_0.5" is LedgerSwitchLatency between (0ms, 0.5ms]
    • quantile="0.5_1.0" is LedgerSwitchLatency between (0.5ms, 1ms]
    • quantile="1.0_5.0" is LedgerSwitchLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is LedgerSwitchLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is LedgerSwitchLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is LedgerSwitchLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is LedgerSwitchLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is LedgerSwitchLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is LedgerSwitchLatency between (200ms, 1000ms]
    | +| pulsar_ml_LedgerSwitchLatencyBuckets_OVERFLOW | Gauge | The number of times the ledger switch latency is longer than 1 second | +| pulsar_ml_LedgerAddEntryLatencyBuckets | Histogram | The latency for bookie client to persist a ledger entry from broker to BookKeeper service with a given quantile (threshold).
    Available quantile:
    • quantile="0.0_0.5" is LedgerAddEntryLatency between (0.0ms, 0.5ms]
    • quantile="0.5_1.0" is LedgerAddEntryLatency between (0.5ms, 1.0ms]
    • quantile="1.0_5.0" is LedgerAddEntryLatency between (1ms, 5ms]
    • quantile="5.0_10.0" is LedgerAddEntryLatency between (5ms, 10ms]
    • quantile="10.0_20.0" is LedgerAddEntryLatency between (10ms, 20ms]
    • quantile="20.0_50.0" is LedgerAddEntryLatency between (20ms, 50ms]
    • quantile="50.0_100.0" is LedgerAddEntryLatency between (50ms, 100ms]
    • quantile="100.0_200.0" is LedgerAddEntryLatency between (100ms, 200ms]
    • quantile="200.0_1000.0" is LedgerAddEntryLatency between (200ms, 1s]
    | +| pulsar_ml_LedgerAddEntryLatencyBuckets_OVERFLOW | Gauge | The number of times the LedgerAddEntryLatency is longer than 1 second | +| pulsar_ml_MarkDeleteRate | Gauge | The rate of mark-delete ops/s | +| pulsar_ml_NumberOfMessagesInBacklog | Gauge | The number of backlog messages for all the consumers | +| pulsar_ml_ReadEntriesBytesRate | Gauge | The bytes/s rate of messages read | +| pulsar_ml_ReadEntriesErrors | Gauge | The number of readEntries requests that failed | +| pulsar_ml_ReadEntriesRate | Gauge | The msg/s rate of messages read | +| pulsar_ml_ReadEntriesSucceeded | Gauge | The number of readEntries requests that succeeded | +| pulsar_ml_StoredMessagesSize | Gauge | The total size of the messages in active ledgers (accounting for the multiple copies stored) | + +### Managed cursor acknowledgment state + +The acknowledgment state is persisted to the ledger first. When the acknowledgment state fails to be persisted to the ledger, it is persisted to ZooKeeper. To track the stats of acknowledgment, you can configure the metrics for the managed cursor. + +All the cursor acknowledgment state metrics are labelled with the following labels: + +- namespace: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +- ledger_name: `ledger_name=${pulsar_ledger_name}`. `${pulsar_ledger_name}` is the ledger name. + +- cursor_name: `cursor_name=${pulsar_cursor_name}`. `${pulsar_cursor_name}` is the cursor name.
+ +Name |Type |Description +|---|---|--- +brk_ml_cursor_persistLedgerSucceed(namespace="", ledger_name="", cursor_name="")|Gauge|The number of acknowledgment states that are persisted to a ledger.| +brk_ml_cursor_persistLedgerErrors(namespace="", ledger_name="", cursor_name="")|Gauge|The number of ledger errors that occurred when acknowledgment states failed to be persisted to the ledger.| +brk_ml_cursor_persistZookeeperSucceed(namespace="", ledger_name="", cursor_name="")|Gauge|The number of acknowledgment states that are persisted to ZooKeeper. +brk_ml_cursor_persistZookeeperErrors(namespace="", ledger_name="", cursor_name="")|Gauge|The number of errors that occurred when acknowledgment states failed to be persisted to ZooKeeper. +brk_ml_cursor_nonContiguousDeletedMessagesRange(namespace="", ledger_name="", cursor_name="")|Gauge|The number of non-contiguous deleted messages ranges. +brk_ml_cursor_writeLedgerSize(namespace="", ledger_name="", cursor_name="")|Gauge|The size of data written to the ledger. +brk_ml_cursor_writeLedgerLogicalSize(namespace="", ledger_name="", cursor_name="")|Gauge|The size of data written to the ledger (not accounting for replicas). +brk_ml_cursor_readLedgerSize(namespace="", ledger_name="", cursor_name="")|Gauge|The size of data read from the ledger. + +### LoadBalancing metrics +All the loadbalancing metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- broker: broker=${broker}. ${broker} is the IP address of the broker +- metric: metric="loadBalancing". 
+ +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_bandwidth_in_usage | Gauge | The broker inbound bandwidth usage | +| pulsar_lb_bandwidth_out_usage | Gauge | The broker outbound bandwidth usage | +| pulsar_lb_cpu_usage | Gauge | The broker cpu usage | +| pulsar_lb_directMemory_usage | Gauge | The broker process direct memory usage | +| pulsar_lb_memory_usage | Gauge | The broker process memory usage | + +#### BundleUnloading metrics +All the bundleUnloading metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- metric: metric="bundleUnloading". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_unload_broker_count | Counter | Unload broker count in this bundle unloading | +| pulsar_lb_unload_bundle_count | Counter | Bundle unload count in this bundle unloading | + +#### BundleSplit metrics +All the bundleSplit metrics are labelled with the following labels: +- cluster: cluster=${pulsar_cluster}. ${pulsar_cluster} is the cluster name that you have configured in the `broker.conf` file. +- metric: metric="bundlesSplit". + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_lb_bundles_split_count | Counter | Bundle split count in this bundle splitting check interval | + +### Subscription metrics + +> Subscription metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `true`. + +All the subscription metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. 
+ +| Name | Type | Description | +|---|---|---| +| pulsar_subscription_back_log | Gauge | The total backlog of a subscription (messages). | +| pulsar_subscription_delayed | Gauge | The total number of messages that are delayed to be dispatched for a subscription (messages). | +| pulsar_subscription_msg_rate_redeliver | Gauge | The total message rate for messages being redelivered (messages/second). | +| pulsar_subscription_unacked_messages | Gauge | The total number of unacknowledged messages of a subscription (messages). | +| pulsar_subscription_blocked_on_unacked_messages | Gauge | Indicate whether a subscription is blocked on unacknowledged messages or not.
    • 1 means the subscription is blocked on waiting unacknowledged messages to be acked.
    • 0 means the subscription is not blocked on waiting unacknowledged messages to be acked.
    | +| pulsar_subscription_msg_rate_out | Gauge | The total message dispatch rate for a subscription (messages/second). | +| pulsar_subscription_msg_throughput_out | Gauge | The total message dispatch throughput for a subscription (bytes/second). | + +### Consumer metrics + +> Consumer metrics are only exposed when both `exposeTopicLevelMetricsInPrometheus` and `exposeConsumerLevelMetricsInPrometheus` are set to `true`. + +All the consumer metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. +- *consumer_name*: `consumer_name=${consumer_name}`. `${consumer_name}` is the topic consumer name. +- *consumer_id*: `consumer_id=${consumer_id}`. `${consumer_id}` is the topic consumer id. + +| Name | Type | Description | +|---|---|---| +| pulsar_consumer_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_consumer_unacked_messages | Gauge | The total number of unacknowledged messages of a consumer (messages). | +| pulsar_consumer_blocked_on_unacked_messages | Gauge | Indicate whether a consumer is blocked on unacknowledged messages or not.
    • 1 means the consumer is blocked on waiting unacknowledged messages to be acked.
    • 0 means the consumer is not blocked on waiting unacknowledged messages to be acked.
    | +| pulsar_consumer_msg_rate_out | Gauge | The total message dispatch rate for a consumer (messages/second). | +| pulsar_consumer_msg_throughput_out | Gauge | The total message dispatch throughput for a consumer (bytes/second). | +| pulsar_consumer_available_permits | Gauge | The available permits for a consumer. | + +### Managed ledger bookie client metrics + +All the managed ledger bookie client metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +| --- | --- | --- | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_completed_tasks_* | Gauge | The number of tasks that the scheduler executor has completed.
    The number of metrics is determined by the number of scheduler executor threads configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_queue_* | Gauge | The number of tasks queued in the scheduler executor's queue.
    The number of metrics is determined by the number of scheduler executor threads configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_total_tasks_* | Gauge | The total number of tasks the scheduler executor received.
    The number of metrics is determined by the number of scheduler executor threads configured by `managedLedgerNumSchedulerThreads` in `broker.conf`.
    | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_task_execution | Summary | The scheduler task execution latency calculated in milliseconds. | +| pulsar_managedLedger_client_bookkeeper_ml_scheduler_task_queued | Summary | The scheduler task queued latency calculated in milliseconds. | + +### Token metrics + +All the token metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. + +| Name | Type | Description | +|---|---|---| +| pulsar_expired_token_count | Counter | The number of expired tokens in Pulsar. | +| pulsar_expiring_token_minutes | Histogram | The remaining time of expiring tokens in minutes. | + +### Authentication metrics + +All the authentication metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *provider_name*: `provider_name=${provider_name}`. `${provider_name}` is the class name of the authentication provider. +- *auth_method*: `auth_method=${auth_method}`. `${auth_method}` is the authentication method of the authentication provider. +- *reason*: `reason=${reason}`. `${reason}` is the reason for failing authentication operation. (This label is only for `pulsar_authentication_failures_count`.) + +| Name | Type | Description | +|---|---|---| +| pulsar_authentication_success_count| Counter | The number of successful authentication operations. | +| pulsar_authentication_failures_count | Counter | The number of failing authentication operations. | + +### Connection metrics + +All the connection metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *broker*: `broker=${advertised_address}`. 
`${advertised_address}` is the advertised address of the broker. +- *metric*: `metric=${metric}`. `${metric}` is the connection metric collective name. + +| Name | Type | Description | +|---|---|---| +| pulsar_active_connections| Gauge | The number of active connections. | +| pulsar_connection_created_total_count | Gauge | The total number of connections. | +| pulsar_connection_create_success_count | Gauge | The number of successfully created connections. | +| pulsar_connection_create_fail_count | Gauge | The number of failed connections. | +| pulsar_connection_closed_total_count | Gauge | The total number of closed connections. | +| pulsar_broker_throttled_connections | Gauge | The number of throttled connections. | +| pulsar_broker_throttled_connections_global_limit | Gauge | The number of throttled connections because of per-connection limit. | + +## Pulsar Functions + +All the Pulsar Functions metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_function_processed_successfully_total | Counter | The total number of messages processed successfully. | +| pulsar_function_processed_successfully_total_1min | Counter | The total number of messages processed successfully in the last 1 minute. | +| pulsar_function_system_exceptions_total | Counter | The total number of system exceptions. | +| pulsar_function_system_exceptions_total_1min | Counter | The total number of system exceptions in the last 1 minute. | +| pulsar_function_user_exceptions_total | Counter | The total number of user exceptions. | +| pulsar_function_user_exceptions_total_1min | Counter | The total number of user exceptions in the last 1 minute. 
| +| pulsar_function_process_latency_ms | Summary | The process latency in milliseconds. | +| pulsar_function_process_latency_ms_1min | Summary | The process latency in milliseconds in the last 1 minute. | +| pulsar_function_last_invocation | Gauge | The timestamp of the last invocation of the function. | +| pulsar_function_received_total | Counter | The total number of messages received from source. | +| pulsar_function_received_total_1min | Counter | The total number of messages received from source in the last 1 minute. | +pulsar_function_user_metric_ | Summary|The user-defined metrics. + +## Connectors + +All the Pulsar connector metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +Connector metrics contain **source** metrics and **sink** metrics. + +- **Source** metrics + + | Name | Type | Description | + |---|---|---| + pulsar_source_written_total|Counter|The total number of records written to a Pulsar topic. + pulsar_source_written_total_1min|Counter|The total number of records written to a Pulsar topic in the last 1 minute. + pulsar_source_received_total|Counter|The total number of records received from source. + pulsar_source_received_total_1min|Counter|The total number of records received from source in the last 1 minute. + pulsar_source_last_invocation|Gauge|The timestamp of the last invocation of the source. + pulsar_source_source_exception|Gauge|The exception from a source. + pulsar_source_source_exceptions_total|Counter|The total number of source exceptions. + pulsar_source_source_exceptions_total_1min |Counter|The total number of source exceptions in the last 1 minute. + pulsar_source_system_exception|Gauge|The exception from system code. + pulsar_source_system_exceptions_total|Counter|The total number of system exceptions. 
+ pulsar_source_system_exceptions_total_1min|Counter|The total number of system exceptions in the last 1 minute. + pulsar_source_user_metric_ | Summary|The user-defined metrics. + +- **Sink** metrics + + | Name | Type | Description | + |---|---|---| + pulsar_sink_written_total|Counter| The total number of records processed by a sink. + pulsar_sink_written_total_1min|Counter| The total number of records processed by a sink in the last 1 minute. + pulsar_sink_received_total_1min|Counter| The total number of messages that a sink has received from Pulsar topics in the last 1 minute. + pulsar_sink_received_total|Counter| The total number of records that a sink has received from Pulsar topics. + pulsar_sink_last_invocation|Gauge|The timestamp of the last invocation of the sink. + pulsar_sink_sink_exception|Gauge|The exception from a sink. + pulsar_sink_sink_exceptions_total|Counter|The total number of sink exceptions. + pulsar_sink_sink_exceptions_total_1min |Counter|The total number of sink exceptions in the last 1 minute. + pulsar_sink_system_exception|Gauge|The exception from system code. + pulsar_sink_system_exceptions_total|Counter|The total number of system exceptions. + pulsar_sink_system_exceptions_total_1min|Counter|The total number of system exceptions in the last 1 minute. + pulsar_sink_user_metric_ | Summary|The user-defined metrics. + +## Proxy + +All the proxy metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *kubernetes_pod_name*: `kubernetes_pod_name=${kubernetes_pod_name}`. `${kubernetes_pod_name}` is the Kubernetes pod name. + +| Name | Type | Description | +|---|---|---| +| pulsar_proxy_active_connections | Gauge | Number of connections currently active in the proxy. | +| pulsar_proxy_new_connections | Counter | Counter of connections being opened in the proxy. 
| +| pulsar_proxy_rejected_connections | Counter | Counter for connections rejected due to throttling. | +| pulsar_proxy_binary_ops | Counter | Counter of proxy operations. | +| pulsar_proxy_binary_bytes | Counter | Counter of proxy bytes. | + +## Pulsar SQL Worker + +| Name | Type | Description | +|---|---|---| +| split_bytes_read | Counter | Number of bytes read from BookKeeper. | +| split_num_messages_deserialized | Counter | Number of messages deserialized. | +| split_num_record_deserialized | Counter | Number of records deserialized. | +| split_bytes_read_per_query | Summary | Total number of bytes read per query. | +| split_entry_deserialize_time | Summary | Time spent on deserializing entries. | +| split_entry_deserialize_time_per_query | Summary | Time spent on deserializing entries per query. | +| split_entry_queue_dequeue_wait_time | Summary | Time spent on waiting to get entry from entry queue because it is empty. | +| split_entry_queue_dequeue_wait_time_per_query | Summary | Total time spent on waiting to get entry from entry queue per query. | +| split_message_queue_dequeue_wait_time_per_query | Summary | Time spent on waiting to dequeue from message queue because it is empty per query. | +| split_message_queue_enqueue_wait_time | Summary | Time spent on waiting for message queue enqueue because the message queue is full. | +| split_message_queue_enqueue_wait_time_per_query | Summary | Time spent on waiting for message queue enqueue because the message queue is full per query. | +| split_num_entries_per_batch | Summary | Number of entries per batch. | +| split_num_entries_per_query | Summary | Number of entries per query. | +| split_num_messages_deserialized_per_entry | Summary | Number of messages deserialized per entry. | +| split_num_messages_deserialized_per_query | Summary | Number of messages deserialized per query. | +| split_read_attempts | Summary | Number of read attempts (fail if queues are full). 
| +| split_read_attempts_per_query | Summary | Number of read attempts per query. | +| split_read_latency_per_batch | Summary | Latency of reads per batch. | +| split_read_latency_per_query | Summary | Total read latency per query. | +| split_record_deserialize_time | Summary | Time spent on deserializing message to record. For example, Avro, JSON, and so on. | +| split_record_deserialize_time_per_query | Summary | Time spent on deserializing message to record per query. | +| split_total_execution_time | Summary | The total execution time. | + +## Pulsar transaction + +All the transaction metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you have configured in the `broker.conf` file. +- *coordinator_id*: `coordinator_id=${coordinator_id}`. `${coordinator_id}` is the coordinator id. + +| Name | Type | Description | +|---|---|---| +| pulsar_txn_active_count | Gauge | Number of active transactions. | +| pulsar_txn_created_count | Counter | Number of created transactions. | +| pulsar_txn_committed_count | Counter | Number of committed transactions. | +| pulsar_txn_aborted_count | Counter | Number of aborted transactions of this coordinator. | +| pulsar_txn_timeout_count | Counter | Number of timeout transactions. | +| pulsar_txn_append_log_count | Counter | Number of append transaction logs. | +| pulsar_txn_execution_latency_le_* | Histogram | Transaction execution latency.
    Available latencies are as below:
    • latency="10" is TransactionExecutionLatency between (0ms, 10ms]
    • latency="20" is TransactionExecutionLatency between (10ms, 20ms]
    • latency="50" is TransactionExecutionLatency between (20ms, 50ms]
    • latency="100" is TransactionExecutionLatency between (50ms, 100ms]
    • latency="500" is TransactionExecutionLatency between (100ms, 500ms]
    • latency="1000" is TransactionExecutionLatency between (500ms, 1000ms]
    • latency="5000" is TransactionExecutionLatency between (1s, 5s]
    • latency="15000" is TransactionExecutionLatency between (5s, 15s]
    • latency="30000" is TransactionExecutionLatency between (15s, 30s]
    • latency="60000" is TransactionExecutionLatency between (30s, 60s]
    • latency="300000" is TransactionExecutionLatency between (1m,5m]
    • latency="1500000" is TransactionExecutionLatency between (5m,15m]
    • latency="3000000" is TransactionExecutionLatency between (15m,30m]
    • latency="overflow" is TransactionExecutionLatency between (30m,∞]
    | diff --git a/site2/website/versioned_docs/version-2.9.x/reference-pulsar-admin.md b/site2/website/versioned_docs/version-2.9.x/reference-pulsar-admin.md new file mode 100644 index 0000000000000..e306289a8798a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/reference-pulsar-admin.md @@ -0,0 +1,3297 @@ +--- +id: reference-pulsar-admin +title: Pulsar admin CLI +sidebar_label: "Pulsar Admin CLI" +original_id: reference-pulsar-admin +--- + +> **Important** +> +> This page is deprecated and not updated anymore. For the latest and complete information about `Pulsar admin`, including commands, flags, descriptions, and more, see [Pulsar admin doc](https://pulsar.apache.org/tools/pulsar-admin/) + +The `pulsar-admin` tool enables you to manage Pulsar installations, including clusters, brokers, namespaces, tenants, and more. + +Usage + +```bash + +$ pulsar-admin command + +``` + +Commands +* `broker-stats` +* `brokers` +* `clusters` +* `functions` +* `functions-worker` +* `namespaces` +* `ns-isolation-policy` +* `sources` + + For more information, see [here](io-cli.md#sources) +* `sinks` + + For more information, see [here](io-cli.md#sinks) +* `topics` +* `tenants` +* `resource-quotas` +* `schemas` + +## `broker-stats` + +Operations to collect broker statistics + +```bash + +$ pulsar-admin broker-stats subcommand + +``` + +Subcommands +* `allocator-stats` +* `topics(destinations)` +* `mbeans` +* `monitoring-metrics` +* `load-report` + + +### `allocator-stats` + +Dump allocator stats + +Usage + +```bash + +$ pulsar-admin broker-stats allocator-stats allocator-name + +``` + +### `topics(destinations)` + +Dump topic stats + +Usage + +```bash + +$ pulsar-admin broker-stats topics options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + +### `mbeans` + +Dump Mbean stats + +Usage + +```bash + +$ pulsar-admin broker-stats mbeans options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, 
`--indent`|Indent JSON output|false| + + +### `monitoring-metrics` + +Dump metrics for monitoring + +Usage + +```bash + +$ pulsar-admin broker-stats monitoring-metrics options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + + +### `load-report` + +Dump broker load-report + +Usage + +```bash + +$ pulsar-admin broker-stats load-report + +``` + +## `brokers` + +Operations about brokers + +```bash + +$ pulsar-admin brokers subcommand + +``` + +Subcommands +* `list` +* `namespaces` +* `update-dynamic-config` +* `list-dynamic-config` +* `get-all-dynamic-config` +* `get-internal-config` +* `get-runtime-config` +* `healthcheck` + +### `list` +List active brokers of the cluster + +Usage + +```bash + +$ pulsar-admin brokers list cluster-name + +``` + +### `leader-broker` +Get the information of the leader broker + +Usage + +```bash + +$ pulsar-admin brokers leader-broker + +``` + +### `namespaces` +List namespaces owned by the broker + +Usage + +```bash + +$ pulsar-admin brokers namespaces cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--url`|The URL for the broker|| + + +### `update-dynamic-config` +Update a broker's dynamic service configuration + +Usage + +```bash + +$ pulsar-admin brokers update-dynamic-config options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| +|`--value`|Value for the configuration parameter value specified using the `--config` flag|| + + +### `list-dynamic-config` +Get list of updatable configuration name + +Usage + +```bash + +$ pulsar-admin brokers list-dynamic-config + +``` + +### `delete-dynamic-config` +Delete dynamic-serviceConfiguration of broker + +Usage + +```bash + +$ pulsar-admin brokers delete-dynamic-config options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| + + +### `get-all-dynamic-config` +Get all 
overridden dynamic-configuration values + +Usage + +```bash + +$ pulsar-admin brokers get-all-dynamic-config + +``` + +### `get-internal-config` +Get internal configuration information + +Usage + +```bash + +$ pulsar-admin brokers get-internal-config + +``` + +### `get-runtime-config` +Get runtime configuration values + +Usage + +```bash + +$ pulsar-admin brokers get-runtime-config + +``` + +### `healthcheck` +Run a health check against the broker + +Usage + +```bash + +$ pulsar-admin brokers healthcheck + +``` + +## `clusters` +Operations about clusters + +Usage + +```bash + +$ pulsar-admin clusters subcommand + +``` + +Subcommands +* `get` +* `create` +* `update` +* `delete` +* `list` +* `update-peer-clusters` +* `get-peer-clusters` +* `get-failure-domain` +* `create-failure-domain` +* `update-failure-domain` +* `delete-failure-domain` +* `list-failure-domains` + + +### `get` +Get the configuration data for the specified cluster + +Usage + +```bash + +$ pulsar-admin clusters get cluster-name + +``` + +### `create` +Provisions a new cluster. This operation requires Pulsar super-user privileges. 
+ +Usage + +```bash + +$ pulsar-admin clusters create cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `update` +Update the configuration for a cluster + +Usage + +```bash + +$ pulsar-admin clusters update cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `delete` +Deletes an existing cluster + +Usage + +```bash + +$ pulsar-admin clusters delete cluster-name + +``` + +### `list` +List the existing clusters + +Usage + +```bash + +$ pulsar-admin clusters list + +``` + +### `update-peer-clusters` +Update peer cluster names + +Usage + +```bash + +$ pulsar-admin clusters update-peer-clusters cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--peer-clusters`|Comma separated peer cluster names (Pass empty string "" to delete list)|| + +### `get-peer-clusters` +Get list of peer clusters + +Usage + +```bash + +$ pulsar-admin clusters get-peer-clusters + +``` + +### `get-failure-domain` +Get the configuration brokers of a failure domain + +Usage + +```bash + +$ pulsar-admin clusters get-failure-domain cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `create-failure-domain` +Create a new failure domain for a cluster (updates it if already created) + +Usage + +```bash + +$ pulsar-admin clusters create-failure-domain cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma separated 
broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `update-failure-domain` +Update failure domain for a cluster (creates a new one if not exist) + +Usage + +```bash + +$ pulsar-admin clusters update-failure-domain cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `delete-failure-domain` +Delete an existing failure domain + +Usage + +```bash + +$ pulsar-admin clusters delete-failure-domain cluster-name options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `list-failure-domains` +List the existing failure domains for a cluster + +Usage + +```bash + +$ pulsar-admin clusters list-failure-domains cluster-name + +``` + +## `functions` + +A command-line interface for Pulsar Functions + +Usage + +```bash + +$ pulsar-admin functions subcommand + +``` + +Subcommands +* `localrun` +* `create` +* `delete` +* `update` +* `get` +* `restart` +* `stop` +* `start` +* `status` +* `stats` +* `list` +* `querystate` +* `putstate` +* `trigger` + + +### `localrun` +Run the Pulsar Function locally (rather than deploying it to the Pulsar cluster) + + +Usage + +```bash + +$ pulsar-admin functions localrun options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source 
subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--broker-service-url `|The URL of the Pulsar broker|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--client-auth-params`|Client authentication param|| +|`--client-auth-plugin`|Client authentication plugin using which function-process can connect to broker|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--hostname-verification-enabled`|Enable hostname verification|false| +|`--instance-id-offset`|Start the instanceIds from this offset|0| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python). 
It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--go`|Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--state-storage-service-url`|The URL for the state storage service. By default, it is set to the service URL of the Apache BookKeeper. This service URL must be added manually when the Pulsar Function runs locally. || +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--inputs] and [--topics-pattern] are mutually exclusive. 
Add SerDe class name for a pattern in --custom-serde-inputs (supported for Java functions only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--tls-allow-insecure`|Allow insecure tls connection|false| +|`--tls-trust-cert-path`|The tls trust cert file path|| +|`--use-tls`|Use tls connection|false| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `create` +Create a Pulsar Function in cluster mode (i.e. 
deploy it on a Pulsar cluster) + +Usage + +``` + +$ pulsar-admin functions create options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--go`|Path to the main Go executable binary for the function (if the function is written in Go). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--inputs] and [--topics-pattern] are mutually exclusive. 
Add SerDe class name for a pattern in --custom-serde-inputs (supported for Java functions only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `delete` +Delete a Pulsar Function that's running on a Pulsar cluster + +Usage + +```bash + +$ pulsar-admin functions delete options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `update` +Update a Pulsar Function that's been deployed to a Pulsar cluster + +Usage + +```bash + +$ pulsar-admin functions update options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The 
function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python). It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--go`|Path to the main Go executable binary for the function (if the function is written in Go). 
It also supports URL path [http/https/file (file protocol assumes that file already exists on worker host)/function (package URL from packages management service)] from which worker can download the package.|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--inputs] and [--topics-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for Java functions only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--retain-key-ordering`|Function consumes and processes messages in key order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--producer-config`| The custom producer configuration (as a JSON string) | | + + +### `get` +Fetch information about a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions get options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `restart` +Restart function instance + +Usage + +```bash + +$ pulsar-admin functions restart 
options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (restart all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stop` +Stops function instance + +Usage + +```bash + +$ pulsar-admin functions stop options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (stop all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `start` +Starts a stopped function instance + +Usage + +```bash + +$ pulsar-admin functions start options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (start all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `status` +Check the current status of a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions status options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (Get-status of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stats` +Get the current stats of a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions stats options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| 
+|`--instance-id`|The function instanceId (Get-stats of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + +### `list` +List all of the Pulsar Functions running under a specific tenant and namespace + +Usage + +```bash + +$ pulsar-admin functions list options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `querystate` +Fetch the current state associated with a Pulsar Function running in cluster mode + +Usage + +```bash + +$ pulsar-admin functions querystate options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`-k`, `--key`|The key for the state you want to fetch|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`-w`, `--watch`|Watch for changes in the value associated with a key for a Pulsar Function|false| + +### `putstate` +Put a key/value pair to the state associated with a Pulsar Function + +Usage + +```bash + +$ pulsar-admin functions putstate options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the Pulsar Function|| +|`--name`|The name of a Pulsar Function|| +|`--namespace`|The namespace of a Pulsar Function|| +|`--tenant`|The tenant of a Pulsar Function|| +|`-s`, `--state`|The FunctionState that needs to be put|| + +### `trigger` +Triggers the specified Pulsar Function with a supplied value + +Usage + +```bash + +$ pulsar-admin functions trigger options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`--topic`|The specific 
topic name that the function consumes from that you want to inject the data to|| +|`--trigger-file`|The path to the file that contains the data with which you'd like to trigger the function|| +|`--trigger-value`|The value with which you want to trigger the function|| + + +## `functions-worker` +Operations to collect function-worker statistics + +```bash + +$ pulsar-admin functions-worker subcommand + +``` + +Subcommands + +* `function-stats` +* `get-cluster` +* `get-cluster-leader` +* `get-function-assignments` +* `monitoring-metrics` + +### `function-stats` + +Dump all functions stats running on this broker + +Usage + +```bash + +$ pulsar-admin functions-worker function-stats + +``` + +### `get-cluster` + +Get all workers belonging to this cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-cluster + +``` + +### `get-cluster-leader` + +Get the leader of the worker cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-cluster-leader + +``` + +### `get-function-assignments` + +Get the assignments of the functions across the worker cluster + +Usage + +```bash + +$ pulsar-admin functions-worker get-function-assignments + +``` + +### `monitoring-metrics` + +Dump metrics for Monitoring + +Usage + +```bash + +$ pulsar-admin functions-worker monitoring-metrics + +``` + +## `namespaces` + +Operations for managing namespaces + +```bash + +$ pulsar-admin namespaces subcommand + +``` + +Subcommands +* `list` +* `topics` +* `policies` +* `create` +* `delete` +* `set-deduplication` +* `set-auto-topic-creation` +* `remove-auto-topic-creation` +* `set-auto-subscription-creation` +* `remove-auto-subscription-creation` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `grant-subscription-permission` +* `revoke-subscription-permission` +* `set-clusters` +* `get-clusters` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` +* `set-persistence` +* `get-message-ttl` +* `set-message-ttl` +* 
`remove-message-ttl` +* `get-anti-affinity-group` +* `set-anti-affinity-group` +* `get-anti-affinity-namespaces` +* `delete-anti-affinity-group` +* `get-retention` +* `set-retention` +* `unload` +* `split-bundle` +* `set-dispatch-rate` +* `get-dispatch-rate` +* `set-replicator-dispatch-rate` +* `get-replicator-dispatch-rate` +* `set-subscribe-rate` +* `get-subscribe-rate` +* `set-subscription-dispatch-rate` +* `get-subscription-dispatch-rate` +* `clear-backlog` +* `unsubscribe` +* `set-encryption-required` +* `set-delayed-delivery` +* `get-delayed-delivery` +* `set-subscription-auth-mode` +* `get-max-producers-per-topic` +* `set-max-producers-per-topic` +* `get-max-consumers-per-topic` +* `set-max-consumers-per-topic` +* `get-max-consumers-per-subscription` +* `set-max-consumers-per-subscription` +* `get-max-unacked-messages-per-subscription` +* `set-max-unacked-messages-per-subscription` +* `get-max-unacked-messages-per-consumer` +* `set-max-unacked-messages-per-consumer` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `get-offload-threshold` +* `set-offload-threshold` +* `get-offload-deletion-lag` +* `set-offload-deletion-lag` +* `clear-offload-deletion-lag` +* `get-schema-autoupdate-strategy` +* `set-schema-autoupdate-strategy` +* `set-offload-policies` +* `get-offload-policies` +* `set-max-subscriptions-per-topic` +* `get-max-subscriptions-per-topic` +* `remove-max-subscriptions-per-topic` + + +### `list` +Get the namespaces for a tenant + +Usage + +```bash + +$ pulsar-admin namespaces list tenant-name + +``` + +### `topics` +Get the list of topics for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces topics tenant/namespace + +``` + +### `policies` +Get the configuration policies of a namespace + +Usage + +```bash + +$ pulsar-admin namespaces policies tenant/namespace + +``` + +### `create` +Create a new namespace + +Usage + +```bash + +$ pulsar-admin namespaces create tenant/namespace options + +``` + +Options 
+|Flag|Description|Default| +|---|---|---| +|`-b`, `--bundles`|The number of bundles to activate|0| +|`-c`, `--clusters`|List of clusters this namespace will be assigned|| + + +### `delete` +Deletes a namespace. The namespace needs to be empty + +Usage + +```bash + +$ pulsar-admin namespaces delete tenant/namespace + +``` + +### `set-deduplication` +Enable or disable message deduplication on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-deduplication tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified namespace|false| +|`--disable`, `-d`|Disable message deduplication on the specified namespace|false| + +### `set-auto-topic-creation` +Enable or disable autoTopicCreation for a namespace, overriding broker settings + +Usage + +```bash + +$ pulsar-admin namespaces set-auto-topic-creation tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable allowAutoTopicCreation on namespace|false| +|`--disable`, `-d`|Disable allowAutoTopicCreation on namespace|false| +|`--type`, `-t`|Type of topic to be auto-created. 
Possible values: (partitioned, non-partitioned)|non-partitioned| +|`--num-partitions`, `-n`|Default number of partitions of topic to be auto-created, applicable to partitioned topics only|| + +### `remove-auto-topic-creation` +Remove override of autoTopicCreation for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces remove-auto-topic-creation tenant/namespace + +``` + +### `set-auto-subscription-creation` +Enable autoSubscriptionCreation for a namespace, overriding broker settings + +Usage + +```bash + +$ pulsar-admin namespaces set-auto-subscription-creation tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable allowAutoSubscriptionCreation on namespace|false| + +### `remove-auto-subscription-creation` +Remove override of autoSubscriptionCreation for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces remove-auto-subscription-creation tenant/namespace + +``` + +### `permissions` +Get the permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces permissions tenant/namespace + +``` + +### `grant-permission` +Grant permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces grant-permission tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces revoke-permission tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| + +### `grant-subscription-permission` +Grant permissions to access subscription admin-api + +Usage + +```bash + +$ pulsar-admin namespaces grant-subscription-permission tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--roles`|The client roles to 
which to grant the permissions (comma separated roles)|| +|`--subscription`|The subscription name for which permission will be granted to roles|| + +### `revoke-subscription-permission` +Revoke permissions to access subscription admin-api + +Usage + +```bash + +$ pulsar-admin namespaces revoke-subscription-permission tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role from which to revoke the permissions|| +|`--subscription`|The subscription name for which permission will be revoked from roles|| + +### `set-clusters` +Set replication clusters for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-clusters tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--clusters`|Replication clusters ID list (comma-separated values)|| + + +### `get-clusters` +Get replication clusters for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-clusters tenant/namespace + +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-backlog-quotas tenant/namespace + +``` + +### `set-backlog-quota` +Set a backlog quota policy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-backlog-quota tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)|| +|`-lt`, `--limitTime`|Time limit in seconds. A non-positive number disables the time limit. (For example, 3600 for 1 hour)|| +|`-p`, `--policy`|The retention policy to enforce when the limit is reached. The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`|| +|`-t`, `--type`|Backlog quota type to set. 
The valid options are: `destination_storage`, `message_age` |destination_storage| + +Example + +```bash + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ +--limit 2G \ +--policy producer_request_hold + +``` + +```bash + +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ +--limitTime 3600 \ +--policy producer_request_hold \ +--type message_age + +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a namespace + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--type`|Backlog quota type to remove. The valid options are: `destination_storage`, `message_age` |destination_storage| + +Usage + +```bash + +$ pulsar-admin namespaces remove-backlog-quota tenant/namespace + +``` + +### `get-persistence` +Get the persistence policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-persistence tenant/namespace + +``` + +### `set-persistence` +Set the persistence policies for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-persistence tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-a`, `--bookkeeper-ack-quorum`|The number of acks (guaranteed copies) to wait for each entry|0| +|`-e`, `--bookkeeper-ensemble`|The number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + + +### `get-message-ttl` +Get the message TTL for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-message-ttl tenant/namespace + +``` + +### `set-message-ttl` +Set the message TTL for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-message-ttl tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL in seconds. When the value is set to `0`, TTL is disabled. TTL is disabled by default. 
|0| + +### `remove-message-ttl` +Remove the message TTL for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces remove-message-ttl tenant/namespace + +``` + +### `get-anti-affinity-group` +Get Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-anti-affinity-group tenant/namespace + +``` + +### `set-anti-affinity-group` +Set Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-anti-affinity-group tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-g`, `--group`|Anti-affinity group name|| + +### `get-anti-affinity-namespaces` +Get Anti-affinity namespaces grouped with the given anti-affinity group name + +Usage + +```bash + +$ pulsar-admin namespaces get-anti-affinity-namespaces options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--cluster`|Cluster name|| +|`-g`, `--group`|Anti-affinity group name|| +|`-p`, `--tenant`|Tenant is only used for authorization. Client has to be admin of any of the tenant to access this api|| + +### `delete-anti-affinity-group` +Remove Anti-affinity group name for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces delete-anti-affinity-group tenant/namespace + +``` + +### `get-retention` +Get the retention policy that is applied to each topic within the specified namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-retention tenant/namespace + +``` + +### `set-retention` +Set the retention policy for each topic within the specified namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-retention tenant/namespace + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|The retention size limits (for example 10M, 16G or 3T) for each topic in the namespace. 0 means no retention and -1 means infinite size retention|| +|`-t`, `--time`|The retention time in minutes, hours, days, or weeks. Examples: 100m, 13h, 2d, 5w. 
0 means no retention and -1 means infinite time retention|| + + +### `unload` +Unload a namespace or namespace bundle from the current serving broker. + +Usage + +```bash + +$ pulsar-admin namespaces unload tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| + +### `split-bundle` +Split a namespace-bundle from the current serving broker + +Usage + +```bash + +$ pulsar-admin namespaces split-bundle tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-u`, `--unload`|Unload newly split bundles after splitting old bundle|false| + +### `set-dispatch-rate` +Set message-dispatch-rate for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-dispatch-rate tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in second type (default 1 second will be overwrite if not passed)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-dispatch-rate` +Get configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-dispatch-rate tenant/namespace + +``` + +### `set-replicator-dispatch-rate` +Set replicator message-dispatch-rate for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-replicator-dispatch-rate tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in 
second type (default 1 second will be overwrite if not passed)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-replicator-dispatch-rate` +Get replicator configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-replicator-dispatch-rate tenant/namespace + +``` + +### `set-subscribe-rate` +Set subscribe-rate per consumer for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-subscribe-rate tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-sr`, `--subscribe-rate`|The subscribe rate (default -1 will be overwrite if not passed)|-1| +|`-st`, `--subscribe-rate-period`|The subscribe rate period in second type (default 30 second will be overwrite if not passed)|30| + +### `get-subscribe-rate` +Get configured subscribe-rate per consumer for all topics of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-subscribe-rate tenant/namespace + +``` + +### `set-subscription-dispatch-rate` +Set subscription message-dispatch-rate for all subscription of the namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-subscription-dispatch-rate tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in second type (default 1 second will be overwrite if not passed)|1| +|`-md`, `--sub-msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-subscription-dispatch-rate` +Get subscription configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage + +```bash + +$ pulsar-admin namespaces get-subscription-dispatch-rate tenant/namespace + +``` + +### `clear-backlog` +Clear 
the backlog for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces clear-backlog tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-force`, `--force`|Whether to force clearing the backlog without a prompt|false| +|`-s`, `--sub`|The subscription name|| + + +### `unsubscribe` +Unsubscribe the given subscription on all destinations on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces unsubscribe tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-s`, `--sub`|The subscription name|| + +### `set-encryption-required` +Enable or disable message encryption required for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-encryption-required tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable message encryption required|false| +|`-e`, `--enable`|Enable message encryption required|false| + +### `set-delayed-delivery` +Set the delayed delivery policy on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-delayed-delivery tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable delayed delivery messages|false| +|`-e`, `--enable`|Enable delayed delivery messages|false| +|`-t`, `--time`|The tick time for when retrying on delayed delivery messages|1s| + + +### `get-delayed-delivery` +Get the delayed delivery policy on a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-delayed-delivery tenant/namespace + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-t`, `--time`|The tick time for when retrying on delayed delivery messages|1s| + + +### `set-subscription-auth-mode` +Set subscription auth mode on a namespace + +Usage + +```bash + +$ pulsar-admin 
namespaces set-subscription-auth-mode tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-m`, `--subscription-auth-mode`|Subscription authorization mode for Pulsar policies. Valid options are: [None, Prefix]|| + +### `get-max-producers-per-topic` +Get maxProducersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-producers-per-topic tenant/namespace + +``` + +### `set-max-producers-per-topic` +Set maxProducersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-producers-per-topic tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-p`, `--max-producers-per-topic`|maxProducersPerTopic for a namespace|0| + +### `get-max-consumers-per-topic` +Get maxConsumersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-consumers-per-topic tenant/namespace + +``` + +### `set-max-consumers-per-topic` +Set maxConsumersPerTopic for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-consumers-per-topic tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-topic`|maxConsumersPerTopic for a namespace|0| + +### `get-max-consumers-per-subscription` +Get maxConsumersPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-consumers-per-subscription tenant/namespace + +``` + +### `set-max-consumers-per-subscription` +Set maxConsumersPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-consumers-per-subscription tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-subscription`|maxConsumersPerSubscription for a namespace|0| + +### `get-max-unacked-messages-per-subscription` +Get maxUnackedMessagesPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-unacked-messages-per-subscription 
tenant/namespace + +``` + +### `set-max-unacked-messages-per-subscription` +Set maxUnackedMessagesPerSubscription for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-unacked-messages-per-subscription tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-unacked-messages-per-subscription`|maxUnackedMessagesPerSubscription for a namespace|-1| + +### `get-max-unacked-messages-per-consumer` +Get maxUnackedMessagesPerConsumer for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-max-unacked-messages-per-consumer tenant/namespace + +``` + +### `set-max-unacked-messages-per-consumer` +Set maxUnackedMessagesPerConsumer for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-max-unacked-messages-per-consumer tenant/namespace options + +``` + +Options + +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-unacked-messages-per-consumer`|maxUnackedMessagesPerConsumer for a namespace|-1| + + +### `get-compaction-threshold` +Get compactionThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-compaction-threshold tenant/namespace + +``` + +### `set-compaction-threshold` +Set compactionThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-compaction-threshold tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-t`, `--threshold`|Maximum number of bytes in a topic backlog before compaction is triggered (eg: 10M, 16G, 3T). 
0 disables automatic compaction|0| + + +### `get-offload-threshold` +Get offloadThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-threshold tenant/namespace + +``` + +### `set-offload-threshold` +Set offloadThreshold for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-offload-threshold tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|Maximum number of bytes stored in the pulsar cluster for a topic before data will start being automatically offloaded to longterm storage (eg: 10M, 16G, 3T, 100). Negative values disable automatic offload. 0 triggers offloading as soon as possible.|-1| + +### `get-offload-deletion-lag` +Get offloadDeletionLag, in minutes, for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-deletion-lag tenant/namespace + +``` + +### `set-offload-deletion-lag` +Set offloadDeletionLag for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-offload-deletion-lag tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-l`, `--lag`|Duration to wait after offloading a ledger segment, before deleting the copy of that segment from cluster local storage. 
(eg: 10m, 5h, 3d, 2w).|-1| + +### `clear-offload-deletion-lag` +Clear offloadDeletionLag for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces clear-offload-deletion-lag tenant/namespace + +``` + +### `get-schema-autoupdate-strategy` +Get the schema auto-update strategy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces get-schema-autoupdate-strategy tenant/namespace + +``` + +### `set-schema-autoupdate-strategy` +Set the schema auto-update strategy for a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-schema-autoupdate-strategy tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--compatibility`|Compatibility level required for new schemas created via a Producer. Possible values (Full, Backward, Forward, None).|Full| +|`-d`, `--disabled`|Disable automatic schema updates.|false| + +### `get-publish-rate` +Get the message publish rate for each topic in a namespace, in bytes as well as messages per second + +Usage + +```bash + +$ pulsar-admin namespaces get-publish-rate tenant/namespace + +``` + +### `set-publish-rate` +Set the message publish rate for each topic in a namespace + +Usage + +```bash + +$ pulsar-admin namespaces set-publish-rate tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-m`, `--msg-publish-rate`|Threshold for number of messages per second per topic in the namespace (-1 implies not set, 0 for no limit).|-1| +|`-b`, `--byte-publish-rate`|Threshold for number of bytes per second per topic in the namespace (-1 implies not set, 0 for no limit).|-1| + +### `set-offload-policies` +Set the offload policy for a namespace. 
+ +Usage + +```bash + +$ pulsar-admin namespaces set-offload-policies tenant/namespace options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-d`, `--driver`|Driver to use to offload old data to long term storage,(Possible values: S3, aws-s3, google-cloud-storage)|| +|`-r`, `--region`|The long term storage region|| +|`-b`, `--bucket`|Bucket to place offloaded ledger into|| +|`-e`, `--endpoint`|Alternative endpoint to connect to|| +|`-i`, `--aws-id`|AWS Credential Id to use when using driver S3 or aws-s3|| +|`-s`, `--aws-secret`|AWS Credential Secret to use when using driver S3 or aws-s3|| +|`-ro`, `--s3-role`|S3 Role used for STSAssumeRoleSessionCredentialsProvider using driver S3 or aws-s3|| +|`-rsn`, `--s3-role-session-name`|S3 role session name used for STSAssumeRoleSessionCredentialsProvider using driver S3 or aws-s3|| +|`-mbs`, `--maxBlockSize`|Max block size|64MB| +|`-rbs`, `--readBufferSize`|Read buffer size|1MB| +|`-oat`, `--offloadAfterThreshold`|Offload after threshold size (eg: 1M, 5M)|| +|`-oae`, `--offloadAfterElapsed`|Offload after elapsed in millis (or minutes, hours,days,weeks eg: 100m, 3h, 2d, 5w).|| + +### `get-offload-policies` +Get the offload policy for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces get-offload-policies tenant/namespace + +``` + +### `set-max-subscriptions-per-topic` +Set the maximum subscription per topic for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces set-max-subscriptions-per-topic tenant/namespace + +``` + +### `get-max-subscriptions-per-topic` +Get the maximum subscription per topic for a namespace. + +Usage + +```bash + +$ pulsar-admin namespaces get-max-subscriptions-per-topic tenant/namespace + +``` + +### `remove-max-subscriptions-per-topic` +Remove the maximum subscription per topic for a namespace. 
+ +Usage + +```bash + +$ pulsar-admin namespaces remove-max-subscriptions-per-topic tenant/namespace + +``` + +## `ns-isolation-policy` +Operations for managing namespace isolation policies. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy subcommand + +``` + +Subcommands +* `set` +* `get` +* `list` +* `delete` +* `brokers` +* `broker` + +### `set` +Create/update a namespace isolation policy for a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy set cluster-name policy-name options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--auto-failover-policy-params`|Comma-separated name=value auto failover policy parameters|[]| +|`--auto-failover-policy-type`|Auto failover policy type name. Currently available options: min_available.|[]| +|`--namespaces`|Comma-separated namespaces regex list|[]| +|`--primary`|Comma-separated primary broker regex list|[]| +|`--secondary`|Comma-separated secondary broker regex list|[]| + + +### `get` +Get the namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy get cluster-name policy-name + +``` + +### `list` +List all namespace isolation policies of a cluster. This operation requires Pulsar superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy list cluster-name + +``` + +### `delete` +Delete namespace isolation policy of a cluster. This operation requires superuser privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy delete + +``` + +### `brokers` +List all brokers with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges. + +Usage + +```bash + +$ pulsar-admin ns-isolation-policy brokers cluster-name + +``` + +### `broker` +Get broker with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges. 
+ +Usage + +```bash + +$ pulsar-admin ns-isolation-policy broker cluster-name options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--broker`|Broker name to get namespace-isolation policies attached to it|| + +## `topics` +Operations for managing Pulsar topics (both persistent and non-persistent). + +Usage + +```bash + +$ pulsar-admin topics subcommand + +``` + +From Pulsar 2.7.0, some namespace-level policies are available on topic level. To enable topic-level policy in Pulsar, you need to configure the following parameters in the `broker.conf` file. + +```shell + +systemTopicEnabled=true +topicLevelPoliciesEnabled=true + +``` + +Subcommands +* `compact` +* `compaction-status` +* `offload` +* `offload-status` +* `create-partitioned-topic` +* `create-missed-partitions` +* `delete-partitioned-topic` +* `create` +* `get-partitioned-topic-metadata` +* `update-partitioned-topic` +* `list-partitioned-topics` +* `list` +* `terminate` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `lookup` +* `bundle-range` +* `delete` +* `unload` +* `create-subscription` +* `subscriptions` +* `unsubscribe` +* `stats` +* `stats-internal` +* `info-internal` +* `partitioned-stats` +* `partitioned-stats-internal` +* `skip` +* `clear-backlog` +* `expire-messages` +* `expire-messages-all-subscriptions` +* `peek-messages` +* `reset-cursor` +* `get-message-by-id` +* `last-message-id` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` +* `set-persistence` +* `remove-persistence` +* `get-message-ttl` +* `set-message-ttl` +* `remove-message-ttl` +* `get-deduplication` +* `set-deduplication` +* `remove-deduplication` +* `get-retention` +* `set-retention` +* `remove-retention` +* `get-dispatch-rate` +* `set-dispatch-rate` +* `remove-dispatch-rate` +* `get-max-unacked-messages-per-subscription` +* `set-max-unacked-messages-per-subscription` +* `remove-max-unacked-messages-per-subscription` +* 
`get-max-unacked-messages-per-consumer` +* `set-max-unacked-messages-per-consumer` +* `remove-max-unacked-messages-per-consumer` +* `get-delayed-delivery` +* `set-delayed-delivery` +* `remove-delayed-delivery` +* `get-max-producers` +* `set-max-producers` +* `remove-max-producers` +* `get-max-consumers` +* `set-max-consumers` +* `remove-max-consumers` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `remove-compaction-threshold` +* `get-offload-policies` +* `set-offload-policies` +* `remove-offload-policies` +* `get-inactive-topic-policies` +* `set-inactive-topic-policies` +* `remove-inactive-topic-policies` +* `set-max-subscriptions` +* `get-max-subscriptions` +* `remove-max-subscriptions` + +### `compact` +Run compaction on the specified topic (persistent topics only) + +Usage + +``` + +$ pulsar-admin topics compact persistent://tenant/namespace/topic + +``` + +### `compaction-status` +Check the status of a topic compaction (persistent topics only) + +Usage + +```bash + +$ pulsar-admin topics compaction-status persistent://tenant/namespace/topic + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-w`, `--wait-complete`|Wait for compaction to complete|false| + + +### `offload` +Trigger offload of data from a topic to long-term storage (e.g. Amazon S3) + +Usage + +```bash + +$ pulsar-admin topics offload persistent://tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--size-threshold`|The maximum amount of data to keep in BookKeeper for the specific topic|| + + +### `offload-status` +Check the status of data offloading from a topic to long-term storage + +Usage + +```bash + +$ pulsar-admin topics offload-status persistent://tenant/namespace/topic op + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-w`, `--wait-complete`|Wait for compaction to complete|false| + + +### `create-partitioned-topic` +Create a partitioned topic. 
A partitioned topic must be created before producers can publish to it. + +:::note + +By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. +To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +For more information about these two parameters, see [here](reference-configuration.md#broker). + +::: + +Usage + +```bash + +$ pulsar-admin topics create-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `create-missed-partitions` +Try to create partitions for partitioned topic. The partitions of partition topic has to be created, +can be used by repair partitions when topic auto creation is disabled + +Usage + +```bash + +$ pulsar-admin topics create-missed-partitions persistent://tenant/namespace/topic + +``` + +### `delete-partitioned-topic` +Delete a partitioned topic. This will also delete all the partitions of the topic if they exist. + +Usage + +```bash + +$ pulsar-admin topics delete-partitioned-topic {persistent|non-persistent} + +``` + +### `create` +Creates a non-partitioned topic. A non-partitioned topic must explicitly be created by the user if allowAutoTopicCreation or createIfMissing is disabled. + +:::note + +By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. +To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +For more information about these two parameters, see [here](reference-configuration.md#broker). 
+ +::: + +Usage + +```bash + +$ pulsar-admin topics create {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `get-partitioned-topic-metadata` +Get the partitioned topic metadata. If the topic is not created or is a non-partitioned topic, this will return an empty topic with zero partitions. + +Usage + +```bash + +$ pulsar-admin topics get-partitioned-topic-metadata {persistent|non-persistent}://tenant/namespace/topic + +``` + +### `update-partitioned-topic` +Update existing non-global partitioned topic. New updating number of partitions must be greater than existing number of partitions. + +Usage + +```bash + +$ pulsar-admin topics update-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `list-partitioned-topics` +Get the list of partitioned topics under a namespace. + +Usage + +```bash + +$ pulsar-admin topics list-partitioned-topics tenant/namespace + +``` + +### `list` +Get the list of topics under a namespace + +Usage + +``` + +$ pulsar-admin topics list tenant/cluster/namespace + +``` + +### `terminate` +Terminate a persistent topic (disallow further messages from being published on the topic) + +Usage + +```bash + +$ pulsar-admin topics terminate persistent://tenant/namespace/topic + +``` + +### `permissions` +Get the permissions on a topic. Retrieve the effective permissions for a destination. These permissions are defined by the permissions set at the namespace level combined (union) with any eventual specific permissions set on the topic. 
+ +Usage + +```bash + +$ pulsar-admin topics permissions topic + +``` + +### `grant-permission` +Grant a new permission to a client role on a single topic + +Usage + +```bash + +$ pulsar-admin topics grant-permission {persistent|non-persistent}://tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions to a client role on a single topic. If the permission was not set at the topic level, but rather at the namespace level, this operation will return an error (HTTP status code 412). + +Usage + +```bash + +$ pulsar-admin topics revoke-permission topic + +``` + +### `lookup` +Look up a topic from the current serving broker + +Usage + +```bash + +$ pulsar-admin topics lookup topic + +``` + +### `bundle-range` +Get the namespace bundle which contains the given topic + +Usage + +```bash + +$ pulsar-admin topics bundle-range topic + +``` + +### `delete` +Delete a topic. The topic cannot be deleted if there are any active subscriptions or producers connected to the topic. + +Usage + +```bash + +$ pulsar-admin topics delete topic + +``` + +### `unload` +Unload a topic + +Usage + +```bash + +$ pulsar-admin topics unload topic + +``` + +### `create-subscription` +Create a new subscription on a topic. + +Usage + +```bash + +$ pulsar-admin topics create-subscription [options] persistent://tenant/namespace/topic + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-m`, `--messageId`|messageId where to create the subscription. 
It can be either 'latest', 'earliest' or (ledgerId:entryId)|latest| +|`-s`, `--subscription`|Subscription to reset position on|| + +### `subscriptions` +Get the list of subscriptions on the topic + +Usage + +```bash + +$ pulsar-admin topics subscriptions topic + +``` + +### `unsubscribe` +Delete a durable subscriber from a topic + +Usage + +```bash + +$ pulsar-admin topics unsubscribe topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to delete|| +|`-f`, `--force`|Disconnect and close all consumers and delete subscription forcefully|false| + + +### `stats` +Get the stats for the topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage + +```bash + +$ pulsar-admin topics stats topic + +``` + +:::note + +The unit of `storageSize` and `averageMsgSize` is Byte. + +::: + +### `stats-internal` +Get the internal stats for the topic + +Usage + +```bash + +$ pulsar-admin topics stats-internal topic + +``` + +### `info-internal` +Get the internal metadata info for the topic + +Usage + +```bash + +$ pulsar-admin topics info-internal topic + +``` + +### `partitioned-stats` +Get the stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage + +```bash + +$ pulsar-admin topics partitioned-stats topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--per-partition`|Get per-partition stats|false| + +### `partitioned-stats-internal` +Get the internal stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. 
+ +Usage + +```bash + +$ pulsar-admin topics partitioned-stats-internal topic + +``` + +### `skip` +Skip some messages for the subscription + +Usage + +```bash + +$ pulsar-admin topics skip topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages to skip|0| +|`-s`, `--subscription`|The subscription on which to skip messages|| + + +### `clear-backlog` +Clear backlog (skip all the messages) for the subscription + +Usage + +```bash + +$ pulsar-admin topics clear-backlog topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to clear|| + + +### `expire-messages` +Expire messages that are older than the given expiry time (in seconds) for the subscription. + +Usage + +```bash + +$ pulsar-admin topics expire-messages topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| +|`-s`, `--subscription`|The subscription to skip messages on|| + + +### `expire-messages-all-subscriptions` +Expire messages older than the given expiry time (in seconds) for all subscriptions + +Usage + +```bash + +$ pulsar-admin topics expire-messages-all-subscriptions topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| + + +### `peek-messages` +Peek some messages for the subscription. + +Usage + +```bash + +$ pulsar-admin topics peek-messages topic options + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages|0| +|`-s`, `--subscription`|Subscription to get messages from|| + + +### `reset-cursor` +Reset position for subscription to a position that is closest to timestamp or messageId. 
+ +Usage + +```bash + +$ pulsar-admin topics reset-cursor topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|Subscription to reset position on|| +|`-t`, `--time`|The time in minutes to reset back to (or minutes, hours, days, weeks, etc.). Examples: `100m`, `3h`, `2d`, `5w`.|| +|`-m`, `--messageId`| The messageId to reset back to (ledgerId:entryId). || + +### `get-message-by-id` +Get message by ledger id and entry id + +Usage + +```bash + +$ pulsar-admin topics get-message-by-id topic options + +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-l`, `--ledgerId`|The ledger id |0| +|`-e`, `--entryId`|The entry id |0| + +### `last-message-id` +Get the last commit message ID of the topic. + +Usage + +```bash + +$ pulsar-admin topics last-message-id persistent://tenant/namespace/topic + +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-backlog-quotas tenant/namespace/topic + +``` + +### `set-backlog-quota` +Set a backlog quota policy for a topic. + +|Flag|Description|Default| +|----|---|---| +|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)|| +|`-lt`, `--limitTime`|Time limit in seconds; a non-positive number disables the time limit (for example, 3600 for 1 hour)|| +|`-p`, `--policy`|The retention policy to enforce when the limit is reached. The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`|| +|`-t`, `--type`|Backlog quota type to set. 
The valid options are: `destination_storage`, `message_age` |destination_storage| + +Usage + +```bash + +$ pulsar-admin topics set-backlog-quota tenant/namespace/topic options + +``` + +Example + +```bash + +$ pulsar-admin topics set-backlog-quota my-tenant/my-ns/my-topic \ +--limit 2G \ +--policy producer_request_hold + +``` + +```bash + +$ pulsar-admin topics set-backlog-quota my-tenant/my-ns/my-topic \ +--limitTime 3600 \ +--policy producer_request_hold \ +--type message_age + +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a topic. + +|Flag|Description|Default| +|---|---|---| +|`-t`, `--type`|Backlog quota type to remove. The valid options are: `destination_storage`, `message_age` |destination_storage| + +Usage + +```bash + +$ pulsar-admin topics remove-backlog-quota tenant/namespace/topic + +``` + +### `get-persistence` +Get the persistence policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-persistence tenant/namespace/topic + +``` + +### `set-persistence` +Set the persistence policies for a topic. + +Usage + +```bash + +$ pulsar-admin topics set-persistence tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-e`, `--bookkeeper-ensemble`|Number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-a`, `--bookkeeper-ack-quorum`|Number of acks (guaranteed copies) to wait for each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + +### `remove-persistence` +Remove the persistence policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics remove-persistence tenant/namespace/topic + +``` + +### `get-message-ttl` +Get the message TTL for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-message-ttl tenant/namespace/topic + +``` + +### `set-message-ttl` +Set the message TTL for a topic. 
+ +Usage + +```bash + +$ pulsar-admin topics set-message-ttl tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL for a topic in second, allowed range from 1 to `Integer.MAX_VALUE` |0| + +### `remove-message-ttl` +Remove the message TTL for a topic. + +Usage + +```bash + +$ pulsar-admin topics remove-message-ttl tenant/namespace/topic + +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified topic.|false| +|`--disable`, `-d`|Disable message deduplication on the specified topic.|false| + +### `get-deduplication` +Get a deduplication policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics get-deduplication tenant/namespace/topic + +``` + +### `set-deduplication` +Set a deduplication policy for a topic. + +Usage + +```bash + +$ pulsar-admin topics set-deduplication tenant/namespace/topic options + +``` + +### `remove-deduplication` +Remove a deduplication policy for a topic. 
+ +Usage + +```bash + +$ pulsar-admin topics remove-deduplication tenant/namespace/topic + +``` + +## `tenants` +Operations for managing tenants + +Usage + +```bash + +$ pulsar-admin tenants subcommand + +``` + +Subcommands +* `list` +* `get` +* `create` +* `update` +* `delete` + +### `list` +List the existing tenants + +Usage + +```bash + +$ pulsar-admin tenants list + +``` + +### `get` +Gets the configuration of a tenant + +Usage + +```bash + +$ pulsar-admin tenants get tenant-name + +``` + +### `create` +Creates a new tenant + +Usage + +```bash + +$ pulsar-admin tenants create tenant-name options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + +### `update` +Updates a tenant + +Usage + +```bash + +$ pulsar-admin tenants update tenant-name options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + + +### `delete` +Deletes an existing tenant + +Usage + +```bash + +$ pulsar-admin tenants delete tenant-name + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-f`, `--force`|Delete a tenant forcefully by deleting all namespaces under it.|false| + + +## `resource-quotas` +Operations for managing resource quotas + +Usage + +```bash + +$ pulsar-admin resource-quotas subcommand + +``` + +Subcommands +* `get` +* `set` +* `reset-namespace-bundle-quota` + + +### `get` +Get the resource quota for a specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage + +```bash + +$ pulsar-admin resource-quotas get options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. 
This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + +### `set` +Set the resource quota for the specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage + +```bash + +$ pulsar-admin resource-quotas set options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bi`, `--bandwidthIn`|The expected inbound bandwidth (in bytes/second)|0| +|`-bo`, `--bandwidthOut`|Expected outbound bandwidth (in bytes/second)|0| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-d`, `--dynamic`|Allow to be dynamically re-calculated (or not)|false| +|`-mem`, `--memory`|Expected memory usage (in megabytes)|0| +|`-mi`, `--msgRateIn`|Expected incoming messages per second|0| +|`-mo`, `--msgRateOut`|Expected outgoing messages per second|0| +|`-n`, `--namespace`|The namespace as tenant/namespace, for example my-tenant/my-ns. Must be specified together with -b/--bundle.|| + + +### `reset-namespace-bundle-quota` +Reset the specified namespace bundle's resource quota to a default value. + +Usage + +```bash + +$ pulsar-admin resource-quotas reset-namespace-bundle-quota options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + + +## `schemas` +Operations related to Schemas associated with Pulsar topics. + +Usage + +``` + +$ pulsar-admin schemas subcommand + +``` + +Subcommands +* `upload` +* `delete` +* `get` +* `extract` + + +### `upload` +Upload the schema definition for a topic + +Usage + +```bash + +$ pulsar-admin schemas upload persistent://tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--filename`|The path to the schema definition file. 
An example schema file is available under conf directory.|| + + +### `delete` +Delete the schema definition associated with a topic + +Usage + +```bash + +$ pulsar-admin schemas delete persistent://tenant/namespace/topic + +``` + +### `get` +Retrieve the schema definition associated with a topic (at a given version if version is supplied). + +Usage + +```bash + +$ pulsar-admin schemas get persistent://tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--version`|The version of the schema definition to retrieve for a topic.|| + +### `extract` +Provide the schema definition for a topic via Java class name contained in a JAR file + +Usage + +```bash + +$ pulsar-admin schemas extract persistent://tenant/namespace/topic options + +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--classname`|The Java class name|| +|`-j`, `--jar`|A path to the JAR file which contains the above Java class|| +|`-t`, `--type`|The type of the schema (avro or json)|| diff --git a/site2/website/versioned_docs/version-2.9.x/reference-rest-api-overview.md b/site2/website/versioned_docs/version-2.9.x/reference-rest-api-overview.md new file mode 100644 index 0000000000000..4bdcf23483a2b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/reference-rest-api-overview.md @@ -0,0 +1,18 @@ +--- +id: reference-rest-api-overview +title: Pulsar REST APIs +sidebar_label: "Pulsar REST APIs" +--- + +A REST API (also known as RESTful API, REpresentational State Transfer Application Programming Interface) is a set of definitions and protocols for building and integrating application software, using HTTP requests to GET, PUT, POST, and DELETE data following the REST standards. In essence, REST API is a set of remote calls using standard methods to request and return data in a specific format between two systems. + +Pulsar provides a variety of REST APIs that enable you to interact with Pulsar to retrieve information or perform an action. 
+ +| REST API category | Description | +| --- | --- | +| [Admin](https://pulsar.apache.org/admin-rest-api/?version=master) | REST APIs for administrative operations.| +| [Functions](https://pulsar.apache.org/functions-rest-api/?version=master) | REST APIs for function-specific operations.| +| [Sources](https://pulsar.apache.org/source-rest-api/?version=master) | REST APIs for source-specific operations.| +| [Sinks](https://pulsar.apache.org/sink-rest-api/?version=master) | REST APIs for sink-specific operations.| +| [Packages](https://pulsar.apache.org/packages-rest-api/?version=master) | REST APIs for package-specific operations. A package can be a group of functions, sources, and sinks.| + diff --git a/site2/website/versioned_docs/version-2.9.x/reference-terminology.md b/site2/website/versioned_docs/version-2.9.x/reference-terminology.md new file mode 100644 index 0000000000000..e5099141c3231 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/reference-terminology.md @@ -0,0 +1,176 @@ +--- +id: reference-terminology +title: Pulsar Terminology +sidebar_label: "Terminology" +original_id: reference-terminology +--- + +Here is a glossary of terms related to Apache Pulsar: + +### Concepts + +#### Pulsar + +Pulsar is a distributed messaging system originally created by Yahoo but now under the stewardship of the Apache Software Foundation. + +#### Message + +Messages are the basic unit of Pulsar. They're what [producers](#producer) publish to [topics](#topic) +and what [consumers](#consumer) then consume from topics. + +#### Topic + +A named channel used to pass messages published by [producers](#producer) to [consumers](#consumer) who +process those [messages](#message). + +#### Partitioned Topic + +A topic that is served by multiple Pulsar [brokers](#broker), which enables higher throughput. + +#### Namespace + +A grouping mechanism for related [topics](#topic). 
+ +#### Namespace Bundle + +A virtual group of [topics](#topic) that belong to the same [namespace](#namespace). A namespace bundle +is defined as a range between two 32-bit hashes, such as 0x00000000 and 0xffffffff. + +#### Tenant + +An administrative unit for allocating capacity and enforcing an authentication/authorization scheme. + +#### Subscription + +A lease on a [topic](#topic) established by a group of [consumers](#consumer). Pulsar has four subscription +modes (exclusive, shared, failover and key_shared). + +#### Pub-Sub + +A messaging pattern in which [producer](#producer) processes publish messages on [topics](#topic) that +are then consumed (processed) by [consumer](#consumer) processes. + +#### Producer + +A process that publishes [messages](#message) to a Pulsar [topic](#topic). + +#### Consumer + +A process that establishes a subscription to a Pulsar [topic](#topic) and processes messages published +to that topic by [producers](#producer). + +#### Reader + +Pulsar readers are message processors much like Pulsar [consumers](#consumer) but with two crucial differences: + +- you can specify *where* on a topic readers begin processing messages (consumers always begin with the latest + available unacked message); +- readers don't retain data or acknowledge messages. + +#### Cursor + +The subscription position for a [consumer](#consumer). + +#### Acknowledgment (ack) + +A message sent to a Pulsar broker by a [consumer](#consumer) to indicate that a message has been successfully processed. +An acknowledgement (ack) is Pulsar's way of knowing that the message can be deleted from the system; +if there is no acknowledgement, then the message will be retained until it's processed. + +#### Negative Acknowledgment (nack) + +When an application fails to process a particular message, it can send a "negative ack" to Pulsar +to signal that the message should be replayed at a later time. (By default, failed messages are +replayed after a 1 minute delay). 
Be aware that negative acknowledgment on ordered subscription types, +such as Exclusive, Failover and Key_Shared, can cause failed messages to arrive at consumers out of the original order. + +#### Unacknowledged + +A message that has been delivered to a consumer for processing but not yet confirmed as processed by the consumer. + +#### Retention Policy + +Size and time limits that you can set on a [namespace](#namespace) to configure retention of [messages](#message) +that have already been [acknowledged](#acknowledgment-ack). + +#### Multi-Tenancy + +The ability to isolate [namespaces](#namespace), specify quotas, and configure authentication and authorization +on a per-[tenant](#tenant) basis. + +#### Failure Domain + +A logical domain under a Pulsar cluster. Each logical domain contains a pre-configured list of brokers. + +#### Anti-affinity Namespaces + +A group of namespaces that have anti-affinity to each other. + +### Architecture + +#### Standalone + +A lightweight Pulsar broker in which all components run in a single Java Virtual Machine (JVM) process. Standalone +clusters can be run on a single machine and are useful for development purposes. + +#### Cluster + +A set of Pulsar [brokers](#broker) and [BookKeeper](#bookkeeper) servers (aka [bookies](#bookie)). +Clusters can reside in different geographical regions and replicate messages to one another +in a process called [geo-replication](#geo-replication). + +#### Instance + +A group of Pulsar [clusters](#cluster) that act together as a single unit. + +#### Geo-Replication + +Replication of messages across Pulsar [clusters](#cluster), potentially in different datacenters +or geographical regions. + +#### Configuration Store + +Pulsar's configuration store (previously known as global ZooKeeper) is a ZooKeeper quorum that +is used for configuration-specific tasks. A multi-cluster Pulsar installation requires just one +configuration store across all [clusters](#cluster). 
+ +#### Topic Lookup + +A service provided by Pulsar [brokers](#broker) that enables connecting clients to automatically determine +which Pulsar [cluster](#cluster) is responsible for a [topic](#topic) (and thus where message traffic for +the topic needs to be routed). + +#### Service Discovery + +A mechanism provided by Pulsar that enables connecting clients to use just a single URL to interact +with all the [brokers](#broker) in a [cluster](#cluster). + +#### Broker + +A stateless component of Pulsar [clusters](#cluster) that runs two other components: an HTTP server +exposing a REST interface for administration and topic lookup and a [dispatcher](#dispatcher) that +handles all message transfers. Pulsar clusters typically consist of multiple brokers. + +#### Dispatcher + +An asynchronous TCP server used for all data transfers in-and-out a Pulsar [broker](#broker). The Pulsar +dispatcher uses a custom binary protocol for all communications. + +### Storage + +#### BookKeeper + +[Apache BookKeeper](http://bookkeeper.apache.org/) is a scalable, low-latency persistent log storage +service that Pulsar uses to store data. + +#### Bookie + +Bookie is the name of an individual BookKeeper server. It is effectively the storage server of Pulsar. + +#### Ledger + +An append-only data structure in [BookKeeper](#bookkeeper) that is used to persistently store messages in Pulsar [topics](#topic). + +### Functions + +Pulsar Functions are lightweight functions that can consume messages from Pulsar topics, apply custom processing logic, and, if desired, publish results to topics. 
diff --git a/site2/website/versioned_docs/version-2.9.x/schema-evolution-compatibility.md b/site2/website/versioned_docs/version-2.9.x/schema-evolution-compatibility.md new file mode 100644 index 0000000000000..3e78429df69da --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/schema-evolution-compatibility.md @@ -0,0 +1,201 @@ +--- +id: schema-evolution-compatibility +title: Schema evolution and compatibility +sidebar_label: "Schema evolution and compatibility" +original_id: schema-evolution-compatibility +--- + +Normally, schemas do not stay the same over a long period of time. Instead, they undergo evolutions to satisfy new needs. + +This chapter examines how Pulsar schema evolves and what Pulsar schema compatibility check strategies are. + +## Schema evolution + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +Each `SchemaInfo` stored with a topic has a version. The version is used to manage the schema changes happening within a topic. + +The message produced with `SchemaInfo` is tagged with a schema version. When a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and use the correct schema information to deserialize data. + +### What is schema evolution? + +Schemas store the details of attributes and types. To satisfy new business requirements, you need to update schemas inevitably over time, which is called **schema evolution**. + +Any schema changes affect downstream consumers. Schema evolution ensures that the downstream consumers can seamlessly handle data encoded with both old schemas and new schemas. + +### How Pulsar schema should evolve? + +The answer is Pulsar schema compatibility check strategy. It determines how schema compares old schemas with new schemas in topics. + +For more information, see [Schema compatibility check strategy](#schema-compatibility-check-strategy). + +### How does Pulsar support schema evolution? + +1. 
When a producer/consumer/reader connects to a broker, the broker deploys the schema compatibility checker configured by `schemaRegistryCompatibilityCheckers` to enforce schema compatibility check. + + The schema compatibility checker is one instance per schema type. + + Currently, Avro and JSON have their own compatibility checkers, while all the other schema types share the default compatibility checker which disables schema evolution. + +2. The producer/consumer/reader sends its client `SchemaInfo` to the broker. + +3. The broker knows the schema type and locates the schema compatibility checker for that type. + +4. The broker uses the checker to check if the `SchemaInfo` is compatible with the latest schema of the topic by applying its compatibility check strategy. + + Currently, the compatibility check strategy is configured at the namespace level and applied to all the topics within that namespace. + +## Schema compatibility check strategy + +Pulsar has 8 schema compatibility check strategies, which are summarized in the following table. + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Changes allowed | Check against which schema | Upgrade first | +| --- | --- | --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Disable schema compatibility check. | All changes are allowed | All previous versions | Any order | +| `ALWAYS_INCOMPATIBLE` | Disable schema evolution. | All changes are disabled | None | None | +| `BACKWARD` | Consumers using the schema V3 can process data written by producers using the schema V3 or V2. |
  • Add optional fields
  • Delete fields
  • | Latest version | Consumers | +| `BACKWARD_TRANSITIVE` | Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. |
  • Add optional fields
  • Delete fields
  • | All previous versions | Consumers | +| `FORWARD` | Consumers using the schema V3 or V2 can process data written by producers using the schema V3. |
  • Add fields
  • Delete optional fields
  • | Latest version | Producers | +| `FORWARD_TRANSITIVE` | Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. |
  • Add fields
  • Delete optional fields
  • | All previous versions | Producers | +| `FULL` | Backward and forward compatible between the schema V3 and V2. |
  • Modify optional fields
  • | Latest version | Any order | +| `FULL_TRANSITIVE` | Backward and forward compatible among the schema V3, V2, and V1. |
  • Modify optional fields
  • | All previous versions | Any order | + +### ALWAYS_COMPATIBLE and ALWAYS_INCOMPATIBLE + +| Compatibility check strategy | Definition | Note | +| --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Disable schema compatibility check. | None | +| `ALWAYS_INCOMPATIBLE` | Disable schema evolution, that is, any schema change is rejected. |
  • For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`.
  • For Avro and JSON, the default schema compatibility check strategy is `FULL`.
  • | + +#### Example + +* Example 1 + + In some situations, an application needs to store events of several different types in the same Pulsar topic. + + In particular, when developing a data model in an `Event Sourcing` style, you might have several kinds of events that affect the state of an entity. + + For example, for a user entity, there are `userCreated`, `userAddressChanged` and `userEnquiryReceived` events. The application requires that those events are always read in the same order. + + Consequently, those events need to go in the same Pulsar partition to maintain order. This application can use `ALWAYS_COMPATIBLE` to allow different kinds of events co-exist in the same topic. + +* Example 2 + + Sometimes we also make incompatible changes. + + For example, you are modifying a field type from `string` to `int`. + + In this case, you need to: + + * Upgrade all producers and consumers to the new schema versions at the same time. + + * Optionally, create a new topic and start migrating applications to use the new topic and the new schema, avoiding the need to handle two incompatible versions in the same topic. + +### BACKWARD and BACKWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`BACKWARD` | Consumers using the new schema can process data written by producers using the **last schema**. | The consumers using the schema V3 can process data written by producers using the schema V3 or V2. | +`BACKWARD_TRANSITIVE` | Consumers using the new schema can process data written by producers using **all previous schemas**. | The consumers using the schema V3 can process data written by producers using the schema V3, V2, or V1. | + +#### Example + +* Example 1 + + Remove a field. 
+ + A consumer constructed to process events without one field can process events written with the old schema containing the field, and the consumer will ignore that field. + +* Example 2 + + You want to load all Pulsar data into a Hive data warehouse and run SQL queries against the data. + + The same SQL queries must continue to work even if the data is changed. To support it, you can evolve the schemas using the `BACKWARD` strategy. + +### FORWARD and FORWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`FORWARD` | Consumers using the **last schema** can process data written by producers using a new schema, even though they may not be able to use the full capabilities of the new schema. | The consumers using the schema V3 or V2 can process data written by producers using the schema V3. | +`FORWARD_TRANSITIVE` | Consumers using **all previous schemas** can process data written by producers using a new schema. | The consumers using the schema V3, V2, or V1 can process data written by producers using the schema V3. | + +#### Example + +* Example 1 + + Add a field. + + In most data formats, consumers written to process events without new fields can continue doing so even when they receive new events containing new fields. + +* Example 2 + + If a consumer has an application logic tied to a full version of a schema, the application logic may not be updated instantly when the schema evolves. + + In this case, you need to project data with a new schema onto an old schema that the application understands. + + Consequently, you can evolve the schemas using the `FORWARD` strategy to ensure that the old schema can process data encoded with the new schema. 
+ +### FULL and FULL_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | Note | +| --- | --- | --- | --- | +| `FULL` | Schemas are both backward and forward compatible, which means: Consumers using the last schema can process data written by producers using the new schema. AND Consumers using the new schema can process data written by producers using the last schema. | Consumers using the schema V3 can process data written by producers using the schema V3 or V2. AND Consumers using the schema V3 or V2 can process data written by producers using the schema V3. |
  • For Avro and JSON, the default schema compatibility check strategy is `FULL`.
  • For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`.
  • | +| `FULL_TRANSITIVE` | The new schema is backward and forward compatible with all previously registered schemas. | Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. AND Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. | None | + +#### Example + +In some data formats, for example, Avro, you can define fields with default values. Consequently, adding or removing a field with a default value is a fully compatible change. + +## Schema verification + +When a producer or a consumer tries to connect to a topic, a broker performs some checks to verify a schema. + +### Producer + +When a producer tries to connect to a topic (assuming that schema auto-creation is ignored), a broker does the following checks: + +* Check if the schema carried by the producer exists in the schema registry or not. + + * If the schema is already registered, then the producer is connected to a broker and produces messages with that schema. + + * If the schema is not registered, then Pulsar verifies if the schema is allowed to be registered based on the configured compatibility check strategy. + +### Consumer +When a consumer tries to connect to a topic, a broker checks if a carried schema is compatible with a registered schema based on the configured schema compatibility check strategy. + +| Compatibility check strategy | Check logic | +| --- | --- | +| `ALWAYS_COMPATIBLE` | All pass | +| `ALWAYS_INCOMPATIBLE` | No pass | +| `BACKWARD` | Can read the last schema | +| `BACKWARD_TRANSITIVE` | Can read all schemas | +| `FORWARD` | Can read the last schema | +| `FORWARD_TRANSITIVE` | Can read the last schema | +| `FULL` | Can read the last schema | +| `FULL_TRANSITIVE` | Can read all schemas | + +## Order of upgrading clients + +The order of upgrading client applications is determined by the compatibility check strategy. 
+ +For example, the producers use schemas to write data to Pulsar and the consumers use schemas to read data from Pulsar. + +| Compatibility check strategy | Upgrade first | Description | +| --- | --- | --- | +| `ALWAYS_COMPATIBLE` | Any order | The compatibility check is disabled. Consequently, you can upgrade the producers and consumers in **any order**. | +| `ALWAYS_INCOMPATIBLE` | None | The schema evolution is disabled. | +|
  • `BACKWARD`
  • `BACKWARD_TRANSITIVE`
  • | Consumers | There is no guarantee that consumers using the old schema can read data produced using the new schema. Consequently, **upgrade all consumers first**, and then start producing new data. | +|
  • `FORWARD`
  • `FORWARD_TRANSITIVE`
  • | Producers | There is no guarantee that consumers using the new schema can read data produced using the old schema. Consequently, **upgrade all producers first**
  • to use the new schema and ensure that the data already produced using the old schemas are not available to consumers, and then upgrade the consumers.
  • | +|
  • `FULL`
  • `FULL_TRANSITIVE`
  • | Any order | It is guaranteed that consumers using the old schema can read data produced using the new schema and consumers using the new schema can read data produced using the old schema. Consequently, you can upgrade the producers and consumers in **any order**. | + + + + diff --git a/site2/website/versioned_docs/version-2.9.x/schema-get-started.md b/site2/website/versioned_docs/version-2.9.x/schema-get-started.md new file mode 100644 index 0000000000000..afacb0fa51f2e --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/schema-get-started.md @@ -0,0 +1,102 @@ +--- +id: schema-get-started +title: Get started +sidebar_label: "Get started" +original_id: schema-get-started +--- + +This chapter introduces Pulsar schemas and explains why they are important. + +## Schema Registry + +Type safety is extremely important in any application built around a message bus like Pulsar. + +Producers and consumers need some kind of mechanism for coordinating types at the topic level to avoid various potential problems, for example, serialization and deserialization issues. + +Applications typically adopt one of the following approaches to guarantee type safety in messaging. Both approaches are available in Pulsar, and you're free to adopt one or the other or to mix and match on a per-topic basis. + +#### Note +> +> Currently, the Pulsar schema registry is only available for the [Java client](client-libraries-java.md), [CGo client](client-libraries-cgo.md), [Python client](client-libraries-python.md), and [C++ client](client-libraries-cpp.md). + +### Client-side approach + +Producers and consumers are responsible for not only serializing and deserializing messages (which consist of raw bytes) but also "knowing" which types are being transmitted via which topics. + +If a producer is sending temperature sensor data on the topic `topic-1`, consumers of that topic will run into trouble if they attempt to parse that data as moisture sensor readings. 
+ +Producers and consumers can send and receive messages consisting of raw byte arrays and leave all type safety enforcement to the application on an "out-of-band" basis. + +### Server-side approach + +Producers and consumers inform the system which data types can be transmitted via the topic. + +With this approach, the messaging system enforces type safety and ensures that producers and consumers remain synced. + +Pulsar has a built-in **schema registry** that enables clients to upload data schemas on a per-topic basis. Those schemas dictate which data types are recognized as valid for that topic. + +## Why use schema + +When a schema is enabled, Pulsar does parse data, it takes bytes as inputs and sends bytes as outputs. While data has meaning beyond bytes, you need to parse data and might encounter parse exceptions which mainly occur in the following situations: + +* The field does not exist + +* The field type has changed (for example, `string` is changed to `int`) + +There are a few methods to prevent and overcome these exceptions, for example, you can catch exceptions when parsing errors, which makes code hard to maintain; or you can adopt a schema management system to perform schema evolution, not to break downstream applications, and enforces type safety to max extend in the language you are using, the solution is Pulsar Schema. + +Pulsar schema enables you to use language-specific types of data when constructing and handling messages from simple types like `string` to more complex application-specific types. + +**Example** + +You can use the _User_ class to define the messages sent to Pulsar topics. + +``` + +public class User { + String name; + int age; +} + +``` + +When constructing a producer with the _User_ class, you can specify a schema or not as below. + +### Without schema + +If you construct a producer without specifying a schema, then the producer can only produce messages of type `byte[]`. 
If you have a POJO class, you need to serialize the POJO into bytes before sending messages. + +**Example** + +``` + +Producer producer = client.newProducer() + .topic(topic) + .create(); +User user = new User("Tom", 28); +byte[] message = … // serialize the `user` by yourself; +producer.send(message); + +``` + +### With schema + +If you construct a producer with specifying a schema, then you can send a class to a topic directly without worrying about how to serialize POJOs into bytes. + +**Example** + +This example constructs a producer with the _JSONSchema_, and you can send the _User_ class to topics directly without worrying about how to serialize it into bytes. + +``` + +Producer producer = client.newProducer(JSONSchema.of(User.class)) + .topic(topic) + .create(); +User user = new User("Tom", 28); +producer.send(user); + +``` + +### Summary + +When constructing a producer with a schema, you do not need to serialize messages into bytes, instead Pulsar schema does this job in the background. diff --git a/site2/website/versioned_docs/version-2.9.x/schema-manage.md b/site2/website/versioned_docs/version-2.9.x/schema-manage.md new file mode 100644 index 0000000000000..c588aae619eee --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/schema-manage.md @@ -0,0 +1,639 @@ +--- +id: schema-manage +title: Manage schema +sidebar_label: "Manage schema" +original_id: schema-manage +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This guide demonstrates the ways to manage schemas: + +* Automatically + + * [Schema AutoUpdate](#schema-autoupdate) + +* Manually + + * [Schema manual management](#schema-manual-management) + + * [Custom schema storage](#custom-schema-storage) + +## Schema AutoUpdate + +If a schema passes the schema compatibility check, Pulsar producer automatically updates this schema to the topic it produces by default. 
+ +### AutoUpdate for producer + +For a producer, the `AutoUpdate` happens in the following cases: + +* If a **topic doesn’t have a schema**, Pulsar registers a schema automatically. + +* If a **topic has a schema**: + + * If a **producer doesn’t carry a schema**: + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **disabled** in the namespace to which the topic belongs, the producer is allowed to connect to the topic and produce data. + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **enabled** in the namespace to which the topic belongs, the producer is rejected and disconnected. + + * If a **producer carries a schema**: + + A broker performs the compatibility check based on the configured compatibility check strategy of the namespace to which the topic belongs. + + * If the schema is registered, a producer is connected to a broker. + + * If the schema is not registered: + + * If `isAllowAutoUpdateSchema` sets to **false**, the producer is rejected to connect to a broker. + + * If `isAllowAutoUpdateSchema` sets to **true**: + + * If the schema passes the compatibility check, then the broker registers a new schema automatically for the topic and the producer is connected. + + * If the schema does not pass the compatibility check, then the broker does not register a schema and the producer is rejected to connect to a broker. + +![AutoUpdate Producer](/assets/schema-producer.png) + +### AutoUpdate for consumer + +For a consumer, the `AutoUpdate` happens in the following cases: + +* If a **consumer connects to a topic without a schema** (which means the consumer receiving raw bytes), the consumer can connect to the topic successfully without doing any compatibility check. + +* If a **consumer connects to a topic with a schema**. 
+ + * If a topic does not have all of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` sets to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` sets to **false**, then the consumer is rejected to connect to a broker. + + * If a topic has one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is rejected to connect to the broker. + +![AutoUpdate Consumer](/assets/schema-consumer.png) + + +### Manage AutoUpdate strategy + +You can use the `pulsar-admin` command to manage the `AutoUpdate` strategy as below: + +* [Enable AutoUpdate](#enable-autoupdate) + +* [Disable AutoUpdate](#disable-autoupdate) + +* [Adjust compatibility](#adjust-compatibility) + +#### Enable AutoUpdate + +To enable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --enable tenant/namespace + +``` + +#### Disable AutoUpdate + +To disable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --disable tenant/namespace + +``` + +Once the `AutoUpdate` is disabled, you can only register a new schema using the `pulsar-admin` command. + +#### Adjust compatibility + +To adjust the schema compatibility level on a namespace, you can use the `pulsar-admin` command. 
+ +```bash + +bin/pulsar-admin namespaces set-schema-compatibility-strategy --compatibility <compatibility-level> tenant/namespace + +``` + +### Schema validation + +By default, `schemaValidationEnforced` is **disabled** for producers: + +* This means a producer without a schema can produce any kind of messages to a topic with schemas, which may result in producing trash data to the topic. + +* This allows non-Java language clients that don’t support schemas to produce messages to a topic with schemas. + +However, if you want a stronger guarantee on the topics with schemas, you can enable `schemaValidationEnforced` across the whole cluster or on a per-namespace basis. + +#### Enable schema validation + +To enable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-schema-validation-enforce --enable tenant/namespace + +``` + +#### Disable schema validation + +To disable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash + +bin/pulsar-admin namespaces set-schema-validation-enforce --disable tenant/namespace + +``` + +## Schema manual management + +To manage schemas, you can use one of the following methods. + +| Method | Description | +| --- | --- | +| **Admin CLI**
  • | You can use the `pulsar-admin` tool to manage Pulsar schemas, brokers, clusters, sources, sinks, topics, tenants and so on. For more information about how to use the `pulsar-admin` tool, see [here](reference-pulsar-admin.md). | +| **REST API**
  • | Pulsar exposes schema related management API in Pulsar’s admin RESTful API. You can access the admin RESTful endpoint directly to manage schemas. For more information about how to use the Pulsar REST API, see [here](http://pulsar.apache.org/admin-rest-api/). | +| **Java Admin API**
  • | Pulsar provides Java admin library. | + +### Upload a schema + +To upload (register) a new schema for a topic, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `upload` subcommand. + +```bash + +$ pulsar-admin schemas upload --filename + +``` + +The `schema-definition-file` is in JSON format. + +```json + +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} + +``` + +The `schema-definition-file` includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +Here are examples of the `schema-definition-file` for a JSON schema. + +**Example 1** + +```json + +{ + "type": "JSON", + "schema": "{\"type\":\"record\",\"name\":\"User\",\"namespace\":\"com.foo\",\"fields\":[{\"name\":\"file1\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"file2\",\"type\":\"string\",\"default\":null},{\"name\":\"file3\",\"type\":[\"null\",\"string\"],\"default\":\"dfdf\"}]}", + "properties": {} +} + +``` + +**Example 2** + +```json + +{ + "type": "STRING", + "schema": "", + "properties": { + "key1": "value1" + } +} + +``` + +
    + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/uploadSchema?version=@pulsar:version_number@} + +The post payload is in JSON format. + +```json + +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} + +``` + +The post payload includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
    + + +```java + +void createSchema(String topic, PostSchemaPayload schemaPayload) + +``` + +The `PostSchemaPayload` includes the following fields: + +| Field | Description | +| --- | --- | +| `type` | The schema type. | +| `schema` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `PostSchemaPayload`: + +```java + +PulsarAdmin admin = …; + +PostSchemaPayload payload = new PostSchemaPayload(); +payload.setType("INT8"); +payload.setSchema(""); + +admin.createSchema("my-tenant/my-ns/my-topic", payload); + +``` + +
    + +
    +```` + +### Get a schema (latest) + +To get the latest schema for a topic, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `get` subcommand. + +```bash + +$ pulsar-admin schemas get + +{ + "version": 0, + "type": "String", + "timestamp": 0, + "data": "string", + "properties": { + "property1": "string", + "property2": "string" + } +} + +``` + + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/getSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} + +``` + +The response includes the following fields: + +| Field | Description | +| --- | --- | +| `version` | The schema version, which is a long number. | +| `type` | The schema type. | +| `timestamp` | The timestamp of creating this version of schema. | +| `data` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
    + + +```java + +SchemaInfo getSchema(String topic) + +``` + +The `SchemaInfo` includes the following fields: + +| Field | Description | +| --- | --- | +| `name` | The schema name. | +| `type` | The schema type. | +| `schema` | A byte array of the schema definition data, which is encoded in UTF-8 charset.
  • If the schema is a
  • **primitive**
  • schema, this byte array should be empty.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition converted to a byte array.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `SchemaInfo`: + +```java + +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic"); + +``` + +
    + +
    +```` + +### Get a schema (specific) + +To get a specific version of a schema, you can use one of the following methods. + +````mdx-code-block + + + + +Use the `get` subcommand. + +```bash + +$ pulsar-admin schemas get --version= + +``` + + + + +Send a `GET` request to a schema endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema/:version|operation/getSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} + +``` + +The response includes the following fields: + +| Field | Description | +| --- | --- | +| `version` | The schema version, which is a long number. | +| `type` | The schema type. | +| `timestamp` | The timestamp of creating this version of schema. | +| `data` | The schema definition data, which is encoded in UTF 8 charset.
  • If the schema is a
  • **primitive**
  • schema, this field should be blank.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition.
  • | +| `properties` | The additional properties associated with the schema. | + +
    + + +```java + +SchemaInfo getSchema(String topic, long version) + +``` + +The `SchemaInfo` includes the following fields: + +| Field | Description | +| --- | --- | +| `name` | The schema name. | +| `type` | The schema type. | +| `schema` | A byte array of the schema definition data, which is encoded in UTF-8.
  • If the schema is a
  • **primitive**
  • schema, this byte array should be empty.
  • If the schema is a
  • **struct**
  • schema, this field should be a JSON string of the Avro schema definition converted to a byte array.
  • | +| `properties` | The additional properties associated with the schema. | + +Here is an example of `SchemaInfo`: + +```java + +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic", 1L); + +``` + +
    + +
    +```` + +### Extract a schema + +To provide a schema via a topic, you can use the following method. + +````mdx-code-block + + + + +Use the `extract` subcommand. + +```bash + +$ pulsar-admin schemas extract --classname --jar --type + +``` + + + + +```` + +### Delete a schema + +To delete a schema for a topic, you can use one of the following methods. + +:::note + +In any case, the **delete** action deletes **all versions** of a schema registered for a topic. + +::: + +````mdx-code-block + + + + +Use the `delete` subcommand. + +```bash + +$ pulsar-admin schemas delete + +``` + + + + +Send a `DELETE` request to a schema endpoint: {@inject: endpoint|DELETE|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/deleteSchema?version=@pulsar:version_number@} + +Here is an example of a response, which is returned in JSON format. + +```json + +{ + "version": "", +} + +``` + +The response includes the following field: + +Field | Description | +---|---| +`version` | The schema version, which is a long number. | + + + + +```java + +void deleteSchema(String topic) + +``` + +Here is an example of deleting a schema. + +```java + +PulsarAdmin admin = …; + +admin.deleteSchema("my-tenant/my-ns/my-topic"); + +``` + + + + +```` + +## Custom schema storage + +By default, Pulsar stores various data types of schemas in [Apache BookKeeper](https://bookkeeper.apache.org) deployed alongside Pulsar. + +However, you can use another storage system if needed. 
+ +### Implement + +To use a non-default (non-BookKeeper) storage system for Pulsar schemas, you need to implement the following Java interfaces: + +* [SchemaStorage interface](#schemastorage-interface) + +* [SchemaStorageFactory interface](#schemastoragefactory-interface) + +#### SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java + +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} + +``` + +:::tip + +For a complete example of **schema storage** implementation, see [BookKeeperSchemaStorage](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. + +::: + +#### SchemaStorageFactory interface + +The `SchemaStorageFactory` interface has the following method: + +```java + +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} + +``` + +:::tip + +For a complete example of **schema storage factory** implementation, see [BookKeeperSchemaStorageFactory](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +::: + +### Deploy + +To use your custom schema storage implementation, perform the following steps. + +1. 
Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. + +2. Add the JAR file to the `lib` folder in your Pulsar binary or source distribution. + +3. Change the `schemaRegistryStorageClassName` configuration in `broker.conf` to your custom factory class. + +4. Start Pulsar. diff --git a/site2/website/versioned_docs/version-2.9.x/schema-understand.md b/site2/website/versioned_docs/version-2.9.x/schema-understand.md new file mode 100644 index 0000000000000..55bc662c66633 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/schema-understand.md @@ -0,0 +1,576 @@ +--- +id: schema-understand +title: Understand schema +sidebar_label: "Understand schema" +original_id: schema-understand +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +This chapter explains the basic concepts of Pulsar schema, focuses on the topics of particular importance, and provides additional background. + +## SchemaInfo + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +The `SchemaInfo` is stored and enforced on a per-topic basis and cannot be stored at the namespace or tenant level. + +A `SchemaInfo` consists of the following fields: + +| Field | Description | +| --- | --- | +| `name` | Schema name (a string). | +| `type` | Schema type, which determines how to interpret the schema data.
  • Predefined schema: see [here](schema-understand.md#schema-type).
  • Customized schema: it is left as an empty string.
  • | +| `schema`(`payload`) | Schema data, which is a sequence of 8-bit unsigned bytes and schema-type specific. | +| `properties` | It is a user defined properties as a string/string map. Applications can use this bag for carrying any application specific logics. Possible properties might be the Git hash associated with the schema, an environment string like `dev` or `prod`. | + +**Example** + +This is the `SchemaInfo` of a string. + +```json + +{ + "name": "test-string-schema", + "type": "STRING", + "schema": "", + "properties": {} +} + +``` + +## Schema type + +Pulsar supports various schema types, which are mainly divided into two categories: + +* Primitive type + +* Complex type + +### Primitive type + +Currently, Pulsar supports the following primitive types: + +| Primitive Type | Description | +|---|---| +| `BOOLEAN` | A binary value | +| `INT8` | A 8-bit signed integer | +| `INT16` | A 16-bit signed integer | +| `INT32` | A 32-bit signed integer | +| `INT64` | A 64-bit signed integer | +| `FLOAT` | A single precision (32-bit) IEEE 754 floating-point number | +| `DOUBLE` | A double-precision (64-bit) IEEE 754 floating-point number | +| `BYTES` | A sequence of 8-bit unsigned bytes | +| `STRING` | A Unicode character sequence | +| `TIMESTAMP` (`DATE`, `TIME`) | A logic type represents a specific instant in time with millisecond precision.
    It stores the number of milliseconds since `January 1, 1970, 00:00:00 GMT` as an `INT64` value | +| INSTANT | A single instantaneous point on the time-line with nanosecond precision| +| LOCAL_DATE | An immutable date-time object that represents a date, often viewed as year-month-day| +| LOCAL_TIME | An immutable date-time object that represents a time, often viewed as hour-minute-second. Time is represented to nanosecond precision.| +| LOCAL_DATE_TIME | An immutable date-time object that represents a date-time, often viewed as year-month-day-hour-minute-second | + +For primitive types, Pulsar does not store any schema data in `SchemaInfo`. The `type` in `SchemaInfo` is used to determine how to serialize and deserialize the data. + +Some of the primitive schema implementations can use `properties` to store implementation-specific tunable settings. For example, a `string` schema can use `properties` to store the encoding charset to serialize and deserialize strings. + +The conversions between **Pulsar schema types** and **language-specific primitive types** are as below. + +| Schema Type | Java Type| Python Type | Go Type | +|---|---|---|---| +| BOOLEAN | boolean | bool | bool | +| INT8 | byte | | int8 | +| INT16 | short | | int16 | +| INT32 | int | | int32 | +| INT64 | long | | int64 | +| FLOAT | float | float | float32 | +| DOUBLE | double | float | float64| +| BYTES | byte[], ByteBuffer, ByteBuf | bytes | []byte | +| STRING | string | str | string| +| TIMESTAMP | java.sql.Timestamp | | | +| TIME | java.sql.Time | | | +| DATE | java.util.Date | | | +| INSTANT | java.time.Instant | | | +| LOCAL_DATE | java.time.LocalDate | | | +| LOCAL_TIME | java.time.LocalTime | | | +| LOCAL_DATE_TIME | java.time.LocalDateTime | | | + +**Example** + +This example demonstrates how to use a string schema. + +1. Create a producer with a string schema and send messages. 
+ + ```java + + Producer producer = client.newProducer(Schema.STRING).create(); + producer.newMessage().value("Hello Pulsar!").send(); + + ``` + +2. Create a consumer with a string schema and receive messages. + + ```java + + Consumer consumer = client.newConsumer(Schema.STRING).subscribe(); + consumer.receive(); + + ``` + +### Complex type + +Currently, Pulsar supports the following complex types: + +| Complex Type | Description | +|---|---| +| `keyvalue` | Represents a complex type of a key/value pair. | +| `struct` | Handles structured data. It supports `AvroBaseStructSchema` and `ProtobufNativeSchema`. | + +#### keyvalue + +`Keyvalue` schema helps applications define schemas for both key and value. + +For `SchemaInfo` of `keyvalue` schema, Pulsar stores the `SchemaInfo` of key schema and the `SchemaInfo` of value schema together. + +Pulsar provides the following methods to encode a key/value pair in messages: + +* `INLINE` + +* `SEPARATED` + +You can choose the encoding type when constructing the key/value schema. + +````mdx-code-block + + + + +Key/value pairs are encoded together in the message payload. + + + + +Key is encoded in the message key and the value is encoded in the message payload. + +**Example** + +This example shows how to construct a key/value schema and then use it to produce and consume messages. + +1. Construct a key/value schema with `INLINE` encoding type. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.INLINE + ); + + ``` + +2. Optionally, construct a key/value schema with `SEPARATED` encoding type. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + ``` + +3. Produce messages using a key/value schema. 
+ + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Producer> producer = client.newProducer(kvSchema) + .topic(TOPIC) + .create(); + + final int key = 100; + final String value = "value-100"; + + // send the key/value message + producer.newMessage() + .value(new KeyValue(key, value)) + .send(); + + ``` + +4. Consume messages using a key/value schema. + + ```java + + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Consumer> consumer = client.newConsumer(kvSchema) + ... + .topic(TOPIC) + .subscriptionName(SubscriptionName).subscribe(); + + // receive key/value pair + Message> msg = consumer.receive(); + KeyValue kv = msg.getValue(); + + ``` + + + + +```` + +#### struct + +This section describes the details of type and usage of the `struct` schema. + +##### Type + +`struct` schema supports `AvroBaseStructSchema` and `ProtobufNativeSchema`. + +|Type|Description| +---|---| +`AvroBaseStructSchema`|Pulsar uses [Avro Specification](http://avro.apache.org/docs/current/spec.html) to declare the schema definition for `AvroBaseStructSchema`, which supports `AvroSchema`, `JsonSchema`, and `ProtobufSchema`.

    This allows Pulsar:
    - to use the same tools to manage schema definitions
    - to use different serialization or deserialization methods to handle data| +`ProtobufNativeSchema`|`ProtobufNativeSchema` is based on protobuf native Descriptor.

    This allows Pulsar:
    - to use native protobuf-v3 to serialize or deserialize data
    - to use `AutoConsume` to deserialize data. + +##### Usage + +Pulsar provides the following methods to use the `struct` schema: + +* `static` + +* `generic` + +* `SchemaDefinition` + +````mdx-code-block + + + + +You can predefine the `struct` schema, which can be a POJO in Java, a `struct` in Go, or classes generated by Avro or Protobuf tools. + +**Example** + +Pulsar gets the schema definition from the predefined `struct` using an Avro library. The schema definition is the schema data stored as a part of the `SchemaInfo`. + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```java + + @Builder + @AllArgsConstructor + @NoArgsConstructor + public static class User { + String name; + int age; + } + + ``` + +2. Create a producer with a `struct` schema and send messages. + + ```java + + Producer producer = client.newProducer(Schema.AVRO(User.class)).create(); + producer.newMessage().value(User.builder().name("pulsar-user").age(1).build()).send(); + + ``` + +3. Create a consumer with a `struct` schema and receive messages + + ```java + + Consumer consumer = client.newConsumer(Schema.AVRO(User.class)).subscribe(); + User user = consumer.receive(); + + ``` + + + + +Sometimes applications do not have pre-defined structs, and you can use this method to define schema and access data. + +You can define the `struct` schema using the `GenericSchemaBuilder`, generate a generic struct using `GenericRecordBuilder` and consume messages into `GenericRecord`. + +**Example** + +1. Use `RecordSchemaBuilder` to build a schema. + + ```java + + RecordSchemaBuilder recordSchemaBuilder = SchemaBuilder.record("schemaName"); + recordSchemaBuilder.field("intField").type(SchemaType.INT32); + SchemaInfo schemaInfo = recordSchemaBuilder.build(SchemaType.AVRO); + + Producer producer = client.newProducer(Schema.generic(schemaInfo)).create(); + + ``` + +2. Use `RecordBuilder` to build the struct records. 
+ + ```java + + producer.newMessage().value(schema.newRecordBuilder() + .set("intField", 32) + .build()).send(); + + ``` + + + + +You can define the `schemaDefinition` to generate a `struct` schema. + +**Example** + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```java + + @Builder + @AllArgsConstructor + @NoArgsConstructor + public static class User { + String name; + int age; + } + + ``` + +2. Create a producer with a `SchemaDefinition` and send messages. + + ```java + + SchemaDefinition schemaDefinition = SchemaDefinition.builder().withPojo(User.class).build(); + Producer producer = client.newProducer(Schema.AVRO(schemaDefinition)).create(); + producer.newMessage().value(User.builder().name("pulsar-user").age(1).build()).send(); + + ``` + +3. Create a consumer with a `SchemaDefinition` schema and receive messages + + ```java + + SchemaDefinition schemaDefinition = SchemaDefinition.builder().withPojo(User.class).build(); + Consumer consumer = client.newConsumer(Schema.AVRO(schemaDefinition)).subscribe(); + User user = consumer.receive().getValue(); + + ``` + + + + +```` + +### Auto Schema + +If you don't know the schema type of a Pulsar topic in advance, you can use AUTO schema to produce or consume generic records to or from brokers. + +| Auto Schema Type | Description | +|---|---| +| `AUTO_PRODUCE` | This is useful for transferring data **from a producer to a Pulsar topic that has a schema**. | +| `AUTO_CONSUME` | This is useful for transferring data **from a Pulsar topic that has a schema to a consumer**. | + +#### AUTO_PRODUCE + +`AUTO_PRODUCE` schema helps a producer validate whether the bytes sent by the producer is compatible with the schema of a topic. + +**Example** + +Suppose that: + +* You have a producer processing messages from a Kafka topic _K_. + +* You have a Pulsar topic _P_, and you do not know its schema type. + +* Your application reads the messages from _K_ and writes the messages to _P_. 
+ +In this case, you can use `AUTO_PRODUCE` to verify whether the bytes produced by _K_ can be sent to _P_ or not. + +```java + +Produce pulsarProducer = client.newProducer(Schema.AUTO_PRODUCE()) + … + .create(); + +byte[] kafkaMessageBytes = … ; + +pulsarProducer.produce(kafkaMessageBytes); + +``` + +#### AUTO_CONSUME + +`AUTO_CONSUME` schema helps a Pulsar topic validate whether the bytes sent by a Pulsar topic is compatible with a consumer, that is, the Pulsar topic deserializes messages into language-specific objects using the `SchemaInfo` retrieved from broker-side. + +Currently, `AUTO_CONSUME` supports AVRO, JSON and ProtobufNativeSchema schemas. It deserializes messages into `GenericRecord`. + +**Example** + +Suppose that: + +* You have a Pulsar topic _P_. + +* You have a consumer (for example, MySQL) receiving messages from the topic _P_. + +* Your application reads the messages from _P_ and writes the messages to MySQL. + +In this case, you can use `AUTO_CONSUME` to verify whether the bytes produced by _P_ can be sent to MySQL or not. + +```java + +Consumer pulsarConsumer = client.newConsumer(Schema.AUTO_CONSUME()) + … + .subscribe(); + +Message msg = consumer.receive() ; +GenericRecord record = msg.getValue(); + +``` + +### Native Avro Schema + +When migrating or ingesting event or message data from external systems (such as Kafka and Cassandra), the events are often already serialized in Avro format. The applications producing the data typically have validated the data against their schemas (including compatibility checks) and stored them in a database or a dedicated service (such as a schema registry). The schema of each serialized data record is usually retrievable by some metadata attached to that record. In such cases, a Pulsar producer doesn't need to repeat the schema validation step when sending the ingested events to a topic. All it needs to do is passing each message or event with its schema to Pulsar. 
+ +Hence, we provide `Schema.NATIVE_AVRO` to wrap a native Avro schema of type `org.apache.avro.Schema`. The result is a schema instance of Pulsar that accepts a serialized Avro payload without validating it against the wrapped Avro schema. + +**Example** + +```java + +org.apache.avro.Schema nativeAvroSchema = … ; + +Producer producer = pulsarClient.newProducer().topic("ingress").create(); + +byte[] content = … ; + +producer.newMessage(Schema.NATIVE_AVRO(nativeAvroSchema)).value(content).send(); + +``` + +## Schema version + +Each `SchemaInfo` stored with a topic has a version. Schema version manages schema changes happening within a topic. + +Messages produced with a given `SchemaInfo` is tagged with a schema version, so when a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and then use the `SchemaInfo` to deserialize data. + +Schemas are versioned in succession. Schema storage happens in a broker that handles the associated topics so that version assignments can be made. + +Once a version is assigned/fetched to/for a schema, all subsequent messages produced by that producer are tagged with the appropriate version. + +**Example** + +The following example illustrates how the schema version works. + +Suppose that a Pulsar [Java client](client-libraries-java.md) created using the code below attempts to connect to Pulsar and begins to send messages: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-data") + .sendTimeout(3, TimeUnit.SECONDS) + .create(); + +``` + +The table below lists the possible scenarios when this connection attempt occurs and what happens in each scenario: + +| Scenario | What happens | +| --- | --- | +|
  • No schema exists for the topic.
  • | (1) The producer is created using the given schema. (2) Since no existing schema is compatible with the `SensorReading` schema, the schema is transmitted to the broker and stored. (3) Any consumer created using the same schema or topic can consume messages from the `sensor-data` topic. | +|
  • A schema already exists.
  • The producer connects using the same schema that is already stored.
  • | (1) The schema is transmitted to the broker. (2) The broker determines that the schema is compatible. (3) The broker attempts to store the schema in [BookKeeper](concepts-architecture-overview.md#persistent-storage) but then determines that it's already stored, so it is used to tag produced messages. |
  • A schema already exists.
  • The producer connects using a new schema that is compatible.
  • | (1) The schema is transmitted to the broker. (2) The broker determines that the schema is compatible and stores the new schema as the current version (with a new version number). | + +## How does schema work + +Pulsar schemas are applied and enforced at the **topic** level (schemas cannot be applied at the namespace or tenant level). + +Producers and consumers upload schemas to brokers, so Pulsar schemas work on the producer side and the consumer side. + +### Producer side + +This diagram illustrates how schema works on the producer side. + +![Schema works at the producer side](/assets/schema-producer.png) + +1. The application uses a schema instance to construct a producer instance. + + The schema instance defines the schema for the data being produced using the producer instance. + + Taking AVRO as an example, Pulsar extracts the schema definition from the POJO class and constructs the `SchemaInfo` that the producer needs to pass to a broker when it connects. + +2. The producer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker looks up the schema in the schema storage to check if it is already a registered schema. + +4. If yes, the broker skips the schema validation since it is a known schema, and returns the schema version to the producer. + +5. If no, the broker verifies whether a schema can be automatically created in this namespace: + + * If `isAllowAutoUpdateSchema` is set to **true**, then a schema can be created, and the broker validates the schema based on the schema compatibility check strategy defined for the topic. + + * If `isAllowAutoUpdateSchema` is set to **false**, then a schema cannot be created, and the producer is not allowed to connect to the broker. + +**Tip**: + +`isAllowAutoUpdateSchema` can be set via **Pulsar admin API** or **REST API.** + +For how to set `isAllowAutoUpdateSchema` via Pulsar admin API, see [Manage AutoUpdate Strategy](schema-manage.md/#manage-autoupdate-strategy). 
+ +6. If the schema is allowed to be updated, then the compatibility strategy check is performed. + + * If the schema is compatible, the broker stores it and returns the schema version to the producer. + + All the messages produced by this producer are tagged with the schema version. + + * If the schema is incompatible, the broker rejects it. + +### Consumer side + +This diagram illustrates how schema works on the consumer side. + +![Schema works at the consumer side](/assets/schema-consumer.png) + +1. The application uses a schema instance to construct a consumer instance. + + The schema instance defines the schema that the consumer uses for decoding messages received from a broker. + +2. The consumer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker determines whether the topic has at least one of them (a schema/data/a local consumer and a local producer). + +4. If a topic has none of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` is set to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` is set to **false**, then the consumer is not allowed to connect to a broker. + +5. If a topic has at least one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is not allowed to connect to the broker. + +6. The consumer receives messages from the broker. + + If the schema used by the consumer supports schema versioning (for example, AVRO schema), the consumer fetches the `SchemaInfo` of the version tagged in messages and uses the passed-in schema and the schema tagged in messages to decode the messages. 
diff --git a/site2/website/versioned_docs/version-2.9.x/security-athenz.md b/site2/website/versioned_docs/version-2.9.x/security-athenz.md new file mode 100644 index 0000000000000..8a39fe25316d0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-athenz.md @@ -0,0 +1,98 @@ +--- +id: security-athenz +title: Authentication using Athenz +sidebar_label: "Authentication using Athenz" +original_id: security-athenz +--- + +[Athenz](https://github.com/AthenZ/athenz) is a role-based authentication/authorization system. In Pulsar, you can use Athenz role tokens (also known as *z-tokens*) to establish the identity of the client. + +## Athenz authentication settings + +A [decentralized Athenz system](https://github.com/AthenZ/athenz/blob/master/docs/decent_authz_flow.md) contains an [authori**Z**ation **M**anagement **S**ystem](https://github.com/AthenZ/athenz/blob/master/docs/setup_zms.md) (ZMS) server and an [authori**Z**ation **T**oken **S**ystem](https://github.com/AthenZ/athenz/blob/master/docs/setup_zts) (ZTS) server. + +To begin, you need to set up Athenz service access control. You need to create domains for the *provider* (which provides some resources to other services with some authentication/authorization policies) and the *tenant* (which is provisioned to access some resources in a provider). In this case, the provider corresponds to the Pulsar service itself and the tenant corresponds to each application using Pulsar (typically, a [tenant](reference-terminology.md#tenant) in Pulsar). + +### Create the tenant domain and service + +On the [tenant](reference-terminology.md#tenant) side, you need to do the following things: + +1. Create a domain, such as `shopping` +2. Generate a private/public key pair +3. 
Create a service, such as `some_app`, on the domain with the public key + +Note that you need to specify the private key generated in step 2 when the Pulsar client connects to the [broker](reference-terminology.md#broker) (see client configuration examples for [Java](client-libraries-java.md#tls-authentication) and [C++](client-libraries-cpp.md#tls-authentication)). + +For more specific steps involving the Athenz UI, refer to [Example Service Access Control Setup](https://github.com/AthenZ/athenz/blob/master/docs/example_service_athenz_setup.md#client-tenant-domain). + +### Create the provider domain and add the tenant service to some role members + +On the provider side, you need to do the following things: + +1. Create a domain, such as `pulsar` +2. Create a role +3. Add the tenant service to members of the role + +Note that you can specify any action and resource in step 2 since they are not used on Pulsar. In other words, Pulsar uses the Athenz role token only for authentication, *not* for authorization. + +For more specific steps involving UI, refer to [Example Service Access Control Setup](https://github.com/AthenZ/athenz/blob/master/docs/example_service_athenz_setup.md#server-provider-domain). + +## Configure the broker for Athenz + +> ### TLS encryption +> +> Note that when you are using Athenz as an authentication provider, you had better use TLS encryption +> as it can protect role tokens from being intercepted and reused. (for more details involving TLS encryption see [Architecture - Data Model](https://github.com/AthenZ/athenz/blob/master/docs/data_model)). + +In the `conf/broker.conf` configuration file in your Pulsar installation, you need to provide the class name of the Athenz authentication provider as well as a comma-separated list of provider domain names. 
+ +```properties + +# Add the Athenz auth provider +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderAthenz +athenzDomainNames=pulsar + +# Enable TLS +tlsEnabled=true +tlsCertificateFilePath=/path/to/broker-cert.pem +tlsKeyFilePath=/path/to/broker-key.pem + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +brokerClientAuthenticationParameters={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} + +``` + +> A full listing of parameters is available in the `conf/broker.conf` file, you can also find the default +> values for those parameters in [Broker Configuration](reference-configuration.md#broker). + +## Configure clients for Athenz + +For more information on Pulsar client authentication using Athenz, see the following language-specific docs: + +* [Java client](client-libraries-java.md#athenz) + +## Configure CLI tools for Athenz + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ +You need to add the following authentication parameters to the `conf/client.conf` config file to use Athenz with CLI tools of Pulsar: + +```properties + +# URL for the broker +serviceUrl=https://broker.example.com:8443/ + +# Set Athenz auth plugin and its parameters +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +authParams={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} + +# Enable TLS +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/cacert.pem + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/security-authorization.md b/site2/website/versioned_docs/version-2.9.x/security-authorization.md new file mode 100644 index 0000000000000..5b0c356602746 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-authorization.md @@ -0,0 +1,133 @@ +--- +id: security-authorization +title: Authentication and authorization in Pulsar +sidebar_label: "Authorization and ACLs" +original_id: security-authorization +--- + + +In Pulsar, the [authentication provider](security-overview.md#authentication-providers) is responsible for properly identifying clients and associating the clients with [role tokens](security-overview.md#role-tokens). If you only enable authentication, an authenticated role token has the ability to access all resources in the cluster. *Authorization* is the process that determines *what* clients are able to do. + +The role tokens with the most privileges are the *superusers*. The *superusers* can create and destroy tenants, along with having full access to all tenant resources. + +When a superuser creates a [tenant](reference-terminology.md#tenant), that tenant is assigned an admin role. A client with the admin role token can then create, modify and destroy namespaces, and grant and revoke permissions to *other role tokens* on those namespaces. 
+ +## Broker and Proxy Setup + +### Enable authorization and assign superusers +You can enable the authorization and assign the superusers in the broker ([`conf/broker.conf`](reference-configuration.md#broker)) configuration files. + +```properties + +authorizationEnabled=true +superUserRoles=my-super-user-1,my-super-user-2 + +``` + +> A full list of parameters is available in the `conf/broker.conf` file. +> You can also find the default values for those parameters in [Broker Configuration](reference-configuration.md#broker). + +Typically, you use superuser roles for administrators, clients as well as broker-to-broker authorization. When you use [geo-replication](concepts-replication.md), every broker needs to be able to publish to the topics of all the other clusters. + +You can also enable the authorization for the proxy in the proxy configuration file (`conf/proxy.conf`). Once you enable the authorization on the proxy, the proxy does an additional authorization check before forwarding the request to a broker. +If you enable authorization on the broker, the broker checks the authorization of the request when the broker receives the forwarded request. + +### Proxy Roles + +By default, the broker treats the connection between a proxy and the broker as a normal user connection. The broker authenticates the user as the role configured in `proxy.conf`(see ["Enable TLS Authentication on Proxies"](security-tls-authentication.md#enable-tls-authentication-on-proxies)). However, when the user connects to the cluster through a proxy, this is rarely the behavior that the user wants. The user expects to be able to interact with the cluster as the role for which they have authenticated with the proxy. + +Pulsar uses *proxy roles* to enable this behavior. Proxy roles are specified in the broker configuration file, [`conf/broker.conf`](reference-configuration.md#broker). 
If a client that is authenticated with a broker is one of its ```proxyRoles```, all requests from that client must also carry information about the role of the client that is authenticated with the proxy. This information is called the *original principal*. If the *original principal* is absent, the client is not able to access anything. + +You must authorize both the *proxy role* and the *original principal* to access a resource to ensure that the resource is accessible via the proxy. Administrators can take two approaches to authorize the *proxy role* and the *original principal*. + +The more secure approach is to grant access to the proxy roles each time you grant access to a resource. For example, if you have a proxy role named `proxy1`, when the superuser creates a tenant, you should specify `proxy1` as one of the admin roles. When a role is granted permissions to produce or consume from a namespace, if that client wants to produce or consume through a proxy, you should also grant `proxy1` the same permissions. + +Another approach is to make the proxy role a superuser. This allows the proxy to access all resources. The client still needs to authenticate with the proxy, and all requests made through the proxy have their role downgraded to the *original principal* of the authenticated client. However, if the proxy is compromised, a bad actor could get full access to your cluster. + +You can specify the roles as proxy roles in [`conf/broker.conf`](reference-configuration.md#broker). + +```properties + +proxyRoles=my-proxy-role + +# if you want to allow superusers to use the proxy (see above) +superUserRoles=my-super-user-1,my-super-user-2,my-proxy-role + +``` + +## Administer tenants + +Pulsar [instance](reference-terminology.md#instance) administrators or some kind of self-service portal typically provisions a Pulsar [tenant](reference-terminology.md#tenant). + +You can manage tenants using the [`pulsar-admin`](reference-pulsar-admin.md) tool. 
+ +### Create a new tenant + +The following is an example tenant creation command: + +```shell + +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east + +``` + +This command creates a new tenant `my-tenant` that is allowed to use the clusters `us-west` and `us-east`. + +A client that successfully identifies itself as having the role `my-admin-role` is allowed to perform all administrative tasks on this tenant. + +The structure of topic names in Pulsar reflects the hierarchy between tenants, clusters, and namespaces: + +```shell + +persistent://tenant/namespace/topic + +``` + +### Manage permissions + +You can use [Pulsar Admin Tools](admin-api-permissions.md) for managing permissions in Pulsar. + +### Pulsar admin authentication + +```java + +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("http://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .build(); + +``` + +To use TLS: + +```java + +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("https://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .tlsTrustCertsFilePath("/path/to/trust/cert") + .build(); + +``` + +## Authorize an authenticated client with multiple roles + +When a client is identified with multiple roles in a token (the type of role claim in the token is an array) during the authentication process, Pulsar checks the permissions of all the roles and authorizes the client as long as one of its roles has the required permissions. + +:::note + +This authorization method is only compatible with [JWT authentication](security-jwt.md). + +::: + +To enable this authorization method, configure the authorization provider as `MultiRolesTokenAuthorizationProvider` in the `conf/broker.conf` file. 
+ + ```properties + + # Authorization provider fully qualified class-name + authorizationProvider=org.apache.pulsar.broker.authorization.MultiRolesTokenAuthorizationProvider + + ``` + diff --git a/site2/website/versioned_docs/version-2.9.x/security-basic-auth.md b/site2/website/versioned_docs/version-2.9.x/security-basic-auth.md new file mode 100644 index 0000000000000..80cb0edeaec8d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-basic-auth.md @@ -0,0 +1,154 @@ +--- +id: security-basic-auth +title: Authentication using HTTP basic +sidebar_label: "Authentication using HTTP basic" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + +[Basic authentication](https://en.wikipedia.org/wiki/Basic_access_authentication) is a simple authentication scheme built into the HTTP protocol, which uses base64-encoded username and password pairs as credentials. + +## Prerequisites + +Install [`htpasswd`](https://httpd.apache.org/docs/2.4/programs/htpasswd.html) in your environment to create a password file for storing username-password pairs. + +* For Ubuntu/Debian, run the following command to install `htpasswd`. + + ``` + apt install apache2-utils + ``` + +* For CentOS/RHEL, run the following command to install `htpasswd`. + + ``` + yum install httpd-tools + ``` + +## Create your authentication file + +:::note + +Currently, you can use MD5 (recommended) and CRYPT encryption to authenticate your password. 
+ +::: + +Create a password file named `.htpasswd` with a user account `superuser/admin`: +* Use MD5 encryption (recommended): + + ``` + htpasswd -cmb /path/to/.htpasswd superuser admin + ``` + +* Use CRYPT encryption: + + ``` + htpasswd -cdb /path/to/.htpasswd superuser admin + ``` + +You can preview the content of your password file by running the following command: + +``` +cat /path/to/.htpasswd +superuser:$apr1$GBIYZYFZ$MzLcPrvoUky16mLcK6UtX/ +``` + +## Enable basic authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to the `conf/broker.conf` file. If you use a standalone Pulsar, you need to add these parameters to the `conf/standalone.conf` file. + +```conf +# Configuration to enable Basic authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderBasic +basicAuthConf=file:///path/to/.htpasswd +# basicAuthConf=/path/to/.htpasswd +# When you use the base64 format, you need to encode the .htpasswd content to base64 +# basicAuthConf=data:;base64,YOUR-BASE64 +# basicAuthConf=YOUR-BASE64 +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +brokerClientAuthenticationParameters={"userId":"superuser","password":"admin"} +# If this flag is set then the broker authenticates the original Auth data +# else it just accepts the originalPrincipal and authorizes it (if required). +authenticateOriginalAuthData=true +``` + +:::note + +You can also set an environment variable named `PULSAR_EXTRA_OPTS` and the value is `-Dpulsar.auth.basic.conf=/path/to/.htpasswd`. Pulsar reads this environment variable to implement HTTP basic authentication. + +::: + +## Enable basic authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to the `conf/proxy.conf` file. 
+ +```conf +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderBasic +basicAuthConf=file:///path/to/.htpasswd +# basicAuthConf=/path/to/.htpasswd +# When you use the base64 format, you need to encode the .htpasswd content to base64 +# basicAuthConf=data:;base64,YOUR-BASE64 +# basicAuthConf=YOUR-BASE64 +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +brokerClientAuthenticationParameters={"userId":"superuser","password":"admin"} +# Whether client authorization credentials are forwarded to the broker for re-authorization. +# Authentication must be enabled via authenticationEnabled=true for this to take effect. +forwardAuthorizationCredentials=true +``` + +:::note + +You can also set an environment variable named `PULSAR_EXTRA_OPTS` and the value is `-Dpulsar.auth.basic.conf=/path/to/.htpasswd`. Pulsar reads this environment variable to implement HTTP basic authentication. + +::: + +## Configure basic authentication in CLI tools + +[Command-line tools](/docs/next/reference-cli-tools), such as [Pulsar-admin](/tools/pulsar-admin/), [Pulsar-perf](/tools/pulsar-perf/) and [Pulsar-client](/tools/pulsar-client/), use the `conf/client.conf` file in your Pulsar installation. To configure basic authentication in Pulsar CLI tools, you need to add the following parameters to the `conf/client.conf` file. + +```conf +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationBasic +authParams={"userId":"superuser","password":"admin"} +``` + +## Configure basic authentication in Pulsar clients + +The following example shows how to configure basic authentication when using Pulsar clients. 
+ + + + + ```java + AuthenticationBasic auth = new AuthenticationBasic(); + auth.configure("{\"userId\":\"superuser\",\"password\":\"admin\"}"); + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650") + .authentication(auth) + .build(); + ``` + + + + + ```c++ + #include <pulsar/Client.h> + + int main() { + pulsar::ClientConfiguration config; + AuthenticationPtr auth = pulsar::AuthBasic::create("admin", "123456"); + config.setAuth(auth); + pulsar::Client client("pulsar://broker.example.com:6650/", config); + + return 0; + } + ``` + + + diff --git a/site2/website/versioned_docs/version-2.9.x/security-bouncy-castle.md b/site2/website/versioned_docs/version-2.9.x/security-bouncy-castle.md new file mode 100644 index 0000000000000..be937055d8e31 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-bouncy-castle.md @@ -0,0 +1,157 @@ +--- +id: security-bouncy-castle +title: Bouncy Castle Providers +sidebar_label: "Bouncy Castle Providers" +original_id: security-bouncy-castle +--- + +## BouncyCastle Introduce + +`Bouncy Castle` is a Java library that complements the default Java Cryptographic Extension (JCE), +and it provides more cipher suites and algorithms than the default JCE provided by Sun. + +In addition to that, `Bouncy Castle` has lots of utilities for reading arcane formats like PEM and ASN.1 that no sane person would want to rewrite themselves. + +In Pulsar, security and crypto have dependencies on BouncyCastle Jars. For details about installing and configuring Bouncy Castle FIPS, see [BC FIPS Documentation](https://www.bouncycastle.org/documentation.html), especially the **User Guides** and **Security Policy** PDFs. + +`Bouncy Castle` provides both [FIPS](https://www.bouncycastle.org/fips_faq.html) and non-FIPS versions. But in a JVM, you cannot include both versions at the same time; you need to exclude the current version before including the other. 
+ +In Pulsar, the security and crypto methods also depend on `Bouncy Castle`, especially in [TLS Authentication](security-tls-authentication.md) and [Transport Encryption](security-encryption.md). This document contains the configuration between BouncyCastle FIPS(BC-FIPS) and non-FIPS(BC-non-FIPS) version while using Pulsar. + +## How BouncyCastle modules packaged in Pulsar + +In Pulsar's `bouncy-castle` module, we provide 2 sub modules: `bouncy-castle-bc`(for non-FIPS version) and `bouncy-castle-bcfips`(for FIPS version), to package BC jars together to make the include and exclude of `Bouncy Castle` easier. + +To achieve this goal, we will need to package several `bouncy-castle` jars together into `bouncy-castle-bc` or `bouncy-castle-bcfips` jar. +Each of the original bouncy-castle jars is related to security, so BouncyCastle dutifully supplies a signed version of each JAR. +But when we do the re-package, Maven shade explodes the BouncyCastle jar file which puts the signatures into META-INF, +these signatures aren't valid for this new, uber-jar (signatures are only for the original BC jar). +Usually, you will encounter an error like `java.lang.SecurityException: Invalid signature file digest for Manifest main attributes`. + +You could exclude these signatures in mvn pom file to avoid above error, by + +```access transformers + +META-INF/*.SF +META-INF/*.DSA +META-INF/*.RSA + +``` + +But it can also lead to new, cryptic errors, e.g. `java.security.NoSuchAlgorithmException: PBEWithSHA256And256BitAES-CBC-BC SecretKeyFactory not available` +By explicitly specifying where to find the algorithm like this: `SecretKeyFactory.getInstance("PBEWithSHA256And256BitAES-CBC-BC","BC")` +It will get the real error: `java.security.NoSuchProviderException: JCE cannot authenticate the provider BC` + +So, we used an [executable packer plugin](https://github.com/nthuemmel/executable-packer-maven-plugin) that uses a jar-in-jar approach to preserve the BouncyCastle signature in a single, executable jar. 
+ +### Include dependencies of BC-non-FIPS + +Pulsar module `bouncy-castle-bc`, which defined by `bouncy-castle/bc/pom.xml` contains the needed non-FIPS jars for Pulsar, and packaged as a jar-in-jar(need to provide `pkg`). + +```xml + + + org.bouncycastle + bcpkix-jdk15on + ${bouncycastle.version} + + + + org.bouncycastle + bcprov-ext-jdk15on + ${bouncycastle.version} + + +``` + +By using this `bouncy-castle-bc` module, you can easily include and exclude BouncyCastle non-FIPS jars. + +### Modules that include BC-non-FIPS module (`bouncy-castle-bc`) + +For Pulsar client, user need the bouncy-castle module, so `pulsar-client-original` will include the `bouncy-castle-bc` module, and have `pkg` set to reference the `jar-in-jar` package. +It is included as following example: + +```xml + + + org.apache.pulsar + bouncy-castle-bc + ${pulsar.version} + pkg + + +``` + +By default `bouncy-castle-bc` already included in `pulsar-client-original`, And `pulsar-client-original` has been included in a lot of other modules like `pulsar-client-admin`, `pulsar-broker`. +But for the above shaded jar and signatures reason, we should not package Pulsar's `bouncy-castle` module into `pulsar-client-all` other shaded modules directly, such as `pulsar-client-shaded`, `pulsar-client-admin-shaded` and `pulsar-broker-shaded`. +So in the shaded modules, we will exclude the `bouncy-castle` modules. + +```xml + + + + org.apache.pulsar:pulsar-client-original + + ** + + + org/bouncycastle/** + + + + +``` + +That means, `bouncy-castle` related jars are not shaded in these fat jars. + +### Module BC-FIPS (`bouncy-castle-bcfips`) + +Pulsar module `bouncy-castle-bcfips`, which defined by `bouncy-castle/bcfips/pom.xml` contains the needed FIPS jars for Pulsar. +Similar to `bouncy-castle-bc`, `bouncy-castle-bcfips` also packaged as a `jar-in-jar` package for easy include/exclude. 
+ +```xml + + + org.bouncycastle + bc-fips + ${bouncycastlefips.version} + + + + org.bouncycastle + bcpkix-fips + ${bouncycastlefips.version} + + +``` + +### Exclude BC-non-FIPS and include BC-FIPS + +If you want to switch from BC-non-FIPS to BC-FIPS version, Here is an example for `pulsar-broker` module: + +```xml + + + org.apache.pulsar + pulsar-broker + ${pulsar.version} + + + org.apache.pulsar + bouncy-castle-bc + + + + + + org.apache.pulsar + bouncy-castle-bcfips + ${pulsar.version} + pkg + + +``` + + +For more example, you can reference module `bcfips-include-test`. + diff --git a/site2/website/versioned_docs/version-2.9.x/security-encryption.md b/site2/website/versioned_docs/version-2.9.x/security-encryption.md new file mode 100644 index 0000000000000..c2f3530d94d9e --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-encryption.md @@ -0,0 +1,200 @@ +--- +id: security-encryption +title: Pulsar Encryption +sidebar_label: "End-to-End Encryption" +original_id: security-encryption +--- + +Applications can use Pulsar encryption to encrypt messages on the producer side and decrypt messages on the consumer side. You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. + +## Asymmetric and symmetric encryption + +Pulsar uses a dynamically generated symmetric AES key to encrypt messages(data). You can use the application-provided ECDSA (Elliptic Curve Digital Signature Algorithm) or RSA (Rivest–Shamir–Adleman) key pair to encrypt the AES key(data key), so you do not have to share the secret with everyone. + +Key is a public and private key pair used for encryption or decryption. The producer key is the public key of the key pair, and the consumer key is the private key of the key pair. + +The application configures the producer with the public key. You can use this key to encrypt the AES data key. 
The encrypted data key is sent as part of message header. Only entities with the private key (in this case the consumer) are able to decrypt the data key which is used to decrypt the message. + +You can encrypt a message with more than one key. Any one of the keys used for encrypting the message is sufficient to decrypt the message. + +Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose or delete the private key, your message is irretrievably lost, and is unrecoverable. + +## Producer +![alt text](/assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer") + +## Consumer +![alt text](/assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer") + +## Get started + +1. Create your ECDSA or RSA public and private key pair by using the following commands. + * ECDSA(for Java clients only) + + ```shell + + openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem + openssl ec -in test_ecdsa_privkey.pem -pubout -outform pem -out test_ecdsa_pubkey.pem + + ``` + + * RSA (for C++, Python and Node.js clients) + + ```shell + + openssl genrsa -out test_rsa_privkey.pem 2048 + openssl rsa -in test_rsa_privkey.pem -pubout -outform pkcs8 -out test_rsa_pubkey.pem + + ``` + +2. Add the public and private key to the key management and configure your producers to retrieve public keys and consumers clients to retrieve private keys. + +3. Implement the `CryptoKeyReader` interface, specifically `CryptoKeyReader.getPublicKey()` for producer and `CryptoKeyReader.getPrivateKey()` for consumer, which Pulsar client invokes to load the key. + +4. Add the encryption key name to the producer builder: PulsarClient.newProducer().addEncryptionKey("myapp.key"). + +5. Add CryptoKeyReader implementation to producer or consumer builder: PulsarClient.newProducer().cryptoKeyReader(keyReader) / PulsarClient.newConsumer().cryptoKeyReader(keyReader). + +6. 
Sample producer application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); + +Producer producer = pulsarClient.newProducer() + .topic("persistent://my-tenant/my-ns/my-topic") + .addEncryptionKey("myappkey") + .cryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")) + .create(); + +for (int i = 0; i < 10; i++) { + producer.send("my-message".getBytes()); +} + +producer.close(); +pulsarClient.close(); + +``` + +7. 
Sample Consumer Application: + +```java + +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); +Consumer consumer = pulsarClient.newConsumer() + .topic("persistent://my-tenant/my-ns/my-topic") + .subscriptionName("my-subscriber-name") + .cryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")) + .subscribe(); +Message msg = null; + +for (int i = 0; i < 10; i++) { + msg = consumer.receive(); + // do something + System.out.println("Received: " + new String(msg.getData())); +} + +// Acknowledge the consumption of all messages at once +consumer.acknowledgeCumulative(msg); +consumer.close(); +pulsarClient.close(); + +``` + +## Key rotation +Pulsar generates a new AES data key every 4 hours or after publishing a certain number of messages. A producer fetches the asymmetric public key every 4 hours by calling CryptoKeyReader.getPublicKey() to retrieve the latest version. 
+ +## Enable encryption at the producer application +If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. You can do this in two ways: +1. The consumer application provides you access to their public key, which you add to your producer keys. +2. You grant access to one of the private keys from the pairs that producer uses. + +When producers want to encrypt the messages with multiple keys, producers add all such keys to the config. Consumer can decrypt the message as long as the consumer has access to at least one of the keys. + +If you need to encrypt the messages using 2 keys (`myapp.messagekey1` and `myapp.messagekey2`), refer to the following example. + +```java + +PulsarClient.newProducer().addEncryptionKey("myapp.messagekey1").addEncryptionKey("myapp.messagekey2"); + +``` + +## Decrypt encrypted messages at the consumer application +Consumers require access to one of the private keys to decrypt messages that the producer produces. If you want to receive encrypted messages, create a public and private key pair and give your public key to the producer application to encrypt messages using your public key. + +## Handle failures +* Producer/Consumer loses access to the key + * Producer action fails, indicating the cause of the failure. Application has the option to proceed with sending unencrypted messages in such cases. Call `PulsarClient.newProducer().cryptoFailureAction(ProducerCryptoFailureAction)` to control the producer behavior. The default behavior is to fail the request. + * If consumption fails due to decryption failure or missing keys in consumer, the application has the option to consume the encrypted message or discard it. Call `PulsarClient.newConsumer().cryptoFailureAction(ConsumerCryptoFailureAction)` to control the consumer behavior. The default behavior is to fail the request. 
Application is never able to decrypt the messages if the private key is permanently lost. +* Batch messaging + * If decryption fails and the message contains batch messages, client is not able to retrieve individual messages in the batch, hence message consumption fails even if cryptoFailureAction() is set to `ConsumerCryptoFailureAction.CONSUME`. +* If decryption fails, the message consumption stops and the application notices backlog growth in addition to decryption failure messages in the client log. If the application does not have access to the private key to decrypt the message, the only option is to skip or discard backlogged messages. diff --git a/site2/website/versioned_docs/version-2.9.x/security-extending.md b/site2/website/versioned_docs/version-2.9.x/security-extending.md new file mode 100644 index 0000000000000..e7484453b8beb --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-extending.md @@ -0,0 +1,207 @@ +--- +id: security-extending +title: Extending Authentication and Authorization in Pulsar +sidebar_label: "Extending" +original_id: security-extending +--- + +Pulsar provides a way to use custom authentication and authorization mechanisms. + +## Authentication + +Pulsar supports mutual TLS and Athenz authentication plugins. For how to use these authentication plugins, you can refer to the description in [Security](security-overview.md). + +You can use a custom authentication mechanism by providing the implementation in the form of two plugins. One plugin is for the Client library and the other plugin is for the Pulsar Proxy and/or Pulsar Broker to validate the credentials. + +### Client authentication plugin + +For the client library, you need to implement `org.apache.pulsar.client.api.Authentication`. 
By entering the command below you can pass this class when you create a Pulsar client: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .authentication(new MyAuthentication()) + .build(); + +``` + +You can use 2 interfaces to implement on the client side: + * `Authentication` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/Authentication.html + * `AuthenticationDataProvider` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/AuthenticationDataProvider.html + + +This in turn needs to provide the client credentials in the form of `org.apache.pulsar.client.api.AuthenticationDataProvider`. This leaves the chance to return different kinds of authentication token for different types of connection or by passing a certificate chain to use for TLS. + + +You can find examples for client authentication providers at: + + * Mutual TLS Auth -- https://github.com/apache/pulsar/tree/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth + * Athenz -- https://github.com/apache/pulsar/tree/master/pulsar-client-auth-athenz/src/main/java/org/apache/pulsar/client/impl/auth + +### Proxy/Broker authentication plugin + +On the proxy/broker side, you need to configure the corresponding plugin to validate the credentials that the client sends. The Proxy and Broker can support multiple authentication providers at the same time. 
+ +In `conf/broker.conf` you can choose to specify a list of valid providers: + +```properties + +# Authentication provider name list, which is comma separated list of class names +authenticationProviders= + +``` + +To implement `org.apache.pulsar.broker.authentication.AuthenticationProvider` on one single interface: + +```java + +/** + * Provider of authentication mechanism + */ +public interface AuthenticationProvider extends Closeable { + + /** + * Perform initialization for the authentication provider + * + * @param config + * broker config object + * @throws IOException + * if the initialization fails + */ + void initialize(ServiceConfiguration config) throws IOException; + + /** + * @return the authentication method name supported by this provider + */ + String getAuthMethodName(); + + /** + * Validate the authentication for the given credentials with the specified authentication data + * + * @param authData + * provider specific authentication data + * @return the "role" string for the authenticated connection, if the authentication was successful + * @throws AuthenticationException + * if the credentials are not valid + */ + String authenticate(AuthenticationDataSource authData) throws AuthenticationException; + +} + +``` + +The following is the example for Broker authentication plugins: + + * Mutual TLS -- https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderTls.java + * Athenz -- https://github.com/apache/pulsar/blob/master/pulsar-broker-auth-athenz/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderAthenz.java + +## Authorization + +Authorization is the operation that checks whether a particular "role" or "principal" has permission to perform a certain operation. + +By default, you can use the embedded authorization provider provided by Pulsar. You can also configure a different authorization provider through a plugin. 
+Note that although the Authentication plugin is designed for use in both the Proxy and Broker, +the Authorization plugin is designed only for use on the Broker however the Proxy does perform some simple Authorization checks of Roles if authorization is enabled. + +To provide a custom provider, you need to implement the `org.apache.pulsar.broker.authorization.AuthorizationProvider` interface, put this class in the Pulsar broker classpath and configure the class in `conf/broker.conf`: + + ```properties + + # Authorization provider fully qualified class-name + authorizationProvider=org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider + + ``` + +```java + +/** + * Provider of authorization mechanism + */ +public interface AuthorizationProvider extends Closeable { + + /** + * Perform initialization for the authorization provider + * + * @param conf + * broker config object + * @param configCache + * pulsar zk configuration cache service + * @throws IOException + * if the initialization fails + */ + void initialize(ServiceConfiguration conf, ConfigurationCacheService configCache) throws IOException; + + /** + * Check if the specified role has permission to send messages to the specified fully qualified topic name. + * + * @param topicName + * the fully qualified topic name associated with the topic. + * @param role + * the app id used to send messages to the topic. + */ + CompletableFuture canProduceAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData); + + /** + * Check if the specified role has permission to receive messages from the specified fully qualified topic name. + * + * @param topicName + * the fully qualified topic name associated with the topic. + * @param role + * the app id used to receive messages from the topic. 
+ * @param subscription + * the subscription name defined by the client + */ + CompletableFuture canConsumeAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData, String subscription); + + /** + * Check whether the specified role can perform a lookup for the specified topic. + * + * For that the caller needs to have producer or consumer permission. + * + * @param topicName + * @param role + * @return + * @throws Exception + */ + CompletableFuture canLookupAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData); + + /** + * + * Grant authorization-action permission on a namespace to the given client + * + * @param namespace + * @param actions + * @param role + * @param authDataJson + * additional authdata in json format + * @return CompletableFuture + * @completesWith
    + * IllegalArgumentException when namespace not found
    + * IllegalStateException when failed to grant permission + */ + CompletableFuture grantPermissionAsync(NamespaceName namespace, Set actions, String role, + String authDataJson); + + /** + * Grant authorization-action permission on a topic to the given client + * + * @param topicName + * @param role + * @param authDataJson + * additional authdata in json format + * @return CompletableFuture + * @completesWith
    + * IllegalArgumentException when namespace not found
    + * IllegalStateException when failed to grant permission + */ + CompletableFuture grantPermissionAsync(TopicName topicName, Set actions, String role, + String authDataJson); + +} + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/security-jwt.md b/site2/website/versioned_docs/version-2.9.x/security-jwt.md new file mode 100644 index 0000000000000..1fa65b7c27f60 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-jwt.md @@ -0,0 +1,331 @@ +--- +id: security-jwt +title: Client authentication using tokens based on JSON Web Tokens +sidebar_label: "Authentication using JWT" +original_id: security-jwt +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +## Token authentication overview + +Pulsar supports authenticating clients using security tokens that are based on [JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). + +You can use tokens to identify a Pulsar client and associate with some "principal" (or "role") that +is permitted to do some actions (eg: publish to a topic or consume from a topic). + +A user typically gets a token string from the administrator (or some automated service). + +The compact representation of a signed JWT is a string that looks like as the following: + +``` + +eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +Application specifies the token when you create the client instance. An alternative is to pass a "token supplier" (a function that returns the token when the client library needs one). + +> #### Always use TLS transport encryption +> Sending a token is equivalent to sending a password over the wire. You had better use TLS encryption all the time when you connect to the Pulsar service. See +> [Transport Encryption using TLS](security-tls-transport.md) for more details. 
+ +### CLI Tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use the token authentication with CLI tools of Pulsar: + +```properties + +webServiceUrl=http://broker.example.com:8080/ +brokerServiceUrl=pulsar://broker.example.com:6650/ +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +authParams=token:eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +The token string can also be read from a file, for example: + +``` + +authParams=file:///path/to/token/file + +``` + +### Pulsar client + +You can use tokens to authenticate the following Pulsar clients. + +````mdx-code-block + + + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")) + .build(); + +``` + +Similarly, you can also pass a `Supplier`: + +```java + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token(() -> { + // Read token from custom source + return readToken(); + })) + .build(); + +``` + + + + +```python + +from pulsar import Client, AuthenticationToken + +client = Client('pulsar://broker.example.com:6650/' + authentication=AuthenticationToken('eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY')) + +``` + +Alternatively, you can also pass a `Supplier`: + +```python + +def read_token(): + with open('/path/to/token.txt') as tf: + return tf.read().strip() + +client = Client('pulsar://broker.example.com:6650/' + 
authentication=AuthenticationToken(read_token)) + +``` + + + + +```go + +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication: NewAuthenticationToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY"), +}) + +``` + +Similarly, you can also pass a `Supplier`: + +```go + +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication: NewAuthenticationTokenSupplier(func () string { + // Read token from custom source + return readToken() + }), +}) + +``` + + + + +```c++ + +#include + +pulsar::ClientConfiguration config; +config.setAuth(pulsar::AuthToken::createWithToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")); + +pulsar::Client client("pulsar://broker.example.com:6650/", config); + +``` + + + + +```c# + +var client = PulsarClient.Builder() + .AuthenticateUsingToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY") + .Build(); + +``` + + + + +```` + +## Enable token authentication + +On how to enable token authentication on a Pulsar cluster, you can refer to the guide below. + +JWT supports two different kinds of keys in order to generate and validate the tokens: + + * Symmetric : + - You can use a single ***Secret*** key to generate and validate tokens. + * Asymmetric: A pair of keys consists of the Private key and the Public key. + - You can use ***Private*** key to generate tokens. + - You can use ***Public*** key to validate tokens. + +### Create a secret key + +When you use a secret key, the administrator creates the key and uses the key to generate the client tokens. You can also configure this key to brokers in order to validate the clients. + +Output file is generated in the root of your Pulsar installation directory. You can also provide absolute path for the output file using the command below. 
+ +```shell + +$ bin/pulsar tokens create-secret-key --output my-secret.key + +``` + +Enter this command to generate a base64-encoded secret key. + +```shell + +$ bin/pulsar tokens create-secret-key --output /opt/my-secret.key --base64 + +``` + +### Create a key pair + +With Public and Private keys, you need to create a pair of keys. Pulsar supports all algorithms that the Java JWT library (shown [here](https://github.com/jwtk/jjwt#signature-algorithms-keys)) supports. + +Output file is generated in the root of your Pulsar installation directory. You can also provide absolute path for the output file using the command below. + +```shell + +$ bin/pulsar tokens create-key-pair --output-private-key my-private.key --output-public-key my-public.key + +``` + + * Store `my-private.key` in a safe location and only administrator can use `my-private.key` to generate new tokens. + * `my-public.key` is distributed to all Pulsar brokers. You can publicly share this file without any security concern. + +### Generate tokens + +A token is the credential associated with a user. The association is done through the "principal" or "role". In the case of JWT tokens, this field is typically referred to as **subject**, though they are exactly the same concept. + +Then, use the following command to generate a token. The generated token is required to have a **subject** field set. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user + +``` + +This command prints the token string on stdout. + +Similarly, you can create a token by passing the "private" key using the command below: + +```shell + +$ bin/pulsar tokens create --private-key file:///path/to/my-private.key \ + --subject test-user + +``` + +Finally, you can enter the following command to create a token with a pre-defined TTL. After the TTL expires, the token is automatically invalidated.
+ +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user \ + --expiry-time 1y + +``` + +### Authorization + +The token itself does not have any permission associated. The authorization engine determines whether the token should have permissions or not. Once you have created the token, you can grant permission for this token to do certain actions. The following is an example. + +```shell + +$ bin/pulsar-admin namespaces grant-permission my-tenant/my-namespace \ + --role test-user \ + --actions produce,consume + +``` + +### Enable token authentication on Brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`: + +```properties + +# Configuration to enable authentication and authorization +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Either configure the token string or specify to read it from a file. The following three available formats are all valid: +# brokerClientAuthenticationParameters={"token":"your-token-string"} +# brokerClientAuthenticationParameters=token:your-token-string +# brokerClientAuthenticationParameters=file:///path/to/token +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem + +# If this flag is set then the broker authenticates the original Auth data +# else it just accepts the originalPrincipal and authorizes it (if required). 
+authenticateOriginalAuthData=true + +# If using secret key (Note: key files must be DER-encoded) +tokenSecretKey=file:///path/to/secret.key +# The key can also be passed inline: +# tokenSecretKey=data:;base64,FLFyW0oLJ2Fi22KKCm21J18mbAdztfSHN/lAT5ucEKU= + +# If using public/private (Note: key files must be DER-encoded) +# tokenPublicKey=file:///path/to/public.key + +``` + +### Enable token authentication on Proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`: + +The proxy uses its own token when connecting to brokers. You need to configure the role token for this key pair in the `proxyRoles` of the brokers. For more details, see the [authorization guide](security-authorization.md). + +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken +tokenSecretKey=file:///path/to/secret.key + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Either configure the token string or specify to read it from a file. The following three available formats are all valid: +# brokerClientAuthenticationParameters={"token":"your-token-string"} +# brokerClientAuthenticationParameters=token:your-token-string +# brokerClientAuthenticationParameters=file:///path/to/token + +# Whether client authorization credentials are forwarded to the broker for re-authorization. +# Authentication must be enabled via authenticationEnabled=true for this to take effect. 
+forwardAuthorizationCredentials=true + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/security-kerberos.md b/site2/website/versioned_docs/version-2.9.x/security-kerberos.md new file mode 100644 index 0000000000000..c49fa3bea1fce --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-kerberos.md @@ -0,0 +1,443 @@ +--- +id: security-kerberos +title: Authentication using Kerberos +sidebar_label: "Authentication using Kerberos" +original_id: security-kerberos +--- + +[Kerberos](https://web.mit.edu/kerberos/) is a network authentication protocol. By using secret-key cryptography, [Kerberos](https://web.mit.edu/kerberos/) is designed to provide strong authentication for client applications and server applications. + +In Pulsar, you can use Kerberos with [SASL](https://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer) as a choice for authentication. And Pulsar uses the [Java Authentication and Authorization Service (JAAS)](https://en.wikipedia.org/wiki/Java_Authentication_and_Authorization_Service) for SASL configuration. You need to provide JAAS configurations for Kerberos authentication. + +This document introduces how to configure `Kerberos` with `SASL` between Pulsar clients and brokers and how to configure Kerberos for Pulsar proxy in detail. + +## Configuration for Kerberos between Client and Broker + +### Prerequisites + +To begin, you need to set up (or already have) a [Key Distribution Center(KDC)](https://en.wikipedia.org/wiki/Key_distribution_center). Also you need to configure and run the [Key Distribution Center(KDC)](https://en.wikipedia.org/wiki/Key_distribution_center)in advance. + +If your organization already uses a Kerberos server (for example, by using `Active Directory`), you do not have to install a new server for Pulsar. If your organization does not use a Kerberos server, you need to install one. Your Linux vendor might have packages for `Kerberos`. 
On how to install and configure Kerberos, refer to [Ubuntu](https://help.ubuntu.com/community/Kerberos), +[Redhat](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Managing_Smart_Cards/installing-kerberos.html). + +Note that if you use Oracle Java, you need to download JCE policy files for your Java version and copy them to the `$JAVA_HOME/jre/lib/security` directory. + +#### Kerberos principals + +If you use the existing Kerberos system, ask your Kerberos administrator for a principal for each broker in your cluster and for every operating system user that accesses Pulsar with Kerberos authentication (via clients and tools). + +If you have installed your own Kerberos system, you can create these principals with the following commands: + +```shell + +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" + +``` + +Note that *Kerberos* requires that all your hosts can be resolved with their FQDNs. + +The first part of Broker principal (for example, `broker` in `broker/{hostname}@{REALM}`) is the `serverType` of each host. The suggested values of `serverType` are `broker` (host machine runs service Pulsar Broker) and `proxy` (host machine runs service Pulsar Proxy). + +#### Configure how to connect to KDC + +You need to enter the command below to specify the path to the `krb5.conf` file for the client side and the broker side. The content of `krb5.conf` file indicates the default Realm and KDC information.
See [JDK’s Kerberos Requirements](https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/KerberosReq.html) for more details. + +```shell + +-Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +Here is an example of the krb5.conf file: + +In the configuration file, `EXAMPLE.COM` is the default realm; `kdc = localhost:62037` is the kdc server url for realm `EXAMPLE.COM`: + +``` + +[libdefaults] + default_realm = EXAMPLE.COM + +[realms] + EXAMPLE.COM = { + kdc = localhost:62037 + } + +``` + +Usually machines configured with kerberos already have a system wide configuration and this configuration is optional. + +#### JAAS configuration file + +You need a JAAS configuration file for the client side and the broker side. JAAS configuration file provides the section of information that is used to connect KDC. Here is an example named `pulsar_jaas.conf`: + +``` + + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; + +``` + +You need to set the `JAAS` configuration file path as JVM parameter for client and broker. For example: + +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf + +``` + +In the `pulsar_jaas.conf` file above + +1. `PulsarBroker` is a section name in the JAAS file that each broker uses. This section tells the broker to use which principal inside Kerberos and the location of the keytab where the principal is stored. `PulsarBroker` allows the broker to use the keytab specified in this section. +2. `PulsarClient` is a section name in the JAAS file that each broker uses.
This section tells the client to use which principal inside Kerberos and the location of the keytab where the principal is stored. `PulsarClient` allows the client to use the keytab specified in this section. + The following example also reuses this `PulsarClient` section in both the Pulsar internal admin configuration and in CLI command of `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`. You can also add different sections for different use cases. + +You can have 2 separate JAAS configuration files: +* the file for a broker that has sections of both `PulsarBroker` and `PulsarClient`; +* the file for a client that only has a `PulsarClient` section. + + +### Kerberos configuration for Brokers + +#### Configure the `broker.conf` file + + In the `broker.conf` file, set Kerberos related configurations. + + - Set `authenticationEnabled` to `true`; + - Set `authenticationProviders` to choose `AuthenticationProviderSasl`; + - Set `saslJaasClientAllowedIds` regex for principal that is allowed to connect to broker; + - Set `saslJaasBrokerSectionName` that corresponds to the section in JAAS configuration file for broker; + + To make Pulsar internal admin client work properly, you need to set the configuration in the `broker.conf` file as below: + - Set `brokerClientAuthenticationPlugin` to client plugin `AuthenticationSasl`; + - Set `brokerClientAuthenticationParameters` to value in JSON string `{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}`, in which `PulsarClient` is the section name in the `pulsar_jaas.conf` file, and `"serverType":"broker"` indicates that the internal admin client connects to a Pulsar Broker; + + Here is an example: + +``` + +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker + +## Authentication settings of the broker itself. 
Used when the broker connects to other brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} + +``` + +#### Set Broker JVM parameter + + Set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. + +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_env.sh) + +You must ensure that the operating system user who starts broker can reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. + +### Kerberos configuration for clients + +#### Java Client and Java Admin Client + +In client application, include `pulsar-client-auth-sasl` in your project dependency. + +``` + + + org.apache.pulsar + pulsar-client-auth-sasl + ${pulsar.version} + + +``` + +Configure the authentication type to use `AuthenticationSasl`, and also provide the authentication parameters to it. + +You need 2 parameters: +- `saslJaasClientSectionName`. This parameter corresponds to the section in JAAS configuration file for client; +- `serverType`. This parameter stands for whether this client connects to broker or proxy. And client uses this parameter to know which server side principal should be used. + +When you authenticate between client and broker with the setting in above JAAS configuration file, we need to set `saslJaasClientSectionName` to `PulsarClient` and set `serverType` to `broker`. 
+ +The following is an example of creating a Java client: + + ```java + + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "broker"); + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + + ``` + +> The first two lines in the example above are hard coded, alternatively, you can set additional JVM parameters for JAAS and krb5 configuration file when you run the application like below: + +``` + +java -cp -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $APP-jar-with-dependencies.jar $CLASSNAME + +``` + +You must ensure that the operating system user who starts pulsar client can reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. + +#### Configure CLI tools + +If you use a command-line tool (such as `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`), you need to perform the following steps: + +Step 1. Enter the command below to configure your `client.conf`. + +```shell + +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +authParams={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} + +``` + +Step 2. Enter the command below to set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. 
+ +```shell + + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf + +``` + +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_tools_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_tools_env.sh), +or add this line `OPTS="$OPTS -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf "` directly to the CLI tool script. + +The meaning of configurations is the same as the meaning of configurations in Java client section. + +## Kerberos configuration for working with Pulsar Proxy + +With the above configuration, client and broker can do authentication using Kerberos. + +A client that connects to Pulsar Proxy is a little different. Pulsar Proxy (as a SASL Server in Kerberos) authenticates Client (as a SASL client in Kerberos) first; and then Pulsar broker authenticates Pulsar Proxy. + +Now in comparison with the above configuration between client and broker, we show you how to configure Pulsar Proxy as follows. + +### Create principal for Pulsar Proxy in Kerberos + +You need to add new principals for Pulsar Proxy comparing with the above configuration. If you already have principals for client and broker, you only need to add the proxy principal here. 
+ +```shell + +### add Principals for Pulsar Proxy +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey proxy/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{proxy-keytabname}.keytab proxy/{hostname}@{REALM}" +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" + +``` + +### Add a section in JAAS configuration file for Pulsar Proxy + +In comparison with the above configuration, add a new section for Pulsar Proxy in JAAS configuration file. + +Here is an example named `pulsar_jaas.conf`: + +``` + + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarProxy { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarproxy.keytab" + principal="proxy/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; + +``` + +### Proxy client configuration + +Pulsar client configuration is similar with client and broker configuration, except that you need to set `serverType` to `proxy` instead of `broker`, for the reason that you need to do the Kerberos authentication between client and proxy. 
+ + ```java + + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map<String, String> authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "proxy"); // ** this is the difference ** + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + + ``` + +> The first two lines in the example above are hard coded. Alternatively, you can set additional JVM parameters for the JAAS and krb5 configuration files when you run the application, as shown below: + +``` + +java -cp $APP-jar-with-dependencies.jar -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $CLASSNAME + +``` + +### Kerberos configuration for Pulsar proxy service + +In the `proxy.conf` file, set Kerberos related configuration. Here is an example: + +```shell + +## related to authenticate client. +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarProxy + +## related to be authenticated by broker +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarProxy", "serverType":"broker"} +forwardAuthorizationCredentials=true + +``` + +The first part relates to authenticating between client and Pulsar Proxy. In this phase, client works as SASL client, while Pulsar Proxy works as SASL server. + +The second part relates to authenticating between Pulsar Proxy and Pulsar Broker. 
In this phase, Pulsar Proxy works as SASL client, while Pulsar Broker works as SASL server. + +### Broker side configuration. + +The broker side configuration file is the same with the above `broker.conf`, you do not need special configuration for Pulsar Proxy. + +``` + +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker + +``` + +## Regarding authorization and role token + +For Kerberos authentication, we usually use the authenticated principal as the role token for Pulsar authorization. For more information of authorization in Pulsar, see [security authorization](security-authorization.md). + +If you enable 'authorizationEnabled', you need to set `superUserRoles` in `broker.conf` that corresponds to the name registered in kdc. + +For example: + +```bash + +superUserRoles=client/{clientIp}@EXAMPLE.COM + +``` + +## Regarding authentication between ZooKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Zookeeper. According to [ZooKeeper document](https://cwiki.apache.org/confluence/display/ZOOKEEPER/Client-Server+mutual+authentication), you need these settings in `conf/zookeeper.conf`: + +``` + +authProvider.1=org.apache.zookeeper.server.auth.SASLAuthenticationProvider +requireClientAuthScheme=sasl + +``` + +Enter the following commands to add a section of `Client` configurations in the file `pulsar_jaas.conf`, which Pulsar Broker uses: + +``` + + Client { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with ZooKeeper. 
+ +## Regarding authentication between BookKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Bookie. According to [BookKeeper document](http://bookkeeper.apache.org/docs/latest/security/sasl/), you need to add `bookkeeperClientAuthenticationPlugin` parameter in `broker.conf`: + +``` + +bookkeeperClientAuthenticationPlugin=org.apache.bookkeeper.sasl.SASLClientProviderFactory + +``` + +In this setting, `SASLClientProviderFactory` creates a BookKeeper SASL client in a Broker, and the Broker uses the created SASL client to authenticate with a Bookie node. + +Enter the following commands to add a section of `BookKeeper` configurations in the `pulsar_jaas.conf` that Pulsar Broker uses: + +``` + + BookKeeper { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with Bookie. diff --git a/site2/website/versioned_docs/version-2.9.x/security-oauth2.md b/site2/website/versioned_docs/version-2.9.x/security-oauth2.md new file mode 100644 index 0000000000000..1d69f561ccb08 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-oauth2.md @@ -0,0 +1,232 @@ +--- +id: security-oauth2 +title: Client authentication using OAuth 2.0 access tokens +sidebar_label: "Authentication using OAuth 2.0 access tokens" +original_id: security-oauth2 +--- + +Pulsar supports authenticating clients using OAuth 2.0 access tokens. You can use OAuth 2.0 access tokens to identify a Pulsar client and associate the Pulsar client with some "principal" (or "role"), which is permitted to do some actions, such as publishing messages to a topic or consume messages from a topic. + +This module is used to support the Pulsar client authentication plugin for OAuth 2.0. 
After communicating with the OAuth 2.0 server, the Pulsar client gets an `access token` from the OAuth 2.0 server, and passes this `access token` to the Pulsar broker to do the authentication. The broker can use the `org.apache.pulsar.broker.authentication.AuthenticationProviderToken`. Or, you can add your own `AuthenticationProvider` to work with this module. + +## Authentication provider configuration + +This library allows you to authenticate the Pulsar client by using an access token that is obtained from an OAuth 2.0 authorization service, which acts as a _token issuer_. + +### Authentication types + +The authentication type determines how to obtain an access token through an OAuth 2.0 authorization flow. + +:::note + +Currently, the Pulsar Java client only supports the `client_credentials` authentication type. + +::: + +#### Client credentials + +The following table lists parameters supported for the `client credentials` authentication type. + +| Parameter | Description | Example | Required or not | +| --- | --- | --- | --- | +| `type` | OAuth 2.0 authentication type. | `client_credentials` (default) | Optional | +| `issuerUrl` | URL of the authentication provider which allows the Pulsar client to obtain an access token | `https://accounts.google.com` | Required | +| `privateKey` | URL to a JSON credentials file | Support the following pattern formats:
  • `file:///path/to/file`
  • `file:/path/to/file`
  • `data:application/json;base64,`
  • | Required | +| `audience` | An OAuth 2.0 "resource server" identifier for the Pulsar cluster | `https://broker.example.com` | Optional | + +The credentials file contains service account credentials used with the client authentication type. The following shows an example of a credentials file `credentials_file.json`. + +```json + +{ + "type": "client_credentials", + "client_id": "d9ZyX97q1ef8Cr81WHVC4hFQ64vSlDK3", + "client_secret": "on1uJ...k6F6R", + "client_email": "1234567890-abcdefghijklmnopqrstuvwxyz@developer.gserviceaccount.com", + "issuer_url": "https://accounts.google.com" +} + +``` + +In the above example, the authentication type is set to `client_credentials` by default. And the fields "client_id" and "client_secret" are required. + +### Typical original OAuth2 request mapping + +The following shows a typical original OAuth2 request, which is used to obtain the access token from the OAuth2 server. + +```bash + +curl --request POST \ + --url https://dev-kt-aa9ne.us.auth0.com/oauth/token \ + --header 'content-type: application/json' \ + --data '{ + "client_id":"Xd23RHsUnvUlP7wchjNYOaIfazgeHd9x", + "client_secret":"rT7ps7WY8uhdVuBTKWZkttwLdQotmdEliaM5rLfmgNibvqziZ-g07ZH52N_poGAb", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "grant_type":"client_credentials"}' + +``` + +In the above example, the mapping relationship is shown below. + +- The `issuerUrl` parameter in this plugin is mapped to `--url https://dev-kt-aa9ne.us.auth0.com`. +- The `privateKey` file parameter in this plugin should contain at least the `client_id` and `client_secret` fields. +- The `audience` parameter in this plugin is mapped to `"audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"`. This field is optional and only used by some identity providers since 2.9.1 and later versions. + +## Client Configuration + +You can use the OAuth2 authentication provider with the following Pulsar clients. 
+ +### Java + +You can use the factory method to configure authentication for Pulsar Java client. + +```java + +import org.apache.pulsar.client.impl.auth.oauth2.AuthenticationFactoryOAuth2; + +URL issuerUrl = new URL("https://dev-kt-aa9ne.us.auth0.com"); +URL credentialsUrl = new URL("file:///path/to/KeyFile.json"); +String audience = "https://dev-kt-aa9ne.us.auth0.com/api/v2/"; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactoryOAuth2.clientCredentials(issuerUrl, credentialsUrl, audience)) + .build(); + +``` + +In addition, you can also use the encoded parameters to configure authentication for Pulsar Java client. + +```java + +Authentication auth = AuthenticationFactory + .create(AuthenticationOAuth2.class.getName(), "{\"type\":\"client_credentials\",\"privateKey\":\"./key/path/..\",\"issuerUrl\":\"...\",\"audience\":\"...\"}"); +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication(auth) + .build(); + +``` + +### C++ client + +The C++ client is similar to the Java client. You need to provide the `issuer_url`, `private_key` (the credentials file path), and `audience` parameters. + +```c++ + +#include <pulsar/Client.h> + +pulsar::ClientConfiguration config; +std::string params = R"({ + "issuer_url": "https://dev-kt-aa9ne.us.auth0.com", + "private_key": "../../pulsar-broker/src/test/resources/authentication/token/cpp_credentials_file.json", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/"})"; + +config.setAuth(pulsar::AuthOauth2::create(params)); + +pulsar::Client client("pulsar://broker.example.com:6650/", config); + +``` + +### Go client + +To enable OAuth2 authentication in Go client, you need to configure OAuth2 authentication. +This example shows how to configure OAuth2 authentication in Go client. 
+ +```go + +oauth := pulsar.NewAuthenticationOAuth2(map[string]string{ + "type": "client_credentials", + "issuerUrl": "https://dev-kt-aa9ne.us.auth0.com", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/", + "privateKey": "/path/to/privateKey", + "clientId": "0Xx...Yyxeny", + }) +client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://my-cluster:6650", + Authentication: oauth, +}) + +``` + +### Python client + +To enable OAuth2 authentication in Python client, you need to configure OAuth2 authentication. +This example shows how to configure OAuth2 authentication in Python client. + +```python + +from pulsar import Client, AuthenticationOauth2 + +params = ''' +{ + "issuer_url": "https://dev-kt-aa9ne.us.auth0.com", + "private_key": "/path/to/privateKey", + "audience": "https://dev-kt-aa9ne.us.auth0.com/api/v2/" +} +''' + +client = Client("pulsar://my-cluster:6650", authentication=AuthenticationOauth2(params)) + +``` + +## CLI configuration + +This section describes how to use Pulsar CLI tools to connect a cluster through OAuth2 authentication plugin. + +### pulsar-admin + +This example shows how to use pulsar-admin to connect to a cluster through OAuth2 authentication plugin. + +```shell script + +bin/pulsar-admin --admin-url https://streamnative.cloud:443 \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +tenants list + +``` + +Set the `admin-url` parameter to the Web service URL. A Web service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types). 
+ +### pulsar-client + +This example shows how to use pulsar-client to connect to a cluster through OAuth2 authentication plugin. + +```shell script + +bin/pulsar-client \ +--url SERVICE_URL \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +produce test-topic -m "test-message" -n 10 + +``` + +Set the `url` parameter to the broker service URL. A broker service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types). + +### pulsar-perf + +This example shows how to use pulsar-perf to connect to a cluster through OAuth2 authentication plugin. + +```shell script + +bin/pulsar-perf produce --service-url pulsar+ssl://streamnative.cloud:6651 \ +--auth-plugin org.apache.pulsar.client.impl.auth.oauth2.AuthenticationOAuth2 \ +--auth-params '{"privateKey":"file:///path/to/key/file.json", + "issuerUrl":"https://dev-kt-aa9ne.us.auth0.com", + "audience":"https://dev-kt-aa9ne.us.auth0.com/api/v2/"}' \ +-r 1000 -s 1024 test-topic + +``` + +Set the `service-url` parameter to the broker service URL. A broker service URL is a combination of the protocol, hostname and port ID, such as `pulsar://localhost:6650`. +Set the `privateKey`, `issuerUrl`, and `audience` parameters to the values based on the configuration in the key file. For details, see [authentication types](#authentication-types). 
diff --git a/site2/website/versioned_docs/version-2.9.x/security-overview.md b/site2/website/versioned_docs/version-2.9.x/security-overview.md new file mode 100644 index 0000000000000..227dd5b5d4fc3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-overview.md @@ -0,0 +1,36 @@ +--- +id: security-overview +title: Pulsar security overview +sidebar_label: "Overview" +original_id: security-overview +--- + +As the central message bus for a business, Apache Pulsar is frequently used for storing mission-critical data. Therefore, enabling security features in Pulsar is crucial. + +By default, Pulsar configures no encryption, authentication, or authorization. Any client can communicate to Apache Pulsar via plain text service URLs. So we must ensure that Pulsar accessing via these plain text service URLs is restricted to trusted clients only. In such cases, you can use Network segmentation and/or authorization ACLs to restrict access to trusted IPs. If you use neither, the state of cluster is wide open and anyone can access the cluster. + +Pulsar supports a pluggable authentication mechanism. And Pulsar clients use this mechanism to authenticate with brokers and proxies. You can also configure Pulsar to support multiple authentication sources. + +The Pulsar broker validates the authentication credentials when a connection is established. After the initial connection is authenticated, the "principal" token is stored for authorization though the connection is not re-authenticated. The broker periodically checks the expiration status of every `ServerCnx` object. You can set the `authenticationRefreshCheckSeconds` on the broker to control the frequency to check the expiration status. By default, the `authenticationRefreshCheckSeconds` is set to 60s. When the authentication is expired, the broker forces to re-authenticate the connection. If the re-authentication fails, the broker disconnects the client. 
+ +The broker supports learning whether a particular client supports authentication refreshing. If a client supports authentication refreshing and the credential is expired, the authentication provider calls the `refreshAuthentication` method to initiate the refreshing process. If a client does not support authentication refreshing and the credential is expired, the broker disconnects the client. + +You should secure the service components in your Apache Pulsar deployment. + +## Role tokens + +In Pulsar, a *role* is a string, like `admin` or `app1`, which can represent a single client or multiple clients. You can use roles to control permission for clients to produce or consume from certain topics, administer the configuration for tenants, and so on. + +Apache Pulsar uses an [Authentication Provider](#authentication-providers) to establish the identity of a client and then assign a *role token* to that client. This role token is then used for [Authorization and ACLs](security-authorization.md) to determine what the client is authorized to do. 
+ +## Authentication providers + +Currently Pulsar supports the following authentication providers: + +- [TLS Authentication](security-tls-authentication.md) +- [Athenz](security-athenz.md) +- [Kerberos](security-kerberos.md) +- [JSON Web Token Authentication](security-jwt.md) +- [OAuth 2.0 authentication](security-oauth2.md) +- [HTTP basic authentication](security-basic-auth.md) + diff --git a/site2/website/versioned_docs/version-2.9.x/security-tls-authentication.md b/site2/website/versioned_docs/version-2.9.x/security-tls-authentication.md new file mode 100644 index 0000000000000..85d2240f41306 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-tls-authentication.md @@ -0,0 +1,222 @@ +--- +id: security-tls-authentication +title: Authentication using TLS +sidebar_label: "Authentication using TLS" +original_id: security-tls-authentication +--- + +## TLS authentication overview + +TLS authentication is an extension of [TLS transport encryption](security-tls-transport.md). Not only servers have keys and certs that the client uses to verify the identity of servers, clients also have keys and certs that the server uses to verify the identity of clients. You must have TLS transport encryption configured on your cluster before you can use TLS authentication. This guide assumes you already have TLS transport encryption configured. + +`Bouncy Castle Provider` provides TLS related cipher suites and algorithms in Pulsar. If you need [FIPS](https://www.bouncycastle.org/fips_faq.html) version of `Bouncy Castle Provider`, please reference [Bouncy Castle page](security-bouncy-castle.md). + +### Create client certificates + +Client certificates are generated using the certificate authority. Server certificates are also generated with the same certificate authority. + +The biggest difference between client certs and server certs is that the **common name** for the client certificate is the **role token** which that client is authenticated as. 
+ +To use client certificates, you need to set `tlsRequireTrustedClientCertOnConnect=true` at the broker side. For details, refer to [TLS broker configuration](security-tls-transport.md#configure-broker). + +First, you need to enter the following command to generate the key: + +```bash + +$ openssl genrsa -out admin.key.pem 2048 + +``` + +Similar to the broker, the client expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so you need to convert it by entering the following command: + +```bash + +$ openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in admin.key.pem -out admin.key-pk8.pem -nocrypt + +``` + +Next, enter the command below to generate the certificate request. When you are asked for a **common name**, enter the **role token** that you want this key pair to authenticate a client as. + +```bash + +$ openssl req -config openssl.cnf \ + -key admin.key.pem -new -sha256 -out admin.csr.pem + +``` + +:::note + +If openssl.cnf is not specified, read [Certificate authority](http://pulsar.apache.org/docs/en/security-tls-transport/#certificate-authority) to get the openssl.cnf. + +::: + +Then, enter the command below to sign the request with the certificate authority. Note that the client cert uses the **usr_cert** extension, which allows the cert to be used for client authentication. + +```bash + +$ openssl ca -config openssl.cnf -extensions usr_cert \ + -days 1000 -notext -md sha256 \ + -in admin.csr.pem -out admin.cert.pem + +``` + +You can get a cert, `admin.cert.pem`, and a key, `admin.key-pk8.pem` from this command. With `ca.cert.pem`, clients can use this cert and this key to authenticate themselves to brokers and proxies as the role token ``admin``. + +:::note + +If the "unable to load CA private key" error occurs and the reason of this error is "No such file or directory: /etc/pki/CA/private/cakey.pem" in this step. 
Try the command below: + +```bash + +$ cd /etc/pki/tls/misc/CA +$ ./CA -newca + +``` + +to generate `cakey.pem` . + +::: + +## Enable TLS authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#broker-configuration): + +```properties + +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# operations and publish/consume from all topics +superUserRoles=admin + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters={"tlsCertFile":"/path/my-ca/admin.cert.pem","tlsKeyFile":"/path/my-ca/admin.key-pk8.pem"} +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem + +``` + +## Enable TLS authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#proxy-configuration): + +The proxy should have its own client key pair for connecting to brokers. You need to configure the role token for this key pair in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. 
+ +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters=tlsCertFile:/path/to/proxy.cert.pem,tlsKeyFile:/path/to/proxy.key-pk8.pem + +``` + +## Client configuration + +When you use TLS authentication, client connects via TLS transport. You need to configure the client to use ```https://``` and 8443 port for the web service URL, ```pulsar+ssl://``` and 6651 port for the broker service URL. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use TLS authentication with the CLI tools of Pulsar: + +```properties + +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +authParams=tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem + +``` + +### Java client + +```java + +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .authentication("org.apache.pulsar.client.impl.auth.AuthenticationTls", + "tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem") + .build(); + +``` + +### Python client + +```python + +from pulsar import Client, AuthenticationTLS + +auth = 
AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) + +``` + +### C++ client + +```c++ + +#include <pulsar/Client.h> + +pulsar::ClientConfiguration config; +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/ca.cert.pem"); +config.setTlsAllowInsecureConnection(false); + +pulsar::AuthenticationPtr auth = pulsar::AuthTls::create("/path/to/my-role.cert.pem", + "/path/to/my-role.key-pk8.pem"); +config.setAuth(auth); + +pulsar::Client client("pulsar+ssl://broker.example.com:6651/", config); + +``` + +### Node.js client + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const auth = new Pulsar.AuthenticationTls({ + certificatePath: '/path/to/my-role.cert.pem', + privateKeyPath: '/path/to/my-role.key-pk8.pem', + }); + + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + authentication: auth, + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + }); +})(); + +``` + +### C# client + +```c# + +var clientCertificate = new X509Certificate2("admin.pfx"); +var client = PulsarClient.Builder() + .AuthenticateUsingClientCertificate(clientCertificate) + .Build(); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/security-tls-keystore.md b/site2/website/versioned_docs/version-2.9.x/security-tls-keystore.md new file mode 100644 index 0000000000000..0b3b50fcebb10 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-tls-keystore.md @@ -0,0 +1,342 @@ +--- +id: security-tls-keystore +title: Using TLS with KeyStore configure +sidebar_label: "Using TLS with KeyStore configure" +original_id: security-tls-keystore +--- + +## Overview + +Apache Pulsar supports [TLS encryption](security-tls-transport.md) and [TLS authentication](security-tls-authentication.md) between clients and Apache Pulsar service. 
+By default it uses PEM format file configuration. This page describes how to use [KeyStore](https://en.wikipedia.org/wiki/Java_KeyStore) type configuration for TLS. + + +## TLS encryption with KeyStore configure + +### Generate TLS key and certificate + +The first step of deploying TLS is to generate the key and the certificate for each machine in the cluster. +You can use Java’s `keytool` utility to accomplish this task. We will generate the key into a temporary keystore +initially for broker, so that we can export and sign it later with CA. + +```shell + +keytool -keystore broker.keystore.jks -alias localhost -validity {validity} -genkeypair -keyalg RSA + +``` + +You need to specify two parameters in the above command: + +1. `keystore`: the keystore file that stores the certificate. The *keystore* file contains the private key of + the certificate; hence, it needs to be kept safely. +2. `validity`: the valid time of the certificate in days. + +> Ensure that common name (CN) matches exactly with the fully qualified domain name (FQDN) of the server. +The client compares the CN with the DNS domain name to ensure that it is indeed connecting to the desired server, not a malicious one. + +### Creating your own CA + +After the first step, each broker in the cluster has a public-private key pair, and a certificate to identify the machine. +The certificate, however, is unsigned, which means that an attacker can create such a certificate to pretend to be any machine. + +Therefore, it is important to prevent forged certificates by signing them for each machine in the cluster. +A `certificate authority (CA)` is responsible for signing certificates. A CA works like a government that issues passports — +the government stamps (signs) each passport so that the passport becomes difficult to forge. Other governments verify the stamps +to ensure the passport is authentic. 
Similarly, the CA signs the certificates, and the cryptography guarantees that a signed +certificate is computationally difficult to forge. Thus, as long as the CA is a genuine and trusted authority, the clients have +high assurance that they are connecting to the authentic machines. + +```shell + +openssl req -new -x509 -keyout ca-key -out ca-cert -days 365 + +``` + +The generated CA is simply a *public-private* key pair and certificate, and it is intended to sign other certificates. + +The next step is to add the generated CA to the clients' truststore so that the clients can trust this CA: + +```shell + +keytool -keystore client.truststore.jks -alias CARoot -import -file ca-cert + +``` + +NOTE: If you configure the brokers to require client authentication by setting `tlsRequireTrustedClientCertOnConnect` to `true` on the +broker configuration, then you must also provide a truststore for the brokers and it should have all the CA certificates that clients keys were signed by. + +```shell + +keytool -keystore broker.truststore.jks -alias CARoot -import -file ca-cert + +``` + +In contrast to the keystore, which stores each machine’s own identity, the truststore of a client stores all the certificates +that the client should trust. Importing a certificate into one’s truststore also means trusting all certificates that are signed +by that certificate. As the analogy above, trusting the government (CA) also means trusting all passports (certificates) that +it has issued. This attribute is called the chain of trust, and it is particularly useful when deploying TLS on a large BookKeeper cluster. +You can sign all certificates in the cluster with a single CA, and have all machines share the same truststore that trusts the CA. +That way all machines can authenticate all other machines. + + +### Signing the certificate + +The next step is to sign all certificates in the keystore with the CA we generated. 
First, you need to export the certificate from the keystore: + +```shell + +keytool -keystore broker.keystore.jks -alias localhost -certreq -file cert-file + +``` + +Then sign it with the CA: + +```shell + +openssl x509 -req -CA ca-cert -CAkey ca-key -in cert-file -out cert-signed -days {validity} -CAcreateserial -passin pass:{ca-password} + +``` + +Finally, you need to import both the certificate of the CA and the signed certificate into the keystore: + +```shell + +keytool -keystore broker.keystore.jks -alias CARoot -import -file ca-cert +keytool -keystore broker.keystore.jks -alias localhost -import -file cert-signed + +``` + +The definitions of the parameters are the following: + +1. `keystore`: the location of the keystore +2. `ca-cert`: the certificate of the CA +3. `ca-key`: the private key of the CA +4. `ca-password`: the passphrase of the CA +5. `cert-file`: the exported, unsigned certificate of the broker +6. `cert-signed`: the signed certificate of the broker + +### Configuring brokers + +Brokers enable TLS by providing valid `brokerServicePortTls` and `webServicePortTls` values, and also need to set `tlsEnabledWithKeyStore` to `true` to use KeyStore type configuration. +Besides this, the KeyStore path, KeyStore password, TrustStore path, and TrustStore password need to be provided. +Because the broker creates an internal client/admin client to communicate with other brokers, you also need to provide configuration for them. This is similar to how you configure the outside client/admin-client. +If `tlsRequireTrustedClientCertOnConnect` is `true`, the broker rejects the connection if the client certificate is not trusted. 
+ +The following TLS configs are needed on the broker side: + +```properties + +tlsEnabledWithKeyStore=true +# key store +tlsKeyStoreType=JKS +tlsKeyStore=/var/private/tls/broker.keystore.jks +tlsKeyStorePassword=brokerpw + +# trust store +tlsTrustStoreType=JKS +tlsTrustStore=/var/private/tls/broker.truststore.jks +tlsTrustStorePassword=brokerpw + +# internal client/admin-client config +brokerClientTlsEnabled=true +brokerClientTlsEnabledWithKeyStore=true +brokerClientTlsTrustStoreType=JKS +brokerClientTlsTrustStore=/var/private/tls/client.truststore.jks +brokerClientTlsTrustStorePassword=clientpw + +``` + +NOTE: it is important to restrict access to the store files via filesystem permissions. + +If you have configured TLS on the broker, to disable non-TLS ports, you can set the values of the following configurations to empty as below. + +``` + +brokerServicePort= +webServicePort= + +``` + +In this case, you need to set the following configurations. + +```conf + +brokerClientTlsEnabled=true // Set this to true +brokerClientTlsEnabledWithKeyStore=true // Set this to true +brokerClientTlsTrustStore= // Set this to your desired value +brokerClientTlsTrustStorePassword= // Set this to your desired value + +Optional settings that may worth consider: + +1. tlsClientAuthentication=false: Enable/Disable using TLS for authentication. This config when enabled will authenticate the other end + of the communication channel. It should be enabled on both brokers and clients for mutual TLS. +2. tlsCiphers=[TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256], A cipher suite is a named combination of authentication, encryption, MAC and key exchange + algorithm used to negotiate the security settings for a network connection using TLS network protocol. By default, + it is null. [OpenSSL Ciphers](https://www.openssl.org/docs/man1.0.2/apps/ciphers.html) + [JDK Ciphers](http://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#ciphersuites) +3. 
tlsProtocols=[TLSv1.3,TLSv1.2] (list out the TLS protocols that you are going to accept from clients). + By default, it is not set. + +``` + +### Configuring Clients + +This is similar to [TLS encryption configuring for client with PEM type](security-tls-transport.md#client-configuration). +For a minimal configuration, users need to provide the TrustStore information. + +e.g. +1. for [Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools#pulsar-admin), [`pulsar-perf`](reference-cli-tools#pulsar-perf), and [`pulsar-client`](reference-cli-tools#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + + ```properties + + webServiceUrl=https://broker.example.com:8443/ + brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ + useKeyStoreTls=true + tlsTrustStoreType=JKS + tlsTrustStorePath=/var/private/tls/client.truststore.jks + tlsTrustStorePassword=clientpw + + ``` + +1. for java client + + ```java + + import org.apache.pulsar.client.api.PulsarClient; + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .build(); + + ``` + +1.
for java admin client + +```java + + PulsarAdmin amdin = PulsarAdmin.builder().serviceHttpUrl("https://broker.example.com:8443") + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .build(); + +``` + +## TLS authentication with KeyStore configure + +This similar to [TLS authentication with PEM type](security-tls-authentication.md) + +### broker authentication config + +`broker.conf` + +```properties + +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# this should be the CN for one of client keystore. +superUserRoles=admin + +# Enable KeyStore type +tlsEnabledWithKeyStore=true +requireTrustedClientCertOnConnect=true + +# key store +tlsKeyStoreType=JKS +tlsKeyStore=/var/private/tls/broker.keystore.jks +tlsKeyStorePassword=brokerpw + +# trust store +tlsTrustStoreType=JKS +tlsTrustStore=/var/private/tls/broker.truststore.jks +tlsTrustStorePassword=brokerpw + +# internal client/admin-client config +brokerClientTlsEnabled=true +brokerClientTlsEnabledWithKeyStore=true +brokerClientTlsTrustStoreType=JKS +brokerClientTlsTrustStore=/var/private/tls/client.truststore.jks +brokerClientTlsTrustStorePassword=clientpw +# internal auth config +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls +brokerClientAuthenticationParameters={"keyStoreType":"JKS","keyStorePath":"/var/private/tls/client.keystore.jks","keyStorePassword":"clientpw"} +# currently websocket not support keystore type +webSocketServiceEnabled=false + +``` + +### client authentication configuring + +Besides the TLS encryption configuring. The main work is configuring the KeyStore, which contains a valid CN as client role, for client. + +e.g. +1. 
for [Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools#pulsar-admin), [`pulsar-perf`](reference-cli-tools#pulsar-perf), and [`pulsar-client`](reference-cli-tools#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + + ```properties + + webServiceUrl=https://broker.example.com:8443/ + brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ + useKeyStoreTls=true + tlsTrustStoreType=JKS + tlsTrustStorePath=/var/private/tls/client.truststore.jks + tlsTrustStorePassword=clientpw + authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls + authParams={"keyStoreType":"JKS","keyStorePath":"/path/to/keystorefile","keyStorePassword":"keystorepw"} + + ``` + +1. for java client + + ```java + + import org.apache.pulsar.client.api.PulsarClient; + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .authentication( + "org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls", + "keyStoreType:JKS,keyStorePath:/var/private/tls/client.keystore.jks,keyStorePassword:clientpw") + .build(); + + ``` + +1. for java admin client + + ```java + + PulsarAdmin amdin = PulsarAdmin.builder().serviceHttpUrl("https://broker.example.com:8443") + .useKeyStoreTls(true) + .tlsTrustStorePath("/var/private/tls/client.truststore.jks") + .tlsTrustStorePassword("clientpw") + .allowTlsInsecureConnection(false) + .authentication( + "org.apache.pulsar.client.impl.auth.AuthenticationKeyStoreTls", + "keyStoreType:JKS,keyStorePath:/var/private/tls/client.keystore.jks,keyStorePassword:clientpw") + .build(); + + ``` + +## Enabling TLS Logging + +You can enable TLS debug logging at the JVM level by starting the brokers and/or clients with `javax.net.debug` system property. 
For example: + +```shell + +-Djavax.net.debug=all + +``` + +You can find more details on this in [Oracle documentation](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/ReadDebug.html) on [debugging SSL/TLS connections](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/ReadDebug.html). diff --git a/site2/website/versioned_docs/version-2.9.x/security-tls-transport.md b/site2/website/versioned_docs/version-2.9.x/security-tls-transport.md new file mode 100644 index 0000000000000..2cad17a78c350 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-tls-transport.md @@ -0,0 +1,295 @@ +--- +id: security-tls-transport +title: Transport Encryption using TLS +sidebar_label: "Transport Encryption using TLS" +original_id: security-tls-transport +--- + +## TLS overview + +By default, Apache Pulsar clients communicate with the Apache Pulsar service in plain text. This means that all data is sent in the clear. You can use TLS to encrypt this traffic to protect the traffic from the snooping of a man-in-the-middle attacker. + +You can also configure TLS for both encryption and authentication. Use this guide to configure just TLS transport encryption and refer to [here](security-tls-authentication.md) for TLS authentication configuration. Alternatively, you can use [another authentication mechanism](security-athenz.md) on top of TLS transport encryption. + +> Note that enabling TLS may impact the performance due to encryption overhead. + +## TLS concepts + +TLS is a form of [public key cryptography](https://en.wikipedia.org/wiki/Public-key_cryptography). Encryption is performed using key pairs, each consisting of a public key and a private key. The public key encrypts the messages and the private key decrypts the messages. + +To use TLS transport encryption, you need two kinds of key pairs, **server key pairs** and a **certificate authority**.
+ +You can use a third kind of key pair, **client key pairs**, for [client authentication](security-tls-authentication.md). + +You should store the **certificate authority** private key in a very secure location (a fully encrypted, disconnected, air-gapped computer). As for the certificate authority public key, the **trust cert**, you can freely share it. + +For both client and server key pairs, the administrator first generates a private key and a certificate request, then uses the certificate authority private key to sign the certificate request, and finally generates a certificate. This certificate is the public key for the server/client key pair. + +For TLS transport encryption, the clients can use the **trust cert** to verify that the server has a key pair that the certificate authority signed when the clients are talking to the server. A man-in-the-middle attacker does not have access to the certificate authority, so they couldn't create a server with such a key pair. + +For TLS authentication, the server uses the **trust cert** to verify that the client has a key pair that the certificate authority signed. The common name of the **client cert** is then used as the client's role token (see [Overview](security-overview.md)). + +`Bouncy Castle Provider` provides cipher suites and algorithms in Pulsar. If you need the [FIPS](https://www.bouncycastle.org/fips_faq.html) version of `Bouncy Castle Provider`, refer to the [Bouncy Castle page](security-bouncy-castle.md). + +## Create TLS certificates + +Creating TLS certificates for Pulsar involves creating a [certificate authority](#certificate-authority) (CA), [server certificate](#server-certificate), and [client certificate](#client-certificate). + +Follow the guide below to set up a certificate authority. You can also refer to plenty of resources on the internet for more details. We recommend [this guide](https://jamielinux.com/docs/openssl-certificate-authority/index.html) for your detailed reference.
+ +### Certificate authority + +1. Create the certificate for the CA. You can use CA to sign both the broker and client certificates. This ensures that each party will trust the others. You should store CA in a very secure location (ideally completely disconnected from networks, air gapped, and fully encrypted). + +2. Entering the following command to create a directory for your CA, and place [this openssl configuration file](https://github.com/apache/pulsar/tree/master/site2/website/static/examples/openssl.cnf) in the directory. You may want to modify the default answers for company name and department in the configuration file. Export the location of the CA directory to the environment variable, CA_HOME. The configuration file uses this environment variable to find the rest of the files and directories that the CA needs. + +```bash + +mkdir my-ca +cd my-ca +wget https://raw.githubusercontent.com/apache/pulsar-site/main/site2/website/static/examples/openssl.cnf +export CA_HOME=$(pwd) + +``` + +3. Enter the commands below to create the necessary directories, keys and certs. + +```bash + +mkdir certs crl newcerts private +chmod 700 private/ +touch index.txt +echo 1000 > serial +openssl genrsa -aes256 -out private/ca.key.pem 4096 +chmod 400 private/ca.key.pem +openssl req -config openssl.cnf -key private/ca.key.pem \ + -new -x509 -days 7300 -sha256 -extensions v3_ca \ + -out certs/ca.cert.pem +chmod 444 certs/ca.cert.pem + +``` + +4. After you answer the question prompts, CA-related files are stored in the `./my-ca` directory. Within that directory: + +* `certs/ca.cert.pem` is the public certificate. This public certificates is meant to be distributed to all parties involved. +* `private/ca.key.pem` is the private key. You only need it when you are signing a new certificate for either broker or clients and you must safely guard this private key. 
+ +### Server certificate + +Once you have created a CA certificate, you can create certificate requests and sign them with the CA. + +The following commands ask you a few questions and then create the certificates. When you are asked for the common name, you should match the hostname of the broker. You can also use a wildcard to match a group of broker hostnames, for example, `*.broker.usw.example.com`. This ensures that multiple machines can reuse the same certificate. + +:::tip + +Sometimes matching the hostname is not possible or makes no sense, +such as when you create the brokers with random hostnames, or you +plan to connect to the hosts via their IP. In these cases, you +should configure the client to disable TLS hostname verification. For more +details, you can see [the host verification section in client configuration](#hostname-verification). + +::: + +1. Enter the command below to generate the key. + +```bash + +openssl genrsa -out broker.key.pem 2048 + +``` + +The broker expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so enter the following command to convert it. + +```bash + +openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in broker.key.pem -out broker.key-pk8.pem -nocrypt + +``` + +2. Enter the following command to generate the certificate request. + +```bash + +openssl req -config openssl.cnf \ + -key broker.key.pem -new -sha256 -out broker.csr.pem + +``` + +3. Sign it with the certificate authority by entering the command below. + +```bash + +openssl ca -config openssl.cnf -extensions server_cert \ + -days 1000 -notext -md sha256 \ + -in broker.csr.pem -out broker.cert.pem + +``` + +At this point, you have a cert, `broker.cert.pem`, and a key, `broker.key-pk8.pem`, which you can use along with `ca.cert.pem` to configure TLS transport encryption for your broker and proxy nodes. 
+ +## Configure broker + +To configure a Pulsar [broker](reference-terminology.md#broker) to use TLS transport encryption, you need to make some changes to `broker.conf`, which is located in the `conf` directory of your [Pulsar installation](getting-started-standalone.md). + +Add these values to the configuration file (substituting the appropriate certificate paths where necessary): + +```properties + +tlsEnabled=true +tlsRequireTrustedClientCertOnConnect=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +> You can find a full list of parameters available in the `conf/broker.conf` file, +> as well as the default values for those parameters, in [Broker Configuration](reference-configuration.md#broker) +> +### TLS Protocol Version and Cipher + +You can configure the broker (and proxy) to require specific TLS protocol versions and ciphers for TLS negotiation. You can use the TLS protocol versions and ciphers to stop clients from requesting downgraded TLS protocol versions or ciphers that may have weaknesses. + +Both the TLS protocol versions and cipher properties can take multiple values, separated by commas. The possible values for protocol version and ciphers depend on the TLS provider that you are using. Pulsar uses OpenSSL if it is available; otherwise, Pulsar falls back to the JDK implementation. + +```properties + +tlsProtocols=TLSv1.3,TLSv1.2 +tlsCiphers=TLS_DH_RSA_WITH_AES_256_GCM_SHA384,TLS_DH_RSA_WITH_AES_256_CBC_SHA + +``` + +OpenSSL currently supports ```TLSv1.1```, ```TLSv1.2``` and ```TLSv1.3``` for the protocol version. You can acquire a list of supported ciphers from the openssl ciphers command, for example, ```openssl ciphers -tls1_3```.
+ +For JDK 11, you can obtain a list of supported values from the documentation: +- [TLS protocol](https://docs.oracle.com/en/java/javase/11/security/oracle-providers.html#GUID-7093246A-31A3-4304-AC5F-5FB6400405E2__SUNJSSEPROVIDERPROTOCOLPARAMETERS-BBF75009) +- [Ciphers](https://docs.oracle.com/en/java/javase/11/security/oracle-providers.html#GUID-7093246A-31A3-4304-AC5F-5FB6400405E2__SUNJSSE_CIPHER_SUITES) + +## Proxy Configuration + +Proxies need to configure TLS in two directions, for clients connecting to the proxy, and for the proxy connecting to brokers. + +```properties + +# For clients connecting to the proxy +tlsEnabledInProxy=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +# For the proxy to connect to brokers +tlsEnabledWithBroker=true +brokerClientTrustCertsFilePath=/path/to/ca.cert.pem + +``` + +## Client configuration + +When you enable the TLS transport encryption, you need to configure the client to use ```https://``` and port 8443 for the web service URL, and ```pulsar+ssl://``` and port 6651 for the broker service URL. + +As the server certificate that you generated above does not belong to any of the default trust chains, you also need to either specify the path to the **trust cert** (recommended), or tell the client to allow untrusted server certs. + +### Hostname verification + +Hostname verification is a TLS security feature whereby a client can refuse to connect to a server if the "CommonName" does not match the hostname to which the client is connecting. By default, Pulsar clients disable hostname verification, as it requires that each broker has a DNS record and a unique cert. + +Moreover, as the administrator has full control of the certificate authority, a bad actor is unlikely to be able to pull off a man-in-the-middle attack. "allowInsecureConnection" allows the client to connect to servers whose cert has not been signed by an approved CA.
The client disables "allowInsecureConnection" by default, and you should always disable "allowInsecureConnection" in production environments. As long as you disable "allowInsecureConnection", a man-in-the-middle attack requires that the attacker has access to the CA. + +One scenario where you may want to enable hostname verification is where you have multiple proxy nodes behind a VIP, and the VIP has a DNS record, for example, pulsar.mycompany.com. In this case, you can generate a TLS cert with pulsar.mycompany.com as the "CommonName," and then enable hostname verification on the client. + +The examples below show that hostname verification is disabled for the CLI tools/Java/Python/C++/Node.js/C# clients by default. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools.md#pulsar-admin), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ +You need to add the following parameters to that file to use TLS transport with the CLI tools of Pulsar: + +```properties + +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +tlsEnableHostnameVerification=false + +``` + +#### Java client + +```java + +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .enableTlsHostnameVerification(false) // false by default, in any case + .allowTlsInsecureConnection(false) // false by default, in any case + .build(); + +``` + +#### Python client + +```python + +from pulsar import Client + +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_hostname_verification=False, + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False) # defaults to false from v2.2.0 onwards + +``` + +#### C++ client + +```c++ + +#include <pulsar/Client.h> + +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); // shouldn't be needed soon +config.setTlsTrustCertsFilePath(caPath); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create(clientPublicKeyPath, clientPrivateKeyPath)); +config.setValidateHostName(false); + +``` + +#### Node.js client + +```JavaScript + +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + useTls: true, + tlsValidateHostname: false, + tlsAllowInsecureConnection: false, + }); +})(); + +``` + +#### C# client + +```c# + +var certificate = new X509Certificate2("ca.cert.pem"); +var client = PulsarClient.Builder() + .TrustedCertificateAuthority(certificate) //If the CA is not trusted on the host,
you can add it explicitly. + .VerifyCertificateAuthority(true) //Default is 'true' + .VerifyCertificateName(false) //Default is 'false' + .Build(); + +``` + +> Note that `VerifyCertificateName` refers to the configuration of hostname verification in the C# client. diff --git a/site2/website/versioned_docs/version-2.9.x/security-token-admin.md b/site2/website/versioned_docs/version-2.9.x/security-token-admin.md new file mode 100644 index 0000000000000..a265f6320d28f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/security-token-admin.md @@ -0,0 +1,183 @@ +--- +id: security-token-admin +title: Token authentication admin +sidebar_label: "Token authentication admin" +original_id: security-token-admin +--- + +## Token Authentication Overview + +Pulsar supports authenticating clients using security tokens that are based on [JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). + +Tokens are used to identify a Pulsar client and associate with some "principal" (or "role") which +will be then granted permissions to do some actions (eg: publish or consume from a topic). + +A user will typically be given a token string by an administrator (or some automated service). + +The compact representation of a signed JWT is a string that looks like: + +``` + + eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + +``` + +Application will specify the token when creating the client instance. An alternative is to pass +a "token supplier", that is to say a function that returns the token when the client library +will need one. + +> #### Always use TLS transport encryption +> Sending a token is equivalent to sending a password over the wire. It is strongly recommended to +> always use TLS encryption when talking to the Pulsar service. 
See +> [Transport Encryption using TLS](security-tls-transport.md) + +## Secret vs Public/Private keys + +JWT supports two different kinds of keys in order to generate and validate the tokens: + + * Symmetric : + - there is a single ***Secret*** key that is used both to generate and validate + * Asymmetric: there is a pair of keys. + - ***Private*** key is used to generate tokens + - ***Public*** key is used to validate tokens + +### Secret key + +When using a secret key, the administrator creates the key and +uses it to generate the client tokens. This key is also configured on +the brokers to allow them to validate the clients. + +#### Creating a secret key + +> Output file will be generated in the root of your pulsar installation directory. You can also provide absolute path for the output file. + +```shell + +$ bin/pulsar tokens create-secret-key --output my-secret.key + +``` + +To generate a base64-encoded secret key: + +```shell + +$ bin/pulsar tokens create-secret-key --output /opt/my-secret.key --base64 + +``` + +### Public/Private keys + +With public/private, we need to create a pair of keys. Pulsar supports all algorithms supported by the Java JWT library shown [here](https://github.com/jwtk/jjwt#signature-algorithms-keys) + +#### Creating a key pair + +> Output file will be generated in the root of your pulsar installation directory. You can also provide absolute path for the output file. + +```shell + +$ bin/pulsar tokens create-key-pair --output-private-key my-private.key --output-public-key my-public.key + +``` + + * `my-private.key` will be stored in a safe location and only used by administrator to generate + new tokens. + * `my-public.key` will be distributed to all Pulsar brokers. This file can be publicly shared without + any security concern. + +## Generating tokens + +A token is the credential associated with a user. The association is done through the "principal", +or "role".
In the case of JWT tokens, this field is typically referred to as the **subject**, though +it's exactly the same concept. + +The generated token is then required to have a **subject** field set. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user + +``` + +This will print the token string on stdout. + +Similarly, one can create a token by passing the "private" key: + +```shell + +$ bin/pulsar tokens create --private-key file:///path/to/my-private.key \ + --subject test-user + +``` + +Finally, a token can also be created with a pre-defined TTL. After that time, +the token will be automatically invalidated. + +```shell + +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user \ + --expiry-time 1y + +``` + +## Authorization + +The token itself doesn't have any permission associated. That will be determined by the +authorization engine. Once the token is created, one can grant permission for this token to do certain +actions. For example: + +```shell + +$ bin/pulsar-admin namespaces grant-permission my-tenant/my-namespace \ + --role test-user \ + --actions produce,consume + +``` + +## Enabling Token Authentication ... + +### ... on Brokers + +To configure brokers to authenticate clients, put the following in `broker.conf`: + +```properties + +# Configuration to enable authentication and authorization +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken + +# If using secret key (Note: key files must be DER-encoded) +tokenSecretKey=file:///path/to/secret.key +# The key can also be passed inline: +# tokenSecretKey=data:;base64,FLFyW0oLJ2Fi22KKCm21J18mbAdztfSHN/lAT5ucEKU= + +# If using public/private (Note: key files must be DER-encoded) +# tokenPublicKey=file:///path/to/public.key + +``` + +### ...
on Proxies + +To configure proxies to authenticate clients, put the following in `proxy.conf`: + +The proxy will have its own token used when talking to brokers. The role token for this +key pair should be configured in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. + +```properties + +# For clients connecting to the proxy +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken +tokenSecretKey=file:///path/to/secret.key + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters={"token":"eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw"} +# Or, alternatively, read token from file +# brokerClientAuthenticationParameters=file:///path/to/proxy-token.txt + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/sql-deployment-configurations.md b/site2/website/versioned_docs/version-2.9.x/sql-deployment-configurations.md new file mode 100644 index 0000000000000..43a806e818b7a --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/sql-deployment-configurations.md @@ -0,0 +1,208 @@ +--- +id: sql-deployment-configurations +title: Pulsar SQL configuration and deployment +sidebar_label: "Configuration and deployment" +original_id: sql-deployment-configurations +--- + +You can configure Presto Pulsar connector and deploy a cluster with the following instruction. + +## Configure Presto Pulsar Connector +You can configure Presto Pulsar Connector in the `${project.root}/conf/presto/catalog/pulsar.properties` properties file. The configuration for the connector and the default values are as follows. 
+ +```properties + +# name of the connector to be displayed in the catalog +connector.name=pulsar + +# the url of Pulsar broker service +pulsar.web-service-url=http://localhost:8080 + +# URI of Zookeeper cluster +pulsar.zookeeper-uri=localhost:2181 + +# minimum number of entries to read at a single time +pulsar.entry-read-batch-size=100 + +# default number of splits to use per query +pulsar.target-num-splits=4 + +# max size of one batch message (default value is 5MB) +pulsar.max-message-size=5242880 + +``` + +:::note + +`pulsar.max-message-size` is only available in 2.9.2 and later versions. + +::: + +You can connect Presto to a Pulsar cluster with multiple hosts. To configure multiple hosts for brokers, add multiple URLs to `pulsar.web-service-url`. To configure multiple hosts for ZooKeeper, add multiple URIs to `pulsar.zookeeper-uri`. The following is an example. + +``` + +pulsar.web-service-url=http://localhost:8080,localhost:8081,localhost:8082 +pulsar.zookeeper-uri=localhost1,localhost2:2181 + +``` + +**Note: by default, Pulsar SQL does not get the last message in a topic**. It is by design and controlled by settings. By default, BookKeeper LAC only advances when subsequent entries are added. If there is no subsequent entry added, the last written entry is not visible to readers until the ledger is closed. This is not a problem for Pulsar which uses managed ledger, but Pulsar SQL directly reads from BookKeeper ledger. + +If you want to get the last message in a topic, set the following configurations: + +1. For the broker configuration, set `bookkeeperExplicitLacIntervalInMills` > 0 in `broker.conf` or `standalone.conf`. + +2. For the Presto configuration, set `pulsar.bookkeeper-explicit-interval` > 0 and `pulsar.bookkeeper-use-v2-protocol=false`. + +However, using BookKeeper V3 protocol introduces additional GC overhead to BK as it uses Protobuf. 
+ +## Query data from existing Presto clusters + +If you already have a Presto cluster, you can copy the Presto Pulsar connector plugin to your existing cluster. Download the archived plugin package with the following command. + +```bash + +$ wget pulsar:binary_release_url + +``` + +## Deploy a new cluster + +Since Pulsar SQL is powered by [Trino (formerly Presto SQL)](https://trino.io), the configuration for deployment is the same for the Pulsar SQL worker. + +:::note + +For how to set up a standalone single node environment, refer to [Query data](sql-getting-started.md). + +::: + +You can use the same CLI args as the Presto launcher. + +```bash + +$ ./bin/pulsar sql-worker --help +Usage: launcher [options] command + +Commands: run, start, stop, restart, kill, status + +Options: + -h, --help show this help message and exit + -v, --verbose Run verbosely + --etc-dir=DIR Defaults to INSTALL_PATH/etc + --launcher-config=FILE + Defaults to INSTALL_PATH/bin/launcher.properties + --node-config=FILE Defaults to ETC_DIR/node.properties + --jvm-config=FILE Defaults to ETC_DIR/jvm.config + --config=FILE Defaults to ETC_DIR/config.properties + --log-levels-file=FILE + Defaults to ETC_DIR/log.properties + --data-dir=DIR Defaults to INSTALL_PATH + --pid-file=FILE Defaults to DATA_DIR/var/run/launcher.pid + --launcher-log-file=FILE + Defaults to DATA_DIR/var/log/launcher.log (only in + daemon mode) + --server-log-file=FILE + Defaults to DATA_DIR/var/log/server.log (only in + daemon mode) + -D NAME=VALUE Set a Java system property + +``` + +The default configuration for the cluster is located in `${project.root}/conf/presto`. You can customize your deployment by modifying the default configuration. + +You can set the worker to read from a different configuration directory, or set a different directory to write data. + +```bash + +$ ./bin/pulsar sql-worker run --etc-dir /tmp/incubator-pulsar/conf/presto --data-dir /tmp/presto-1 + +``` + +You can start the worker as daemon process. 
+ +```bash + +$ ./bin/pulsar sql-worker start + +``` + +### Deploy a cluster on multiple nodes + +You can deploy a Pulsar SQL cluster or Presto cluster on multiple nodes. The following example shows how to deploy a cluster on a three-node cluster. + +1. Copy the Pulsar binary distribution to three nodes. + +The first node runs as Presto coordinator. The minimal configuration requirement in the `${project.root}/conf/presto/config.properties` file is as follows. + +```properties + +coordinator=true +node-scheduler.include-coordinator=true +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery-server.enabled=true +discovery.uri=<coordinator-url> + +``` + +The other two nodes serve as worker nodes. You can use the following configuration for worker nodes. + +```properties + +coordinator=false +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery.uri=<coordinator-url> + +``` + +2. Modify `pulsar.web-service-url` and `pulsar.zookeeper-uri` configuration in the `${project.root}/conf/presto/catalog/pulsar.properties` file accordingly for the three nodes. + +3. Start the coordinator node. + +``` + +$ ./bin/pulsar sql-worker run + +``` + +4. Start worker nodes. + +``` + +$ ./bin/pulsar sql-worker run + +``` + +5. Start the SQL CLI and check the status of your cluster. + +```bash + +$ ./bin/pulsar sql --server <coordinator-url> + +``` + +6. Check the status of your nodes. + +```bash + +presto> SELECT * FROM system.runtime.nodes; + node_id | http_uri | node_version | coordinator | state +---------+-------------------------+--------------+-------------+-------- + 1 | http://192.168.2.1:8081 | testversion | true | active + 3 | http://192.168.2.2:8081 | testversion | false | active + 2 | http://192.168.2.3:8081 | testversion | false | active + +``` + +For more information about deployment in Presto, refer to [Presto deployment](https://trino.io/docs/current/installation/deployment.html). 
+ +:::note + +The broker does not advance the LAC (LastAddConfirmed), so when Pulsar SQL bypasses the broker to query data, it can only read entries up to the LAC that all the bookies have learned. You can enable periodic LAC writes on the broker by setting `bookkeeperExplicitLacIntervalInMills` in `broker.conf`. + +::: + diff --git a/site2/website/versioned_docs/version-2.9.x/sql-getting-started.md b/site2/website/versioned_docs/version-2.9.x/sql-getting-started.md new file mode 100644 index 0000000000000..8a5cd7199b365 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/sql-getting-started.md @@ -0,0 +1,187 @@ +--- +id: sql-getting-started +title: Query data with Pulsar SQL +sidebar_label: "Query data" +original_id: sql-getting-started +--- + +Before querying data in Pulsar, you need to install Pulsar and built-in connectors. + +## Requirements +1. Install [Pulsar](getting-started-standalone.md#install-pulsar-standalone). +2. Install Pulsar [built-in connectors](getting-started-standalone.md#install-builtin-connectors-optional). + +## Query data in Pulsar +To query data in Pulsar with Pulsar SQL, complete the following steps. + +1. Start a Pulsar standalone cluster. + +```bash + +./bin/pulsar standalone + +``` + +2. Start a Pulsar SQL worker. + +```bash + +./bin/pulsar sql-worker run + +``` + +3. After initializing the Pulsar standalone cluster and the SQL worker, run the SQL CLI. + +```bash + +./bin/pulsar sql + +``` + +4. Test with SQL commands. 
+ +```bash + +presto> show catalogs; + Catalog +--------- + pulsar + system +(2 rows) + +Query 20180829_211752_00004_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + + +presto> show schemas in pulsar; + Schema +----------------------- + information_schema + public/default + public/functions + sample/standalone/ns1 +(4 rows) + +Query 20180829_211818_00005_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [4 rows, 89B] [21 rows/s, 471B/s] + + +presto> show tables in pulsar."public/default"; + Table +------- +(0 rows) + +Query 20180829_211839_00006_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + +``` + +Since there is no data in Pulsar, no records is returned. + +5. Start the built-in connector _DataGeneratorSource_ and ingest some mock data. + +```bash + +./bin/pulsar-admin sources create --name generator --destinationTopicName generator_test --source-type data-generator + +``` + +And then you can query a topic in the namespace "public/default". + +```bash + +presto> show tables in pulsar."public/default"; + Table +---------------- + generator_test +(1 row) + +Query 20180829_213202_00000_csyeu, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:02 [1 rows, 38B] [0 rows/s, 17B/s] + +``` + +You can now query the data within the topic "generator_test". 
+ +```bash + +presto> select * from pulsar."public/default".generator_test; + + firstname | middlename | lastname | email | username | password | telephonenumber | age | companyemail | nationalidentitycardnumber | +-------------+-------------+-------------+----------------------------------+--------------+----------+-----------------+-----+-----------------------------------------------+----------------------------+ + Genesis | Katherine | Wiley | genesis.wiley@gmail.com | genesisw | y9D2dtU3 | 959-197-1860 | 71 | genesis.wiley@interdemconsulting.eu | 880-58-9247 | + Brayden | | Stanton | brayden.stanton@yahoo.com | braydens | ZnjmhXik | 220-027-867 | 81 | brayden.stanton@supermemo.eu | 604-60-7069 | + Benjamin | Julian | Velasquez | benjamin.velasquez@yahoo.com | benjaminv | 8Bc7m3eb | 298-377-0062 | 21 | benjamin.velasquez@hostesltd.biz | 213-32-5882 | + Michael | Thomas | Donovan | donovan@mail.com | michaeld | OqBm9MLs | 078-134-4685 | 55 | michael.donovan@memortech.eu | 443-30-3442 | + Brooklyn | Avery | Roach | brooklynroach@yahoo.com | broach | IxtBLafO | 387-786-2998 | 68 | brooklyn.roach@warst.biz | 085-88-3973 | + Skylar | | Bradshaw | skylarbradshaw@yahoo.com | skylarb | p6eC6cKy | 210-872-608 | 96 | skylar.bradshaw@flyhigh.eu | 453-46-0334 | +. +. +. + +``` + +You can query the mock data. + +## Query your own data +If you want to query your own data, you need to ingest your own data first. You can write a simple producer and write custom defined data to Pulsar. The following is an example. 
+ +```java + +public class TestProducer { + + public static class Foo { + private int field1 = 1; + private String field2; + private long field3; + + public Foo() { + } + + public int getField1() { + return field1; + } + + public void setField1(int field1) { + this.field1 = field1; + } + + public String getField2() { + return field2; + } + + public void setField2(String field2) { + this.field2 = field2; + } + + public long getField3() { + return field3; + } + + public void setField3(long field3) { + this.field3 = field3; + } + } + + public static void main(String[] args) throws Exception { + PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); + Producer producer = pulsarClient.newProducer(AvroSchema.of(Foo.class)).topic("test_topic").create(); + + for (int i = 0; i < 1000; i++) { + Foo foo = new Foo(); + foo.setField1(i); + foo.setField2("foo" + i); + foo.setField3(System.currentTimeMillis()); + producer.newMessage().value(foo).send(); + } + producer.close(); + pulsarClient.close(); + } +} + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/sql-overview.md b/site2/website/versioned_docs/version-2.9.x/sql-overview.md new file mode 100644 index 0000000000000..8ba19d053003d --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/sql-overview.md @@ -0,0 +1,18 @@ +--- +id: sql-overview +title: Pulsar SQL Overview +sidebar_label: "Overview" +original_id: sql-overview +--- + +Apache Pulsar is used to store streams of event data, and the event data is structured with predefined fields. With the implementation of the [Schema Registry](schema-get-started.md), you can store structured data in Pulsar and query the data by using [Trino (formerly Presto SQL)](https://trino.io/). + +As the core of Pulsar SQL, Presto Pulsar connector enables Presto workers within a Presto cluster to query data from Pulsar. 
+ +![The Pulsar consumer and reader interfaces](/assets/pulsar-sql-arch-2.png) + +The query performance is efficient and highly scalable, because Pulsar adopts a [two-level, segment-based architecture](concepts-architecture-overview.md#apache-bookkeeper). + +Topics in Pulsar are stored as segments in [Apache BookKeeper](https://bookkeeper.apache.org/). Each topic segment is replicated to some BookKeeper nodes, which enables concurrent reads and high read throughput. You can configure the number of BookKeeper nodes, and the default number is `3`. In the Presto Pulsar connector, data is read directly from BookKeeper, so Presto workers can read concurrently from a horizontally scalable number of BookKeeper nodes. + +![The Pulsar consumer and reader interfaces](/assets/pulsar-sql-arch-1.png) diff --git a/site2/website/versioned_docs/version-2.9.x/sql-rest-api.md b/site2/website/versioned_docs/version-2.9.x/sql-rest-api.md new file mode 100644 index 0000000000000..c92fd62f7d870 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/sql-rest-api.md @@ -0,0 +1,192 @@ +--- +id: sql-rest-api +title: Pulsar SQL REST APIs +sidebar_label: "REST APIs" +original_id: sql-rest-api +--- + +This section lists resources that make up the Presto REST API v1. + +## Request for Presto services + +All requests for Presto services should use the Presto REST API v1. + +To request services, use the explicit URL `http://presto.service:8081/v1`. You need to replace `presto.service:8081` with your real Presto address before sending requests. + +`POST` requests require the `X-Presto-User` header. If you use authentication, you must use the same `username` that is specified in the authentication configuration. If you do not use authentication, you can specify anything for `username`. + +```properties + +X-Presto-User: username + +``` + +For more information about headers, refer to [PrestoHeaders](https://github.com/trinodb/trino). + +## Schema + +You can use a statement in the HTTP body.
All data is received as JSON document that might contain a `nextUri` link. If the received JSON document contains a `nextUri` link, the request continues with the `nextUri` link until the received data does not contain a `nextUri` link. If no error is returned, the query completes successfully. If an `error` field is displayed in `stats`, it means the query fails. + +The following is an example of `show catalogs`. The query continues until the received JSON document does not contain a `nextUri` link. Since no `error` is displayed in `stats`, it means that the query completes successfully. + +```powershell + +➜ ~ curl --header "X-Presto-User: test-user" --request POST --data 'show catalogs' http://localhost:8081/v1/statement +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "stats" : { + "queued" : true, + "nodes" : 0, + "userTimeMillis" : 0, + "cpuTimeMillis" : 0, + "wallTimeMillis" : 0, + "processedBytes" : 0, + "processedRows" : 0, + "runningSplits" : 0, + "queuedTimeMillis" : 0, + "queuedSplits" : 0, + "completedSplits" : 0, + "totalSplits" : 0, + "scheduled" : false, + "peakMemoryBytes" : 0, + "state" : "QUEUED", + "elapsedTimeMillis" : 0 + }, + "id" : "20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1" +} + +➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1 +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2", + "id" : "20191113_033653_00006_dg6hb", + "stats" : { + "state" : "PLANNING", + "totalSplits" : 0, + "queued" : false, + "userTimeMillis" : 0, + "completedSplits" : 0, + "scheduled" : false, + "wallTimeMillis" : 0, + "runningSplits" : 0, + "queuedSplits" : 0, + "cpuTimeMillis" : 0, + "processedRows" : 0, + "processedBytes" : 0, + "nodes" : 0, + "queuedTimeMillis" : 1, + "elapsedTimeMillis" : 2, + "peakMemoryBytes" : 0 + } +} + 
+➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2 +{ + "id" : "20191113_033653_00006_dg6hb", + "data" : [ + [ + "pulsar" + ], + [ + "system" + ] + ], + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "columns" : [ + { + "typeSignature" : { + "rawType" : "varchar", + "arguments" : [ + { + "kind" : "LONG_LITERAL", + "value" : 6 + } + ], + "literalArguments" : [], + "typeArguments" : [] + }, + "name" : "Catalog", + "type" : "varchar(6)" + } + ], + "stats" : { + "wallTimeMillis" : 104, + "scheduled" : true, + "userTimeMillis" : 14, + "progressPercentage" : 100, + "totalSplits" : 19, + "nodes" : 1, + "cpuTimeMillis" : 16, + "queued" : false, + "queuedTimeMillis" : 1, + "state" : "FINISHED", + "peakMemoryBytes" : 0, + "elapsedTimeMillis" : 111, + "processedBytes" : 0, + "processedRows" : 0, + "queuedSplits" : 0, + "rootStage" : { + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1, + "subStages" : [ + { + "cpuTimeMillis" : 14, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 17, + "subStages" : [ + { + "wallTimeMillis" : 7, + "subStages" : [], + "stageId" : "2", + "done" : true, + "nodes" : 1, + "totalSplits" : 1, + "processedBytes" : 22, + "processedRows" : 2, + "queuedSplits" : 0, + "userTimeMillis" : 1, + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1 + } + ], + "wallTimeMillis" : 92, + "nodes" : 1, + "done" : true, + "stageId" : "1", + "userTimeMillis" : 12, + "processedRows" : 2, + "processedBytes" : 51, + "queuedSplits" : 0, + "totalSplits" : 17 + } + ], + "wallTimeMillis" : 5, + "done" : true, + "nodes" : 1, + "stageId" : "0", + "userTimeMillis" : 1, + "processedRows" : 2, + "processedBytes" : 22, + "totalSplits" : 1, + "queuedSplits" : 0 + }, + "runningSplits" : 0, + "completedSplits" : 19 + } +} + +``` + +:::note + +Since the response data is not in sync with the query state from the perspective of 
 clients, you cannot rely on the response data to determine whether the query completes. + +::: + +For more information about the Presto REST API, refer to [Presto HTTP Protocol](https://github.com/prestosql/presto/wiki/HTTP-Protocol). diff --git a/site2/website/versioned_docs/version-2.9.x/standalone-docker.md b/site2/website/versioned_docs/version-2.9.x/standalone-docker.md new file mode 100644 index 0000000000000..1afb9bfd3f25f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/standalone-docker.md @@ -0,0 +1,213 @@ +--- +id: standalone-docker +title: Set up a standalone Pulsar in Docker +sidebar_label: "Run Pulsar in Docker" +original_id: standalone-docker +--- + +For local development and testing, you can run Pulsar in standalone mode on your own machine within a Docker container. + +If you have not installed Docker, download the [Community edition](https://www.docker.com/community-edition) and follow the instructions for your OS. + +## Start Pulsar in Docker + +* For macOS, Linux, and Windows: + + ```shell + + $ docker run -it -p 6650:6650 -p 8080:8080 --mount source=pulsardata,target=/pulsar/data --mount source=pulsarconf,target=/pulsar/conf apachepulsar/pulsar:@pulsar:version@ bin/pulsar standalone + + ``` + +A few things to note about this command: + * The data, metadata, and configuration are persisted on Docker volumes in order to not start "fresh" every +time the container is restarted. For details on the volumes, you can use `docker volume inspect <sourcename>`. + * For Docker on Windows, make sure to configure it to use Linux containers. + +If you start Pulsar successfully, you will see `INFO`-level log messages like this: + +``` + +08:18:30.970 [main] INFO org.apache.pulsar.broker.web.WebService - HTTP Service started at http://0.0.0.0:8080 +... 
+07:53:37.322 [main] INFO org.apache.pulsar.broker.PulsarService - messaging service is ready, bootstrap service port = 8080, broker url= pulsar://localhost:6650, cluster=standalone, configs=org.apache.pulsar.broker.ServiceConfiguration@98b63c1 +... + +``` + +:::tip + +When you start a local standalone cluster, a `public/default` +namespace is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. +For more information, see [Topics](concepts-messaging.md#topics). + +::: + + +## Use Pulsar in Docker + +Pulsar offers client libraries for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). If you're running a local standalone cluster, you can +use one of these root URLs to interact with your cluster: + +* `pulsar://localhost:6650` +* `http://localhost:8080` + +The following example helps you get started with Pulsar quickly by using the [Python](client-libraries-python.md) +client API. 
+ +Install the Pulsar Python client library directly from [PyPI](https://pypi.org/project/pulsar-client/): + +```shell + +$ pip install pulsar-client + +``` + +### Consume a message + +Create a consumer and subscribe to the topic: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() + +``` + +### Produce a message + +Now start a producer to send some test messages: + +```python + +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('hello-pulsar-%d' % i).encode('utf-8')) + +client.close() + +``` + +## Get the topic statistics + +In Pulsar, you can use REST, Java, or command-line tools to control every aspect of the system. +For details on APIs, refer to [Admin API Overview](admin-api-overview.md). 
+ +In the simplest example, you can use curl to probe the stats for a particular topic: + +```shell + +$ curl http://localhost:8080/admin/v2/persistent/public/default/my-topic/stats | python -m json.tool + +``` + +The output is something like this: + +```json + +{ + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesInCounter": 7097, + "msgInCounter": 143, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "averageMsgSize": 0.0, + "msgChunkPublished": false, + "storageSize": 7097, + "backlogSize": 0, + "offloadedStorageSize": 0, + "publishers": [ + { + "accessMode": "Shared", + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "averageMsgSize": 0.0, + "chunkedMessageRate": 0.0, + "producerId": 0, + "metadata": {}, + "address": "/127.0.0.1:35604", + "connectedSince": "2021-07-04T09:05:43.04788Z", + "clientVersion": "2.8.0", + "producerName": "standalone-2-5" + } + ], + "waitingPublishers": 0, + "subscriptions": { + "my-sub": { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0, + "msgBacklog": 0, + "backlogSize": 0, + "msgBacklogNoDelayed": 0, + "blockedSubscriptionOnUnackedMsgs": false, + "msgDelayed": 0, + "unackedMessages": 0, + "type": "Exclusive", + "activeConsumerName": "3c544f1daa", + "msgRateExpired": 0.0, + "totalMsgExpired": 0, + "lastExpireTimestamp": 0, + "lastConsumedFlowTimestamp": 1625389101290, + "lastConsumedTimestamp": 1625389546070, + "lastAckedTimestamp": 1625389546162, + "lastMarkDeleteAdvancedTimestamp": 1625389546163, + "consumers": [ + { + "msgRateOut": 1.8332950480217471, + "msgThroughputOut": 91.33142602871978, + "bytesOutCounter": 6607, + "msgOutCounter": 133, + "msgRateRedeliver": 0.0, + "chunkedMessageRate": 0.0, + "consumerName": "3c544f1daa", + "availablePermits": 867, + "unackedMessages": 0, + "avgMessagesPerEntry": 6, + 
"blockedConsumerOnUnackedMsgs": false, + "lastAckedTimestamp": 1625389546162, + "lastConsumedTimestamp": 1625389546070, + "metadata": {}, + "address": "/127.0.0.1:35472", + "connectedSince": "2021-07-04T08:58:21.287682Z", + "clientVersion": "2.8.0" + } + ], + "isDurable": true, + "isReplicated": false, + "allowOutOfOrderDelivery": false, + "consumersAfterMarkDeletePosition": {}, + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0, + "durable": true, + "replicated": false + } + }, + "replication": {}, + "deduplicationStatus": "Disabled", + "nonContiguousDeletedMessagesRanges": 0, + "nonContiguousDeletedMessagesRangesSerializedSize": 0 +} + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/standalone.md b/site2/website/versioned_docs/version-2.9.x/standalone.md new file mode 100644 index 0000000000000..6236f40514093 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/standalone.md @@ -0,0 +1,268 @@ +--- +id: standalone +title: Set up a standalone Pulsar locally +sidebar_label: "Run Pulsar locally" +original_id: standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. The standalone mode includes a Pulsar broker, the necessary ZooKeeper and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> **Pulsar in production?** +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of installing Pulsar locally. + +### System requirements + +Currently, Pulsar is available for 64-bit **macOS**, **Linux**, and **Windows**. To use Pulsar, you need to install 64-bit JRE/JDK 8 or later versions, JRE/JDK 11 is recommended. + +:::tip + +By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in `conf/pulsar_env.sh` file under `PULSAR_MEM`. 
This is extra options passed into JVM. + +::: + +:::note + +Broker is only supported on 64-bit JVM. + +::: + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar @pulsar:version@ binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:binary_release_url + + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash + +$ tar xvfz apache-pulsar-@pulsar:version@-bin.tar.gz +$ cd apache-pulsar-@pulsar:version@ + +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](https://pulsar.apache.org/tools/pulsar-admin/). +`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more. +`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) example. +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the`.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by ZooKeeper and BookKeeper. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`logs` | Logs created by the installation. 
+ +:::tip + +If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +* [Install builtin connectors (optional)](#install-builtin-connectors-optional) +* [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing bulitin connectors and tiered storage offloaders. + +::: + +### Install builtin connectors (optional) + +Since `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. +To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:connector_release_url/{connector}-@pulsar:version@.nar + + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. +For example, if you download the `pulsar-io-aerospike-@pulsar:version@.nar` connector file, enter the following commands: + +```bash + +$ mkdir connectors +$ mv pulsar-io-aerospike-@pulsar:version@.nar connectors + +$ ls connectors +pulsar-io-aerospike-@pulsar:version@.nar +... + +``` + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure `connectors` tarball is unzipped in every pulsar directory of the broker (or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. 
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +::: + +### Install tiered storage offloaders (optional) + +:::tip + +- Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +- To enable tiered storage feature, follow the instructions below; otherwise skip this section. + +::: + +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders @pulsar:version@ release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + + $ wget pulsar:offloader_release_url + + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash + +$ tar xvfz apache-pulsar-offloaders-@pulsar:version@-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-@pulsar:version@` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-@pulsar:version@/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-@pulsar:version@.nar + +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +:::note + +* If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +* If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. 
[K8S](deploy-kubernetes.md) or [DC/OS](https://dcos.io/)), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +::: + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. + +```bash + +$ bin/pulsar standalone + +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash + +21:59:29.327 [DLM-/stream/storage-OrderedScheduler-3-0] INFO org.apache.bookkeeper.stream.storage.impl.sc.StorageContainerImpl - Successfully started storage container (0). +21:59:34.576 [main] INFO org.apache.pulsar.broker.authentication.AuthenticationService - Authentication is disabled +21:59:34.576 [main] INFO org.apache.pulsar.websocket.WebSocketService - Pulsar WebSocket Service started + +``` + +:::tip + +* The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. + +::: + +You can also run the service as a background process using the `pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. 
For more information, see [Topics](concepts-messaging.md#topics). + +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. + +### Consume a message + +The following command consumes a message with the subscription name `first-subscription` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client consume my-topic -s "first-subscription" + +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:17:16.781 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully consumed + +``` + +:::tip + +As you have noticed that we do not explicitly create the `my-topic` topic, to which we consume the message. When you consume a message to a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +::: + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash + +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" + +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` + +22:21:08.693 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced + +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +:::tip + +If the service runs as a background process using the `pulsar-daemon start standalone` command, then use the `pulsar-daemon stop standalone` command to stop the service. +For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). 
+ +::: + diff --git a/site2/website/versioned_docs/version-2.9.x/tiered-storage-aliyun.md b/site2/website/versioned_docs/version-2.9.x/tiered-storage-aliyun.md new file mode 100644 index 0000000000000..2486b92df485b --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/tiered-storage-aliyun.md @@ -0,0 +1,259 @@ +--- +id: tiered-storage-aliyun +title: Use Aliyun OSS offloader with Pulsar +sidebar_label: "Aliyun OSS offloader" +original_id: tiered-storage-aliyun +--- + +This chapter guides you through every step of installing and configuring the Aliyun Object Storage Service (OSS) offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the Aliyun OSS offloader. + +### Prerequisite + +- Pulsar: 2.8.0 or later versions + +### Step + +This example uses Pulsar 2.8.0. + +1. Download the Pulsar tarball, see [here](https://pulsar.apache.org/docs/en/standalone/#install-pulsar-using-binary-release). + +2. Download and untar the Pulsar offloaders package, then copy the Pulsar offloaders as `offloaders` in the Pulsar directory, see [here](https://pulsar.apache.org/docs/en/standalone/#install-tiered-storage-offloaders-optional). + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/), [GCS](https://cloud.google.com/storage/), [Azure](https://portal.azure.com/#home), and [Aliyun OSS](https://www.aliyun.com/product/oss) for long-term storage. + + ``` + + tiered-storage-file-system-2.8.0.nar + tiered-storage-jcloud-2.8.0.nar + + ``` + + :::note + + * If you are running Pulsar in a bare-metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image. The `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. 
+ + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to Aliyun OSS, you need to configure some properties of the Aliyun OSS offload driver. + +::: + +Besides, you can also configure the Aliyun OSS offloader to run it automatically or trigger it manually. + +### Configure Aliyun OSS offloader driver + +You can configure the Aliyun OSS offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + | Required configuration | Description | Example value | + | --- | --- |--- | + | `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | aliyun-oss | + | `offloadersDirectory` | Offloader directory | offloaders | + | `managedLedgerOffloadBucket` | Bucket | pulsar-topic-offload | + | `managedLedgerOffloadServiceEndpoint` | Endpoint | http://oss-cn-hongkong.aliyuncs.com | + +- **Optional** configurations are as below. + + | Optional | Description | Example value | + | --- | --- | --- | + | `managedLedgerOffloadReadBufferSizeInBytes` | Size of block read | 1 MB | + | `managedLedgerOffloadMaxBlockSizeInBytes` | Size of block write | 64 MB | + | `managedLedgerMinLedgerRolloverTimeMinutes` | Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment. | 2 | + | `managedLedgerMaxEntriesPerLedger` | Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment. | 5000 | + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in Aliyun OSS must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +managedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Endpoint (required) + +The endpoint is the region where a bucket is located. + +:::tip + +For more information about Aliyun OSS regions and endpoints, see [International website](https://www.alibabacloud.com/help/doc-detail/31837.htm) or [Chinese website](https://help.aliyun.com/document_detail/31837.html). + +::: + + +##### Example + +This example sets the endpoint as _oss-us-west-1-internal_. + +``` + +managedLedgerOffloadServiceEndpoint=http://oss-us-west-1-internal.aliyuncs.com + +``` + +#### Authentication (required) + +To be able to access Aliyun OSS, you need to authenticate with Aliyun OSS. + +Set the environment variables `ALIYUN_OSS_ACCESS_KEY_ID` and `ALIYUN_OSS_ACCESS_KEY_SECRET` in `conf/pulsar_env.sh`. + +"export" is important so that the variables are made available in the environment of spawned processes. + +```bash + +export ALIYUN_OSS_ACCESS_KEY_ID=ABC123456789 +export ALIYUN_OSS_ACCESS_KEY_SECRET=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + +``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from Aliyun OSS in the configuration file `broker.conf` or `standalone.conf`. + +| Configuration | Description | Default value | +| --- | --- | --- | +| `managedLedgerOffloadReadBufferSizeInBytes` | Block size for each individual read when reading back data from Aliyun OSS. 
| 1 MB | +| `managedLedgerOffloadMaxBlockSizeInBytes` | Maximum size of a "part" sent during a multipart upload to Aliyun OSS. It **cannot** be smaller than 5 MB. | 64 MB | + +### Run Aliyun OSS offloader automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. + +| Threshold value | Action | +| --- | --- | +| > 0 | It triggers the offloading operation if the topic storage reaches its threshold. | +| = 0 | It causes a broker to offload data as soon as possible. | +| < 0 | It disables the automatic offloading operation. | + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace-level offload policies. Each namespace can have its own offload policy. If you want to set an offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the Aliyun OSS offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-threshold-em-). 
+ +::: + +### Run Aliyun OSS offloader manually + +For individual topics, you can trigger the Aliyun OSS offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to Aliyun OSS until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the Aliyun OSS offloader to run manually using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-em-). + + ::: + +- This example checks the Aliyun OSS offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the Aliyun OSS offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. 
+ + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + +` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-status-em-). + + ::: + diff --git a/site2/website/versioned_docs/version-2.9.x/tiered-storage-aws.md b/site2/website/versioned_docs/version-2.9.x/tiered-storage-aws.md new file mode 100644 index 0000000000000..20a6382e770cc --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/tiered-storage-aws.md @@ -0,0 +1,331 @@ +--- +id: tiered-storage-aws +title: Use AWS S3 offloader with Pulsar +sidebar_label: "AWS S3 offloader" +original_id: tiered-storage-aws +--- + +This chapter guides you through every step of installing and configuring the AWS S3 offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the AWS S3 offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or later versions + +### Step + +This example uses Pulsar 2.5.1. + +1. 
Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download from the Pulsar [downloads page](https://pulsar.apache.org/download) + + * Use [wget](https://www.gnu.org/software/wget): + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/) and [GCS](https://cloud.google.com/storage/) for long term storage. + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to AWS S3, you need to configure some properties of the AWS S3 offload driver. + +::: + +Besides, you can also configure the AWS S3 offloader to run it automatically or trigger it manually. 
+ +### Configure AWS S3 offloader driver + +You can configure the AWS S3 offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + Required configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive.

    **Note**: there is a third driver type, S3, which is identical to AWS S3, except that S3 requires you to specify an endpoint URL using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if you use an S3-compatible data store other than AWS S3. | aws-s3 + `offloadersDirectory` | Offloader directory | offloaders + `s3ManagedLedgerOffloadBucket` | Bucket | pulsar-topic-offload + +- **Optional** configurations are as below. + + Optional | Description | Example value + |---|---|--- + `s3ManagedLedgerOffloadRegion` | Bucket region

    **Note**: before specifying a value for this parameter, you need to set the following configurations. Otherwise, you might get an error.

    - Set [`s3ManagedLedgerOffloadServiceEndpoint`](https://docs.aws.amazon.com/general/latest/gr/s3.html).

    Example
    `s3ManagedLedgerOffloadServiceEndpoint=https://s3.YOUR_REGION.amazonaws.com`

    - Grant `GetBucketLocation` permission to a user.

    For how to grant `GetBucketLocation` permission to a user, see [here](https://docs.aws.amazon.com/AmazonS3/latest/dev/using-with-s3-actions.html#using-with-s3-actions-related-to-buckets).| eu-west-3 + `s3ManagedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `s3ManagedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in AWS S3 must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directories and folders, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +s3ManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Bucket region + +A bucket region is a region where a bucket is located. If a bucket region is not specified, the **default** region (`US East (N. Virginia)`) is used. + +:::tip + +For more information about AWS regions and endpoints, see [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). + +::: + + +##### Example + +This example sets the bucket region as _eu-west-3_. + +``` + +s3ManagedLedgerOffloadRegion=eu-west-3 + +``` + +#### Authentication (required) + +To be able to access AWS S3, you need to authenticate with AWS S3. + +Pulsar does not provide any direct methods of configuring authentication for AWS S3, +but relies on the mechanisms supported by the [DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). + +Once you have created a set of credentials in the AWS IAM console, you can configure credentials using one of the following methods. + +* Use EC2 instance metadata credentials. + + If you are on an AWS instance with an instance profile that provides credentials, Pulsar uses these credentials if no other mechanism is provided. + +* Set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` in `conf/pulsar_env.sh`. + + "export" is important so that the variables are made available in the environment of spawned processes. 
+ + ```bash + + export AWS_ACCESS_KEY_ID=ABC123456789 + export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +* Add the Java system properties `aws.accessKeyId` and `aws.secretKey` to `PULSAR_EXTRA_OPTS` in `conf/pulsar_env.sh`. + + ```bash + + PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacityPerThread=4096" + + ``` + +* Set the access credentials in `~/.aws/credentials`. + + ```conf + + [default] + aws_access_key_id=ABC123456789 + aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +* Assume an IAM role. + + This example uses the `DefaultAWSCredentialsProviderChain` for assuming this role. + + The broker must be rebooted for credentials specified in `pulsar_env` to take effect. + + ```conf + + s3ManagedLedgerOffloadRole= + s3ManagedLedgerOffloadRoleSessionName=pulsar-s3-offload + + ``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from AWS S3 in the configuration file `broker.conf` or `standalone.conf`. + +Configuration|Description|Default value +|---|---|--- +`s3ManagedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from AWS S3.|1 MB +`s3ManagedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to AWS S3. It **cannot** be smaller than 5 MB. |64 MB + +### Configure AWS S3 offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. + +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. 
+= 0|It causes a broker to offload data as soon as possible. +< 0 |It disables the automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, the offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace-level offload policies. Each namespace can have its own offload policy. If you want to set an offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the AWS S3 offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-threshold-em-). + +::: + +### Configure AWS S3 offloader to run manually + +For individual topics, you can trigger the AWS S3 offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to AWS S3 until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the AWS S3 offloader to run manually using pulsar-admin. 
+ + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-em-). + + ::: + +- This example checks the AWS S3 offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the AWS S3 offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + +` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-status-em-). + + ::: + +## Tutorial + +For the complete and step-by-step instructions on how to use the AWS S3 offloader with Pulsar, see [here](https://hub.streamnative.io/offloaders/aws-s3/2.5.1#usage). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/tiered-storage-azure.md b/site2/website/versioned_docs/version-2.9.x/tiered-storage-azure.md new file mode 100644 index 0000000000000..5923a33147135 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/tiered-storage-azure.md @@ -0,0 +1,266 @@ +--- +id: tiered-storage-azure +title: Use Azure BlobStore offloader with Pulsar +sidebar_label: "Azure BlobStore offloader" +original_id: tiered-storage-azure +--- + +This chapter guides you through every step of installing and configuring the Azure BlobStore offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the Azure BlobStore offloader. + +### Prerequisite + +- Pulsar: 2.6.2 or later versions + +### Step + +This example uses Pulsar 2.6.2. + +1. 
Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.6.2/apache-pulsar-2.6.2-bin.tar.gz) + + * Download from the Pulsar [downloads page](https://pulsar.apache.org/download) + + * Use [wget](https://www.gnu.org/software/wget): + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.6.2/apache-pulsar-2.6.2-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.6.2/apache-pulsar-offloaders-2.6.2-bin.tar.gz + tar xvfz apache-pulsar-offloaders-2.6.2-bin.tar.gz + + ``` + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.6.2/offloaders apache-pulsar-2.6.2/offloaders + + ls offloaders + + ``` + + **Output** + + As shown from the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support [AWS S3](https://aws.amazon.com/s3/), [GCS](https://cloud.google.com/storage/) and [Azure](https://portal.azure.com/#home) for long term storage. + + ``` + + tiered-storage-file-system-2.6.2.nar + tiered-storage-jcloud-2.6.2.nar + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to Azure BlobStore, you need to configure some properties of the Azure BlobStore offload driver. + +::: + +Besides, you can also configure the Azure BlobStore offloader to run it automatically or trigger it manually. 
+ +### Configure Azure BlobStore offloader driver + +You can configure the Azure BlobStore offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + Required configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name | azureblob + `offloadersDirectory` | Offloader directory | offloaders + `managedLedgerOffloadBucket` | Bucket | pulsar-topic-offload + +- **Optional** configurations are as below. + + Optional | Description | Example value + |---|---|--- + `managedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `managedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in Azure BlobStore must be contained in a bucket. You can use a bucket to organize your data and control access to your data, but unlike directories and folders, you cannot nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +managedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Authentication (required) + +To be able to access Azure BlobStore, you need to authenticate with Azure BlobStore. + +* Set the environment variables `AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` in `conf/pulsar_env.sh`. + + "export" is important so that the variables are made available in the environment of spawned processes. + + ```bash + + export AZURE_STORAGE_ACCOUNT=ABC123456789 + export AZURE_STORAGE_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c + + ``` + +#### Size of block read/write + +You can configure the size of a request sent to or read from Azure BlobStore in the configuration file `broker.conf` or `standalone.conf`. + +Configuration|Description|Default value +|---|---|--- +`managedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from Azure BlobStore.|1 MB +`managedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to Azure BlobStore. It **cannot** be smaller than 5 MB. |64 MB + +### Configure Azure BlobStore offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offloading operation is triggered automatically. 
+ +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the command [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the Azure BlobStore offloader threshold size to 10 MB using pulsar-admin. + +```bash + +bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-threshold-em-). + +::: + +### Configure Azure BlobStore offloader to run manually + +For individual topics, you can trigger Azure BlobStore offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + + To trigger it via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. 
If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to Azure BlobStore until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the Azure BlobStore offloader to run manually using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-em-). + + ::: + +- This example checks the Azure BlobStore offloader status using pulsar-admin. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the Azure BlobStore offloader to complete the job, add the `-w` flag. + + ```bash + + bin/pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + bin/pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: + + ``` + +` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, and default values, see [here](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-offload-status-em-). 
+ + ::: + diff --git a/site2/website/versioned_docs/version-2.9.x/tiered-storage-filesystem.md b/site2/website/versioned_docs/version-2.9.x/tiered-storage-filesystem.md new file mode 100644 index 0000000000000..a5844d22fb5db --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/tiered-storage-filesystem.md @@ -0,0 +1,317 @@ +--- +id: tiered-storage-filesystem +title: Use filesystem offloader with Pulsar +sidebar_label: "Filesystem offloader" +original_id: tiered-storage-filesystem +--- + +This chapter guides you through every step of installing and configuring the filesystem offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the filesystem offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or later versions + +- Hadoop: 3.x.x + +### Step + +This example uses Pulsar 2.5.1. + +1. Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download from the Pulsar [download page](https://pulsar.apache.org/download) + + * Use [wget](https://www.gnu.org/software/wget) + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8S and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +3. 
Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. + + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8s and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +## Configuration + +:::note + +Before offloading data from BookKeeper to filesystem, you need to configure some properties of the filesystem offloader driver. + +::: + +Besides, you can also configure the filesystem offloader to run it automatically or trigger it manually. + +### Configure filesystem offloader driver + +You can configure filesystem offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + Required configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver` | Offloader driver name, which is case-insensitive. | filesystem + `fileSystemURI` | Connection address | hdfs://127.0.0.1:9000 + `fileSystemProfilePath` | Hadoop profile path | ../conf/filesystem_offload_core_site.xml + +- **Optional** configurations are as below. + + Optional configuration| Description | Example value + |---|---|--- + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic

    **Note**: it is not recommended that you set this configuration in the production environment.|2 + `managedLedgerMaxEntriesPerLedger`|Maximum number of entries to append to a ledger before triggering a rollover.

    **Note**: it is not recommended that you set this configuration in the production environment.|5000 + +#### Offloader driver (required) + +Offloader driver name, which is case-insensitive. + +This example sets the offloader driver name as _filesystem_. + +```conf + +managedLedgerOffloadDriver=filesystem + +``` + +#### Connection address (required) + +Connection address is the URI to access the default Hadoop distributed file system. + +##### Example + +This example sets the connection address as _hdfs://127.0.0.1:9000_. + +```conf + +fileSystemURI=hdfs://127.0.0.1:9000 + +``` + +#### Hadoop profile path (required) + +The configuration file is stored in the Hadoop profile path. It contains various settings for Hadoop performance tuning. + +##### Example + +This example sets the Hadoop profile path as _../conf/filesystem_offload_core_site.xml_. + +```conf + +fileSystemProfilePath=../conf/filesystem_offload_core_site.xml + +``` + +You can set the following configurations in the _filesystem_offload_core_site.xml_ file. + +``` + + + fs.defaultFS + + + + + hadoop.tmp.dir + pulsar + + + + io.file.buffer.size + 4096 + + + + io.seqfile.compress.blocksize + 1000000 + + + + io.seqfile.compression.type + BLOCK + + + + io.map.index.interval + 128 + + +``` + +:::tip + +For more information about the Hadoop HDFS, see [here](https://hadoop.apache.org/docs/current/). + +::: + +### Configure filesystem offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offload operation is triggered automatically. + +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. 
+ +Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +#### Example + +This example sets the filesystem offloader threshold size to 10 MB using pulsar-admin. + +```bash + +pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#set-offload-threshold). + +::: + +### Configure filesystem offloader to run manually + +For individual topics, you can trigger filesystem offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). + +To trigger via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are offloaded to the filesystem until the threshold is no longer exceeded. Older segments are offloaded first. + +#### Example + +- This example triggers the filesystem offloader to run manually using pulsar-admin. + + ```bash + + pulsar-admin topics offload --size-threshold 10M persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload). + + ::: + +- This example checks filesystem offloader status using pulsar-admin. 
+ + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for the filesystem to complete the job, add the `-w` flag. + + ```bash + + pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in the offloading operation, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + +` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload-status). + + ::: + +## Tutorial + +For the complete and step-by-step instructions on how to use the filesystem offloader with Pulsar, see [here](https://hub.streamnative.io/offloaders/filesystem/2.5.1). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/tiered-storage-gcs.md b/site2/website/versioned_docs/version-2.9.x/tiered-storage-gcs.md new file mode 100644 index 0000000000000..afb1e9a10081c --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/tiered-storage-gcs.md @@ -0,0 +1,321 @@ +--- +id: tiered-storage-gcs +title: Use GCS offloader with Pulsar +sidebar_label: "GCS offloader" +original_id: tiered-storage-gcs +--- + +This chapter guides you through every step of installing and configuring the GCS offloader and using it with Pulsar. + +## Installation + +Follow the steps below to install the GCS offloader. + +### Prerequisite + +- Pulsar: 2.4.2 or later versions + +### Step + +This example uses Pulsar 2.5.1. + +1. Download the Pulsar tarball using one of the following ways: + + * Download from the [Apache mirror](https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz) + + * Download from the Pulsar [download page](https://pulsar.apache.org/download) + + * Use [wget](https://www.gnu.org/software/wget) + + ```shell + + wget https://archive.apache.org/dist/pulsar/pulsar-2.5.1/apache-pulsar-2.5.1-bin.tar.gz + + ``` + +2. Download and untar the Pulsar offloaders package. + + ```bash + + wget https://downloads.apache.org/pulsar/pulsar-2.5.1/apache-pulsar-offloaders-2.5.1-bin.tar.gz + + tar xvfz apache-pulsar-offloaders-2.5.1-bin.tar.gz + + ``` + + :::note + + * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's Pulsar directory. + * If you are running Pulsar in Docker or deploying Pulsar using a Docker image (such as K8S and DCOS), you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + + ::: + +3. Copy the Pulsar offloaders as `offloaders` in the Pulsar directory. 
+ + ``` + + mv apache-pulsar-offloaders-2.5.1/offloaders apache-pulsar-2.5.1/offloaders + + ls offloaders + + ``` + + **Output** + + As shown in the output, Pulsar uses [Apache jclouds](https://jclouds.apache.org) to support GCS and AWS S3 for long term storage. + + ``` + + tiered-storage-file-system-2.5.1.nar + tiered-storage-jcloud-2.5.1.nar + + ``` + +## Configuration + +:::note + +Before offloading data from BookKeeper to GCS, you need to configure some properties of the GCS offloader driver. + +::: + +Besides, you can also configure the GCS offloader to run it automatically or trigger it manually. + +### Configure GCS offloader driver + +You can configure GCS offloader driver in the configuration file `broker.conf` or `standalone.conf`. + +- **Required** configurations are as below. + + **Required** configuration | Description | Example value + |---|---|--- + `managedLedgerOffloadDriver`|Offloader driver name, which is case-insensitive.|google-cloud-storage + `offloadersDirectory`|Offloader directory|offloaders + `gcsManagedLedgerOffloadBucket`|Bucket|pulsar-topic-offload + `gcsManagedLedgerOffloadRegion`|Bucket region|europe-west3 + `gcsManagedLedgerOffloadServiceAccountKeyFile`|Authentication |/Users/user-name/Downloads/project-804d5e6a6f33.json + +- **Optional** configurations are as below. + + Optional configuration|Description|Example value + |---|---|--- + `gcsManagedLedgerOffloadReadBufferSizeInBytes`|Size of block read|1 MB + `gcsManagedLedgerOffloadMaxBlockSizeInBytes`|Size of block write|64 MB + `managedLedgerMinLedgerRolloverTimeMinutes`|Minimum time between ledger rollover for a topic.|2 + `managedLedgerMaxEntriesPerLedger`|The max number of entries to append to a ledger before triggering a rollover.|5000 + +#### Bucket (required) + +A bucket is a basic container that holds your data. Everything you store in GCS **must** be contained in a bucket. 
You can use a bucket to organize your data and control access to your data, but unlike directory and folder, you can not nest a bucket. + +##### Example + +This example names the bucket as _pulsar-topic-offload_. + +```conf + +gcsManagedLedgerOffloadBucket=pulsar-topic-offload + +``` + +#### Bucket region (required) + +Bucket region is the region where a bucket is located. If a bucket region is not specified, the **default** region (`us multi-regional location`) is used. + +:::tip + +For more information about bucket location, see [here](https://cloud.google.com/storage/docs/bucket-locations). + +::: + +##### Example + +This example sets the bucket region as _europe-west3_. + +``` + +gcsManagedLedgerOffloadRegion=europe-west3 + +``` + +#### Authentication (required) + +To enable a broker access GCS, you need to configure `gcsManagedLedgerOffloadServiceAccountKeyFile` in the configuration file `broker.conf`. + +`gcsManagedLedgerOffloadServiceAccountKeyFile` is +a JSON file, containing GCS credentials of a service account. + +##### Example + +To generate service account credentials or view the public credentials that you've already generated, follow the following steps. + +1. Navigate to the [Service accounts page](https://console.developers.google.com/iam-admin/serviceaccounts). + +2. Select a project or create a new one. + +3. Click **Create service account**. + +4. In the **Create service account** window, type a name for the service account and select **Furnish a new private key**. + + If you want to [grant G Suite domain-wide authority](https://developers.google.com/identity/protocols/OAuth2ServiceAccount#delegatingauthority) to the service account, select **Enable G Suite Domain-wide Delegation**. + +5. Click **Create**. + + :::note + + Make sure the service account you create has permission to operate GCS, you need to assign **Storage Admin** permission to your service account [here](https://cloud.google.com/storage/docs/access-control/iam). + + ::: + +6. 
You can get the following information and set this in `broker.conf`. + + ```conf + + gcsManagedLedgerOffloadServiceAccountKeyFile="/Users/user-name/Downloads/project-804d5e6a6f33.json" + + ``` + + :::tip + + - For more information about how to create `gcsManagedLedgerOffloadServiceAccountKeyFile`, see [here](https://support.google.com/googleapi/answer/6158849). + - For more information about Google Cloud IAM, see [here](https://cloud.google.com/storage/docs/access-control/iam). + + ::: + +#### Size of block read/write + +You can configure the size of a request sent to or read from GCS in the configuration file `broker.conf`. + +Configuration|Description +|---|--- +`gcsManagedLedgerOffloadReadBufferSizeInBytes`|Block size for each individual read when reading back data from GCS.

    The **default** value is 1 MB. +`gcsManagedLedgerOffloadMaxBlockSizeInBytes`|Maximum size of a "part" sent during a multipart upload to GCS.

    It **cannot** be smaller than 5 MB.

    The **default** value is 64 MB. + +### Configure GCS offloader to run automatically + +Namespace policy can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that a topic has stored on a Pulsar cluster. Once the topic reaches the threshold, an offload operation is triggered automatically. + +Threshold value|Action +|---|--- +> 0 | It triggers the offloading operation if the topic storage reaches its threshold. += 0|It causes a broker to offload data as soon as possible. +< 0 |It disables automatic offloading operation. + +Automatic offloading runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offloader does not work until the current segment is full. + +You can configure the threshold size using CLI tools, such as pulsar-admin. + +The offload configurations in `broker.conf` and `standalone.conf` are used for the namespaces that do not have namespace level offload policies. Each namespace can have its own offload policy. If you want to set offload policy for each namespace, use the [`pulsar-admin namespaces set-offload-policies options`](https://pulsar.apache.org/tools/pulsar-admin/2.6.0-SNAPSHOT/#-em-set-offload-policies-em-) command. + +#### Example + +This example sets the GCS offloader threshold size to 10 MB using pulsar-admin. + +```bash + +pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace + +``` + +:::tip + +For more information about the `pulsar-admin namespaces set-offload-threshold options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#set-offload-threshold). + +::: + +### Configure GCS offloader to run manually + +For individual topics, you can trigger GCS offloader manually using one of the following methods: + +- Use REST endpoint. + +- Use CLI tools (such as pulsar-admin). 
+ + To trigger the GCS via CLI tools, you need to specify the maximum amount of data (threshold) that should be retained on a Pulsar cluster for a topic. If the size of the topic data on the Pulsar cluster exceeds this threshold, segments from the topic are moved to GCS until the threshold is no longer exceeded. Older segments are moved first. + +#### Example + +- This example triggers the GCS offloader to run manually using pulsar-admin with the command `pulsar-admin topics offload (topic-name) (threshold)`. + + ```bash + + pulsar-admin topics offload persistent://my-tenant/my-namespace/topic1 10M + + ``` + + **Output** + + ```bash + + Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 + + ``` + + :::tip + + For more information about the `pulsar-admin topics offload options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload). + + ::: + +- This example checks the GCS offloader status using pulsar-admin with the command `pulsar-admin topics offload-status options`. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ```bash + + Offload is currently running + + ``` + + To wait for GCS to complete the job, add the `-w` flag. + + ```bash + + pulsar-admin topics offload-status -w persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Offload was a success + + ``` + + If there is an error in offloading, the error is propagated to the `pulsar-admin topics offload-status` command. + + ```bash + + pulsar-admin topics offload-status persistent://my-tenant/my-namespace/topic1 + + ``` + + **Output** + + ``` + + Error in offload + null + + Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= + + ``` + +` + + :::tip + + For more information about the `pulsar-admin topics offload-status options` command, including flags, descriptions, default values, and shorthands, see [here](reference-pulsar-admin.md#offload-status). + + ::: + +## Tutorial + +For the complete and step-by-step instructions on how to use the GCS offloader with Pulsar, see [here](https://hub.streamnative.io/offloaders/gcs/2.5.1#usage). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/tiered-storage-overview.md b/site2/website/versioned_docs/version-2.9.x/tiered-storage-overview.md new file mode 100644 index 0000000000000..c635034f463b4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/tiered-storage-overview.md @@ -0,0 +1,52 @@ +--- +id: tiered-storage-overview +title: Overview of tiered storage +sidebar_label: "Overview" +original_id: tiered-storage-overview +--- + +Pulsar's **Tiered Storage** feature allows older backlog data to be moved from BookKeeper to long term and cheaper storage, while still allowing clients to access the backlog as if nothing has changed. + +* Tiered storage uses [Apache jclouds](https://jclouds.apache.org) to support [Amazon S3](https://aws.amazon.com/s3/) and [GCS (Google Cloud Storage)](https://cloud.google.com/storage/) for long term storage. + + With jclouds, it is easy to add support for more [cloud storage providers](https://jclouds.apache.org/reference/providers/#blobstore-providers) in the future. + + :::tip + + - For more information about how to use the AWS S3 offloader with Pulsar, see [here](tiered-storage-aws.md). 
+ + - For more information about how to use the GCS offloader with Pulsar, see [here](tiered-storage-gcs.md). + + ::: + +* Tiered storage uses [Apache Hadoop](http://hadoop.apache.org/) to support filesystems for long term storage. + + With Hadoop, it is easy to add support for more filesystems in the future. + + :::tip + + For more information about how to use the filesystem offloader with Pulsar, see [here](tiered-storage-filesystem.md). + + ::: + +## When to use tiered storage? + +Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. + +For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm, you can rerun it against your full user history. + +## How does tiered storage work? + +A topic in Pulsar is backed by a **log**, known as a **managed ledger**. This log is composed of an ordered list of segments. Pulsar only writes to the final segment of the log. All previous segments are sealed. The data within the segment is immutable. This is known as a **segment oriented architecture**. + +![Tiered storage](/assets/pulsar-tiered-storage.png "Tiered Storage") + +The tiered storage offloading mechanism takes advantage of segment oriented architecture. When offloading is requested, the segments of the log are copied one-by-one to tiered storage. All segments of the log (apart from the current segment) written to tiered storage can be offloaded. + +Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper, it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. 
+ +Before offloading ledgers to long term storage, you need to configure buckets, credentials, and other properties for the cloud storage service. Additionally, Pulsar uses multi-part objects to upload the segment data and brokers may crash while uploading the data. It is recommended that you add a life cycle rule for your bucket to expire incomplete multi-part upload after a day or two days to avoid getting charged for incomplete uploads. Moreover, you can trigger the offloading operation manually (via REST API or CLI) or automatically (via CLI). + +After offloading ledgers to long term storage, you can still query data in the offloaded ledgers with Pulsar SQL. + +For more information about tiered storage for Pulsar topics, see [here](https://github.com/apache/pulsar/wiki/PIP-17:-Tiered-storage-for-Pulsar-topics). diff --git a/site2/website/versioned_docs/version-2.9.x/transaction-api.md b/site2/website/versioned_docs/version-2.9.x/transaction-api.md new file mode 100644 index 0000000000000..ecbd0da12c786 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/transaction-api.md @@ -0,0 +1,172 @@ +--- +id: transactions-api +title: Transactions API +sidebar_label: "Transactions API" +original_id: transactions-api +--- + +All messages in a transaction are available only to consumers after the transaction has been committed. If a transaction has been aborted, all the writes and acknowledgments in this transaction roll back. + +## Prerequisites +1. To enable transactions in Pulsar, you need to configure the parameter in the `broker.conf` file. + +``` + +transactionCoordinatorEnabled=true + +``` + +2. Initialize transaction coordinator metadata, so the transaction coordinators can leverage advantages of the partitioned topic, such as load balance. + +``` + +bin/pulsar initialize-transaction-coordinator-metadata -cs 127.0.0.1:2181 -c standalone + +``` + +After initializing transaction coordinator metadata, you can use the transactions API. 
The following APIs are available. + +## Initialize Pulsar client + +You can enable transaction for transaction client and initialize transaction coordinator client. + +``` + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .enableTransaction(true) + .build(); + +``` + +## Start transactions +You can start transaction in the following way. + +``` + +Transaction txn = pulsarClient + .newTransaction() + .withTransactionTimeout(5, TimeUnit.MINUTES) + .build() + .get(); + +``` + +## Produce transaction messages + +A transaction parameter is required when producing new transaction messages. The semantic of the transaction messages in Pulsar is `read-committed`, so the consumer cannot receive the ongoing transaction messages before the transaction is committed. + +``` + +producer.newMessage(txn).value("Hello Pulsar Transaction".getBytes()).sendAsync(); + +``` + +## Acknowledge the messages with the transaction + +The transaction acknowledgement requires a transaction parameter. The transaction acknowledgement marks the messages state to pending-ack state. When the transaction is committed, the pending-ack state becomes ack state. If the transaction is aborted, the pending-ack state becomes unack state. + +``` + +Message message = consumer.receive(); +consumer.acknowledgeAsync(message.getMessageId(), txn); + +``` + +## Commit transactions + +When the transaction is committed, consumers receive the transaction messages and the pending-ack state becomes ack state. + +``` + +txn.commit().get(); + +``` + +## Abort transaction + +When the transaction is aborted, the transaction acknowledgement is canceled and the pending-ack messages are redelivered. + +``` + +txn.abort().get(); + +``` + +### Example +The following example shows how messages are processed in transaction. 
+ +``` + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl(getPulsarServiceList().get(0).getBrokerServiceUrl()) + .statsInterval(0, TimeUnit.SECONDS) + .enableTransaction(true) + .build(); + +String sourceTopic = "public/default/source-topic"; +String sinkTopic = "public/default/sink-topic"; + +Producer sourceProducer = pulsarClient + .newProducer(Schema.STRING) + .topic(sourceTopic) + .create(); +sourceProducer.newMessage().value("hello pulsar transaction").sendAsync(); + +Consumer sourceConsumer = pulsarClient + .newConsumer(Schema.STRING) + .topic(sourceTopic) + .subscriptionName("test") + .subscriptionType(SubscriptionType.Shared) + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscribe(); + +Producer sinkProducer = pulsarClient + .newProducer(Schema.STRING) + .topic(sinkTopic) + .sendTimeout(0, TimeUnit.MILLISECONDS) + .create(); + +Transaction txn = pulsarClient + .newTransaction() + .withTransactionTimeout(5, TimeUnit.MINUTES) + .build() + .get(); + +// source message acknowledgement and sink message produce belong to one transaction, +// they are combined into an atomic operation. +Message message = sourceConsumer.receive(); +sourceConsumer.acknowledgeAsync(message.getMessageId(), txn); +sinkProducer.newMessage(txn).value("sink data").sendAsync(); + +txn.commit().get(); + +``` + +## Enable batch messages in transactions + +To enable batch messages in transactions, you need to enable the batch index acknowledgement feature. The transaction acks check whether the batch index acknowledgement conflicts. + +To enable batch index acknowledgement, you need to set `acknowledgmentAtBatchIndexLevelEnabled` to `true` in the `broker.conf` or `standalone.conf` file. + +``` + +acknowledgmentAtBatchIndexLevelEnabled=true + +``` + +And then you need to call the `enableBatchIndexAcknowledgment(true)` method in the consumer builder. 
+ +``` + +Consumer sinkConsumer = pulsarClient + .newConsumer() + .topic(transferTopic) + .subscriptionName("sink-topic") + .subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscriptionType(SubscriptionType.Shared) + .enableBatchIndexAcknowledgment(true) // enable batch index acknowledgement + .subscribe(); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/transaction-guarantee.md b/site2/website/versioned_docs/version-2.9.x/transaction-guarantee.md new file mode 100644 index 0000000000000..9db2d254e159f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/transaction-guarantee.md @@ -0,0 +1,17 @@ +--- +id: transactions-guarantee +title: Transactions Guarantee +sidebar_label: "Transactions Guarantee" +original_id: transactions-guarantee +--- + +Pulsar transactions support the following guarantee. + +## Atomic multi-partition writes and multi-subscription acknowledges +Transactions enable atomic writes to multiple topics and partitions. A batch of messages in a transaction can be received from, produced to, and acknowledged by many partitions. All the operations involved in a transaction succeed or fail as a single unit. + +## Read transactional message +All the messages in a transaction are available to consumers only after the transaction is committed. + +## Acknowledge transactional message +A message is acknowledged successfully only once by a consumer under the subscription when acknowledging the message with the transaction ID. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/txn-how.md b/site2/website/versioned_docs/version-2.9.x/txn-how.md new file mode 100644 index 0000000000000..add072448aeb3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/txn-how.md @@ -0,0 +1,151 @@ +--- +id: txn-how +title: How transactions work? +sidebar_label: "How transactions work?" 
+original_id: txn-how +--- + +This section describes transaction components and how the components work together. For the complete design details, see [PIP-31: Transactional Streaming](https://docs.google.com/document/d/145VYp09JKTw9jAT-7yNyFU255FptB2_B2Fye100ZXDI/edit#heading=h.bm5ainqxosrx). + +## Key concept + +It is important to know the following key concepts, which is a prerequisite for understanding how transactions work. + +### Transaction coordinator + +The transaction coordinator (TC) is a module running inside a Pulsar broker. + +* It maintains the entire life cycle of transactions and prevents a transaction from getting into an incorrect status. + +* It handles transaction timeout, and ensures that the transaction is aborted after a transaction timeout. + +### Transaction log + +All the transaction metadata persists in the transaction log. The transaction log is backed by a Pulsar topic. If the transaction coordinator crashes, it can restore the transaction metadata from the transaction log. + +The transaction log stores the transaction status rather than actual messages in the transaction (the actual messages are stored in the actual topic partitions). + +### Transaction buffer + +Messages produced to a topic partition within a transaction are stored in the transaction buffer (TB) of that topic partition. The messages in the transaction buffer are not visible to consumers until the transactions are committed. The messages in the transaction buffer are discarded when the transactions are aborted. + +Transaction buffer stores all ongoing and aborted transactions in memory. All messages are sent to the actual partitioned Pulsar topics. After transactions are committed, the messages in the transaction buffer are materialized (visible) to consumers. When the transactions are aborted, the messages in the transaction buffer are discarded. + +### Transaction ID + +Transaction ID (TxnID) identifies a unique transaction in Pulsar. The transaction ID is 128-bit. 
The highest 16 bits are reserved for the ID of the transaction coordinator, and the remaining bits are used for monotonically increasing numbers in each transaction coordinator. It is easy to locate the transaction crash with the TxnID. + +### Pending acknowledge state + +Pending acknowledge state maintains message acknowledgments within a transaction before a transaction completes. If a message is in the pending acknowledge state, the message cannot be acknowledged by other transactions until the message is removed from the pending acknowledge state. + +The pending acknowledge state is persisted to the pending acknowledge log (cursor ledger). A new broker can restore the state from the pending acknowledge log to ensure the acknowledgement is not lost. + +## Data flow + +At a high level, the data flow can be split into several steps: + +1. Begin a transaction. + +2. Publish messages with a transaction. + +3. Acknowledge messages with a transaction. + +4. End a transaction. + +To help you debug or tune the transaction for better performance, review the following diagrams and descriptions. + +### 1. Begin a transaction + +Before introducing the transaction in Pulsar, a producer is created and then messages are sent to brokers and stored in data logs. + +![](/assets/txn-3.png) + +Let’s walk through the steps for _beginning a transaction_. + +| Step | Description | +| --- | --- | +| 1.1 | The first step is that the Pulsar client finds the transaction coordinator. | +| 1.2 | The transaction coordinator allocates a transaction ID for the transaction. In the transaction log, the transaction is logged with its transaction ID and status (OPEN), which ensures the transaction status is persisted regardless of transaction coordinator crashes. | +| 1.3 | The transaction log sends the result of persisting the transaction ID to the transaction coordinator. 
| +| 1.4 | After the transaction status entry is logged, the transaction coordinator brings the transaction ID back to the Pulsar client. | + +### 2. Publish messages with a transaction + +In this stage, the Pulsar client enters a transaction loop, repeating the `consume-process-produce` operation for all the messages that comprise the transaction. This is a long phase and is potentially composed of multiple produce and acknowledgement requests. + +![](/assets/txn-4.png) + +Let’s walk through the steps for _publishing messages with a transaction_. + +| Step | Description | +| --- | --- | +| 2.1.1 | Before the Pulsar client produces messages to a new topic partition, it sends a request to the transaction coordinator to add the partition to the transaction. | +| 2.1.2 | The transaction coordinator logs the partition changes of the transaction into the transaction log for durability, which ensures the transaction coordinator knows all the partitions that a transaction is handling. The transaction coordinator can commit or abort changes on each partition at the end-partition phase. | +| 2.1.3 | The transaction log sends the result of logging the new partition (used for producing messages) to the transaction coordinator. | +| 2.1.4 | The transaction coordinator sends the result of adding a new produced partition to the transaction. | +| 2.2.1 | The Pulsar client starts producing messages to partitions. The flow of this part is the same as the normal flow of producing messages except that the batch of messages produced by a transaction contains transaction IDs. | +| 2.2.2 | The broker writes messages to a partition. | + +### 3. Acknowledge messages with a transaction + +In this phase, the Pulsar client sends a request to the transaction coordinator and a new subscription is acknowledged as a part of a transaction. + +![](/assets/txn-5.png) + +Let’s walk through the steps for _acknowledging messages with a transaction_. 
+ +| Step | Description | +| --- | --- | +| 3.1.1 | The Pulsar client sends a request to add an acknowledged subscription to the transaction coordinator. | +| 3.1.2 | The transaction coordinator logs the addition of subscription, which ensures that it knows all subscriptions handled by a transaction and can commit or abort changes on each subscription at the end phase. | +| 3.1.3 | The transaction log sends the result of logging the new partition (used for acknowledging messages) to the transaction coordinator. | +| 3.1.4 | The transaction coordinator sends the result of adding the new acknowledged partition to the transaction. | +| 3.2 | The Pulsar client acknowledges messages on the subscription. The flow of this part is the same as the normal flow of acknowledging messages except that the acknowledged request carries a transaction ID. | +| 3.3 | The broker receiving the acknowledgement request checks if the acknowledgment belongs to a transaction or not. | + +### 4. End a transaction + +At the end of a transaction, the Pulsar client decides to commit or abort the transaction. The transaction can be aborted when a conflict is detected on acknowledging messages. + +#### 4.1 End transaction request + +When the Pulsar client finishes a transaction, it issues an end transaction request. + +![](/assets/txn-6.png) + +Let’s walk through the steps for _ending the transaction_. + +| Step | Description | +| --- | --- | +| 4.1.1 | The Pulsar client issues an end transaction request (with a field indicating whether the transaction is to be committed or aborted) to the transaction coordinator. | +| 4.1.2 | The transaction coordinator writes a COMMITTING or ABORTING message to its transaction log. | +| 4.1.3 | The transaction log sends the result of logging the committing or aborting status. | + +#### 4.2 Finalize a transaction + +The transaction coordinator starts the process of committing or aborting messages to all the partitions involved in this transaction. 
+ +![](/assets/txn-7.png) + +Let’s walk through the steps for _finalizing a transaction_. + +| Step | Description | +| --- | --- | +| 4.2.1 | The transaction coordinator commits transactions on subscriptions and commits transactions on partitions at the same time. | +| 4.2.2 | The broker (produce) writes produced committed markers to the actual partitions. At the same time, the broker (ack) writes acked committed marks to the subscription pending ack partitions. | +| 4.2.3 | The data log sends the result of writing produced committed marks to the broker. At the same time, pending ack data log sends the result of writing acked committed marks to the broker. The cursor moves to the next position. | + +#### 4.3 Mark a transaction as COMMITTED or ABORTED + +The transaction coordinator writes the final transaction status to the transaction log to complete the transaction. + +![](/assets/txn-8.png) + +Let’s walk through the steps for _marking a transaction as COMMITTED or ABORTED_. + +| Step | Description | +| --- | --- | +| 4.3.1 | After all produced messages and acknowledgements to all partitions involved in this transaction have been successfully committed or aborted, the transaction coordinator writes the final COMMITTED or ABORTED transaction status messages to its transaction log, indicating that the transaction is complete. All the messages associated with the transaction in its transaction log can be safely removed. | +| 4.3.2 | The transaction log sends the result of the committed transaction to the transaction coordinator. | +| 4.3.3 | The transaction coordinator sends the result of the committed transaction to the Pulsar client. 
| diff --git a/site2/website/versioned_docs/version-2.9.x/txn-monitor.md b/site2/website/versioned_docs/version-2.9.x/txn-monitor.md new file mode 100644 index 0000000000000..5b50953772d09 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/txn-monitor.md @@ -0,0 +1,10 @@ +--- +id: txn-monitor +title: How to monitor transactions? +sidebar_label: "How to monitor transactions?" +original_id: txn-monitor +--- + +You can monitor the status of the transactions in Prometheus and Grafana using the [transaction metrics](https://pulsar.apache.org/docs/en/next/reference-metrics/#pulsar-transaction). + +For how to configure Prometheus and Grafana, see [here](https://pulsar.apache.org/docs/en/next/deploy-monitoring). diff --git a/site2/website/versioned_docs/version-2.9.x/txn-use.md b/site2/website/versioned_docs/version-2.9.x/txn-use.md new file mode 100644 index 0000000000000..a16ea7140da76 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/txn-use.md @@ -0,0 +1,105 @@ +--- +id: txn-use +title: How to use transactions? +sidebar_label: "How to use transactions?" +original_id: txn-use +--- + +## Transaction API + +The transaction feature is primarily a server-side and protocol-level feature. You can use the transaction feature via the [transaction API](https://pulsar.apache.org/api/admin/), which is available in **Pulsar 2.8.0 or later**. + +To use the transaction API, you do not need any additional settings in the Pulsar client. **By default**, transactions are **disabled**. + +Currently, the transaction API is only available for **Java** clients. Support for other language clients will be added in future releases. + +## Quick start + +This section provides an example of how to use the transaction API to send and receive messages in a Java client. + +1. Start Pulsar 2.8.0 or later. + +2. Enable transaction. + + Change the configuration in the `broker.conf` file.
+ + ``` + + transactionCoordinatorEnabled=true + + ``` + + If you want to enable batch messages in transactions, follow the steps below. + + Set `acknowledgmentAtBatchIndexLevelEnabled` to `true` in the `broker.conf` or `standalone.conf` file. + + ``` + + acknowledgmentAtBatchIndexLevelEnabled=true + + ``` + +3. Initialize transaction coordinator metadata. + + The transaction coordinator can leverage the advantages of partitioned topics (such as load balance). + + **Input** + + ``` + + bin/pulsar initialize-transaction-coordinator-metadata -cs 127.0.0.1:2181 -c standalone + + ``` + + **Output** + + ``` + + Transaction coordinator metadata setup success + + ``` + +4. Initialize a Pulsar client. + + ``` + + PulsarClient client = PulsarClient.builder() + + .serviceUrl("pulsar://localhost:6650") + + .enableTransaction(true) + + .build(); + + ``` + +Now you can start using the transaction API to send and receive messages. Below is an example of a `consume-process-produce` application written in Java. + +![](/assets/txn-9.png) + +Let’s walk through this example step by step. + +| Step | Description | +| --- | --- | +| 1. Start a transaction. | The application opens a new transaction by calling PulsarClient.newTransaction. It specifies the transaction timeout as 1 minute. If the transaction is not committed within 1 minute, the transaction is automatically aborted. | +| 2. Receive messages from topics. | The application creates two normal consumers to receive messages from topic input-topic-1 and input-topic-2 respectively. | +| 3. Publish messages to topics with the transaction. | The application creates two producers to produce the resulting messages to the output topic output-topic-1 and output-topic-2 respectively. The application applies the processing logic and generates two output messages. The application sends those two output messages as part of the transaction opened in the first step via Producer.newMessage(Transaction). | +| 4.
Acknowledge the messages with the transaction. | In the same transaction, the application acknowledges the two input messages. | +| 5. Commit the transaction. | The application commits the transaction by calling Transaction.commit() on the open transaction. The commit operation ensures the two input messages are marked as acknowledged and the two output messages are written successfully to the output topics. | + +[1] Example of enabling batch messages ack in transactions in the consumer builder. + +``` + +Consumer sinkConsumer = pulsarClient + .newConsumer() + .topic(transferTopic) + .subscriptionName("sink-topic") + +.subscriptionInitialPosition(SubscriptionInitialPosition.Earliest) + .subscriptionType(SubscriptionType.Shared) + .enableBatchIndexAcknowledgment(true) // enable batch index acknowledgement + .subscribe(); + +``` + diff --git a/site2/website/versioned_docs/version-2.9.x/txn-what.md b/site2/website/versioned_docs/version-2.9.x/txn-what.md new file mode 100644 index 0000000000000..844f19a700f8f --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/txn-what.md @@ -0,0 +1,60 @@ +--- +id: txn-what +title: What are transactions? +sidebar_label: "What are transactions?" +original_id: txn-what +--- + +Transactions strengthen the message delivery semantics of Apache Pulsar and [processing guarantees of Pulsar Functions](https://pulsar.apache.org/docs/en/next/functions-overview/#processing-guarantees). The Pulsar Transaction API supports atomic writes and acknowledgments across multiple topics. + +Transactions allow: + +- A producer to send a batch of messages to multiple topics where all messages in the batch are eventually visible to any consumer, or none are ever visible to consumers. + +- End-to-end exactly-once semantics (execute a `consume-process-produce` operation exactly once). + +## Transaction semantics + +Pulsar transactions have the following semantics: + +* All operations within a transaction are committed as a single unit. 
+ + * Either all messages are committed, or none of them are. + + * Each message is written or processed exactly once, without data loss or duplicates (even in the event of failures). + + * If a transaction is aborted, all the writes and acknowledgments in this transaction rollback. + +* A group of messages in a transaction can be received from, produced to, and acknowledged by multiple partitions. + + * Consumers are only allowed to read committed (acked) messages. In other words, the broker does not deliver transactional messages which are part of an open transaction or messages which are part of an aborted transaction. + + * Message writes across multiple partitions are atomic. + + * Message acks across multiple subscriptions are atomic. A message is acked successfully only once by a consumer under the subscription when acknowledging the message with the transaction ID. + +## Transactions and stream processing + +Stream processing on Pulsar is a `consume-process-produce` operation on Pulsar topics: + +* `Consume`: a source operator that runs a Pulsar consumer reads messages from one or multiple Pulsar topics. + +* `Process`: a processing operator transforms the messages. + +* `Produce`: a sink operator that runs a Pulsar producer writes the resulting messages to one or multiple Pulsar topics. + +![](/assets/txn-2.png) + +Pulsar transactions support end-to-end exactly-once stream processing, which means messages are not lost from a source operator and messages are not duplicated to a sink operator. + +## Use case + +Prior to Pulsar 2.8.0, there was no easy way to build stream processing applications with Pulsar to achieve exactly-once processing guarantees. 
With the transaction introduced in Pulsar 2.8.0, the following services support exactly-once semantics: + +* [Pulsar Flink connector](https://flink.apache.org/2021/01/07/pulsar-flink-connector-270.html) + + Prior to Pulsar 2.8.0, if you wanted to build stream applications using Pulsar and Flink, the Pulsar Flink connector only supported an exactly-once source connector and an at-least-once sink connector. This means the highest end-to-end processing guarantee was at-least-once, so the resulting messages from streaming applications could be written as duplicates to the resulting topics in Pulsar. + + With the transaction introduced in Pulsar 2.8.0, the Pulsar Flink sink connector can support exactly-once semantics by implementing the designated `TwoPhaseCommitSinkFunction` and hooking up the Flink sink message lifecycle with Pulsar transaction API. + +* Support for Pulsar Functions and other connectors will be added in future releases. diff --git a/site2/website/versioned_docs/version-2.9.x/txn-why.md b/site2/website/versioned_docs/version-2.9.x/txn-why.md new file mode 100644 index 0000000000000..1ed8769977654 --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/txn-why.md @@ -0,0 +1,45 @@ +--- +id: txn-why +title: Why transactions? +sidebar_label: "Why transactions?" +original_id: txn-why +--- + +Pulsar transactions (txn) enable event streaming applications to consume, process, and produce messages in one atomic operation. The reason for developing this feature can be summarized as below. + +## Demand of stream processing + +The demand for stream processing applications with stronger processing guarantees has grown along with the rise of stream processing. For example, in the financial industry, financial institutions use stream processing engines to process debits and credits for users. This type of use case requires that every message is processed exactly once, without exception.
+ +In other words, if a stream processing application consumes message A and +produces the result as a message B (B = f(A)), then the exactly-once processing +guarantee means that A can be marked as consumed if and only if B is +successfully produced, and vice versa. + +![](/assets/txn-1.png) + +The Pulsar transactions API strengthens the message delivery semantics and the processing guarantees for stream processing. It enables stream processing applications to consume, process, and produce messages in one atomic operation. That means, a batch of messages in a transaction can be received from, produced to and acknowledged by many topic partitions. All the operations involved in a transaction succeed or fail as one single unit. + +## Limitation of idempotent producer + +Avoiding data loss or duplication can be achieved by using the Pulsar idempotent producer, but it does not provide guarantees for writes across multiple partitions. + +In Pulsar, the highest level of message delivery guarantee is using an [idempotent producer](https://pulsar.apache.org/docs/en/next/concepts-messaging/#producer-idempotency) with exactly-once semantics at one single partition, that is, each message is persisted exactly once without data loss and duplication. However, there are some limitations in this solution: + +- Due to the monotonically increasing sequence ID, this solution only works on a single partition and within a single producer session (that is, for producing one message), so there is no atomicity when producing multiple messages to one or multiple partitions.
+ + In this case, if there are some failures (for example, client / broker / bookie crashes, network failure, and more) in the process of producing and receiving messages, messages are re-processed and re-delivered, which may cause data loss or data duplication: + + - For the producer: if the producer retries sending messages, some messages are persisted multiple times; if the producer does not retry sending messages, some messages are persisted once and other messages are lost. + + - For the consumer: since the consumer does not know whether the broker has received messages or not, the consumer may not retry sending acks, which causes it to receive duplicate messages. + +- Similarly, Pulsar Functions only guarantee exactly-once semantics for an idempotent function on a single event, rather than guaranteeing that processing multiple events or producing multiple results happens exactly once. + + For example, if a function accepts multiple events and produces one result (for example, window function), the function may fail between producing the result and acknowledging the incoming messages, or even between acknowledging individual events, which causes all (or some) incoming messages to be re-delivered and reprocessed, and a new result is generated. + + However, many scenarios need atomic guarantees across multiple partitions and sessions. + +- Consumers need to rely on more mechanisms to acknowledge (ack) messages once. + + For example, consumers are required to store the MessageID along with its acked state. After the topic is unloaded, the subscription can recover the acked state of this MessageID in memory when the topic is loaded again.
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.9.x/window-functions-context.md b/site2/website/versioned_docs/version-2.9.x/window-functions-context.md new file mode 100644 index 0000000000000..f80fea57989ef --- /dev/null +++ b/site2/website/versioned_docs/version-2.9.x/window-functions-context.md @@ -0,0 +1,581 @@ +--- +id: window-functions-context +title: Window Functions Context +sidebar_label: "Window Functions: Context" +original_id: window-functions-context +--- + +Java SDK provides access to a **window context object** that can be used by a window function. This context object provides a wide variety of information and functionality for Pulsar window functions as below. + +- [Spec](#spec) + + * Names of all input topics and the output topic associated with the function. + * Tenant and namespace associated with the function. + * Pulsar window function name, ID, and version. + * ID of the Pulsar function instance running the window function. + * Number of instances that invoke the window function. + * Built-in type or custom class name of the output schema. + +- [Logger](#logger) + + * Logger object used by the window function, which can be used to create window function log messages. + +- [User config](#user-config) + + * Access to arbitrary user configuration values. + +- [Routing](#routing) + + * Routing is supported in Pulsar window functions. Pulsar window functions send messages to arbitrary topics as per the `publish` interface. + +- [Metrics](#metrics) + + * Interface for recording metrics. + +- [State storage](#state-storage) + + * Interface for storing and retrieving state in [state storage](#state-storage). + +## Spec + +Spec contains the basic information of a function. + +### Get input topics + +The `getInputTopics` method gets the **name list** of all input topics. + +This example demonstrates how to get the name list of all input topics in a Java window function. 
+ +```java + +public class GetInputTopicsWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + Collection inputTopics = context.getInputTopics(); + System.out.println(inputTopics); + + return null; + } + +} + +``` + +### Get output topic + +The `getOutputTopic` method gets the **name of a topic** to which the message is sent. + +This example demonstrates how to get the name of an output topic in a Java window function. + +```java + +public class GetOutputTopicWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String outputTopic = context.getOutputTopic(); + System.out.println(outputTopic); + + return null; + } +} + +``` + +### Get tenant + +The `getTenant` method gets the tenant name associated with the window function. + +This example demonstrates how to get the tenant name in a Java window function. + +```java + +public class GetTenantWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String tenant = context.getTenant(); + System.out.println(tenant); + + return null; + } + +} + +``` + +### Get namespace + +The `getNamespace` method gets the namespace associated with the window function. + +This example demonstrates how to get the namespace in a Java window function. + +```java + +public class GetNamespaceWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String ns = context.getNamespace(); + System.out.println(ns); + + return null; + } + +} + +``` + +### Get function name + +The `getFunctionName` method gets the window function name. + +This example demonstrates how to get the function name in a Java window function. 
+ +```java + +public class GetNameOfWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionName = context.getFunctionName(); + System.out.println(functionName); + + return null; + } + +} + +``` + +### Get function ID + +The `getFunctionId` method gets the window function ID. + +This example demonstrates how to get the function ID in a Java window function. + +```java + +public class GetFunctionIDWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionID = context.getFunctionId(); + System.out.println(functionID); + + return null; + } + +} + +``` + +### Get function version + +The `getFunctionVersion` method gets the window function version. + +This example demonstrates how to get the function version of a Java window function. + +```java + +public class GetVersionOfWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String functionVersion = context.getFunctionVersion(); + System.out.println(functionVersion); + + return null; + } + +} + +``` + +### Get instance ID + +The `getInstanceId` method gets the instance ID of a window function. + +This example demonstrates how to get the instance ID in a Java window function. + +```java + +public class GetInstanceIDWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + int instanceId = context.getInstanceId(); + System.out.println(instanceId); + + return null; + } + +} + +``` + +### Get num instances + +The `getNumInstances` method gets the number of instances that invoke the window function. + +This example demonstrates how to get the number of instances in a Java window function. 
+ +```java + +public class GetNumInstancesWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + int numInstances = context.getNumInstances(); + System.out.println(numInstances); + + return null; + } + +} + +``` + +### Get output schema type + +The `getOutputSchemaType` method gets the built-in type or custom class name of the output schema. + +This example demonstrates how to get the output schema type of a Java window function. + +```java + +public class GetOutputSchemaTypeWindowFunction implements WindowFunction { + + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + String schemaType = context.getOutputSchemaType(); + System.out.println(schemaType); + + return null; + } +} + +``` + +## Logger + +Pulsar window functions using Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/slf4j/Logger.html) object that can be used to produce logs at the chosen log level. + +This example logs an `INFO`-level message for each record in the window in a Java window function. + +```java + +import java.util.Collection; +import org.apache.pulsar.functions.api.Record; +import org.apache.pulsar.functions.api.WindowContext; +import org.apache.pulsar.functions.api.WindowFunction; +import org.slf4j.Logger; + +public class LoggingWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + Logger log = context.getLogger(); + for (Record record : inputs) { + log.info(record + "-window-log"); + } + return null; + } + +} + +``` + +If you need your function to produce logs, specify a log topic when creating or running the function.
+ +```bash + +bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +You can access all logs produced by `LoggingFunction` via the `persistent://public/default/logging-function-logs` topic. + +## Metrics + +Pulsar window functions can publish arbitrary metrics to the metrics interface which can be queried. + +:::note + +If a Pulsar window function uses the language-native interface for Java, that function is not able to publish metrics and stats to Pulsar. + +::: + +You can record metrics using the context object on a per-key basis. + +This example records the event time of each message under the `MessageEventTime` metric key every time the function processes a window of messages in a Java window function. + +```java + +import java.util.Collection; +import org.apache.pulsar.functions.api.Record; +import org.apache.pulsar.functions.api.WindowContext; +import org.apache.pulsar.functions.api.WindowFunction; + + +/** + * Example function that wants to keep track of + * the event time of each message sent. + */ +public class UserMetricWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + + for (Record record : inputs) { + if (record.getEventTime().isPresent()) { + context.recordMetric("MessageEventTime", record.getEventTime().get().doubleValue()); + } + } + + return null; + } +} + +``` + +## User config + +When you run or update Pulsar Functions that are created using SDK, you can pass arbitrary key/value pairs to them with the `--user-config` flag. Key/value pairs **must** be specified as JSON. + +This example passes a user configured key/value to a function.
+ +```bash + +bin/pulsar-admin functions create \ + --name word-filter \ + --user-config '{"forbidden-word":"rosebud"}' \ + # Other function configs + +``` + +### API +You can use the following APIs to get user-defined information for window functions. +#### getUserConfigMap + +`getUserConfigMap` API gets a map of all user-defined key/value configurations for the window function. + +```java + +/** + * Get a map of all user-defined key/value configs for the function. + * + * @return The full map of user-defined config values + */ + Map getUserConfigMap(); + +``` + +#### getUserConfigValue + +The `getUserConfigValue` API gets a user-defined key/value. + +```java + +/** + * Get any user-defined key/value. + * + * @param key The key + * @return The Optional value specified by the user for that key. + */ + Optional getUserConfigValue(String key); + +``` + +#### getUserConfigValueOrDefault + +The `getUserConfigValueOrDefault` API gets a user-defined key/value or a default value if none is present. + +```java + +/** + * Get any user-defined key/value or a default value if none is present. + * + * @param key + * @param defaultValue + * @return Either the user config value associated with a given key or a supplied default value + */ + Object getUserConfigValueOrDefault(String key, Object defaultValue); + +``` + +This example demonstrates how to access key/value pairs provided to Pulsar window functions. + +Java SDK context object enables you to access key/value pairs provided to Pulsar window functions via the command line (as JSON). + +:::tip + +For all key/value pairs passed to Java window functions, both the `key` and the `value` are `String`. To set the value to be a different type, you need to deserialize it from the `String` type. + +::: + +This example passes a key/value pair in a Java window function. 
+ +```bash + +bin/pulsar-admin functions create \ + --user-config '{"word-of-the-day":"verdure"}' \ + # Other function configs + +``` + +This example accesses values in a Java window function. + +The `UserConfigWindowFunction` function returns the value of the `WhatToWrite` user config key every time the function is invoked (which means every time a window of messages is processed). A user config value such as `word-of-the-day` is changed **only** when the function is updated with a new config value via +multiple ways, such as the command line tool or REST API. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigWindowFunction implements WindowFunction { + @Override + public String process(Collection> input, WindowContext context) throws Exception { + Optional whatToWrite = context.getUserConfigValue("WhatToWrite"); + if (whatToWrite.get() != null) { + return (String)whatToWrite.get(); + } else { + return "Not a nice way"; + } + } + +} + +``` + +If no value is provided, you can access the entire user config map or set a default value. + +```java + +// Get the whole config map +Map allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); + +``` + +## Routing + +You can use the `context.publish()` interface to publish as many results as you want. + +This example shows that the `PublishWindowFunction` class uses the built-in function in the context to publish messages to the `publishTopic` in a Java window function.
+ +```java + +public class PublishWindowFunction implements WindowFunction { + @Override + public Void process(Collection> input, WindowContext context) throws Exception { + String publishTopic = (String) context.getUserConfigValueOrDefault("publish-topic", "publishtopic"); + String output = String.format("%s!", input); + context.publish(publishTopic, output); + + return null; + } + +} + +``` + +## State storage + +Pulsar window functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Apache Pulsar installation (including the standalone installation) includes the deployment of BookKeeper bookies. + +Apache Pulsar integrates with Apache BookKeeper `table service` to store the `state` for functions. For example, the `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions state APIs. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data—counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function and shared between instances of that function. + +Currently, Pulsar window functions expose Java API to access, update, and manage states. These APIs are available in the context object when you use Java SDK functions. + +| Java API| Description +|---|--- +|`incrCounter`|Increases a built-in distributed counter referred by key. +|`getCounter`|Gets the counter value for the key. +|`putState`|Updates the state value for the key. + +You can use the following APIs to access, update, and manage states in Java window functions. + +#### incrCounter + +The `incrCounter` API increases a built-in distributed counter referred by key. + +Applications use the `incrCounter` API to change the counter of a given `key` by the given `amount`. If the `key` does not exist, a new key is created. 
+ +```java + + /** + * Increment the builtin distributed counter referred by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); + +``` + +#### getCounter + +The `getCounter` API gets the counter value for the key. + +Applications use the `getCounter` API to retrieve the counter of a given `key` changed by the `incrCounter` API. + +```java + + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); + +``` + +Besides the `getCounter` API, Pulsar also exposes a general key/value API (`putState`) for functions to store general key/value state. + +#### putState + +The `putState` API updates the state value for the key. + +```java + + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); + +``` + +This example demonstrates how applications store states in Pulsar window functions. + +The logic of the `WordCountWindowFunction` is simple and straightforward. + +1. The function first splits the received string into multiple words using regex `\\.`. + +2. For each `word`, the function increments the corresponding `counter` by 1 via `incrCounter(key, amount)`. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountWindowFunction implements WindowFunction { + @Override + public Void process(Collection> inputs, WindowContext context) throws Exception { + for (Record input : inputs) { + Arrays.asList(input.getValue().split("\\.")).forEach(word -> context.incrCounter(word, 1)); + } + return null; + + } +} + +``` + diff --git a/site2/website/versioned_sidebars/version-2.10.x-sidebars.json b/site2/website/versioned_sidebars/version-2.10.x-sidebars.json new file mode 100644 index 0000000000000..7c0ced73f08fb --- /dev/null +++ b/site2/website/versioned_sidebars/version-2.10.x-sidebars.json @@ -0,0 +1,610 @@ +{ + "version-2.10.x/docsSidebar": [ + { + "type": "doc", + "id": "version-2.10.x/about" + }, + { + "type": "category", + "label": "Get Started", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/getting-started-standalone" + }, + { + "type": "doc", + "id": "version-2.10.x/getting-started-docker" + }, + { + "type": "doc", + "id": "version-2.10.x/getting-started-helm" + } + ] + }, + { + "type": "category", + "label": "Concepts and Architecture", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/concepts-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-messaging" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-architecture-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-clients" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-replication" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-multi-tenancy" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-authentication" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-topic-compaction" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-proxy-sni-routing" + }, + { + "type": "doc", + "id": "version-2.10.x/concepts-multiple-advertised-listeners" + } + ] + }, + { 
+ "type": "category", + "label": "Pulsar Schema", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/schema-get-started" + }, + { + "type": "doc", + "id": "version-2.10.x/schema-understand" + }, + { + "type": "doc", + "id": "version-2.10.x/schema-evolution-compatibility" + }, + { + "type": "doc", + "id": "version-2.10.x/schema-manage" + } + ] + }, + { + "type": "category", + "label": "Pulsar Functions", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/functions-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/functions-runtime" + }, + { + "type": "doc", + "id": "version-2.10.x/functions-worker" + }, + { + "type": "doc", + "id": "version-2.10.x/functions-develop" + }, + { + "type": "doc", + "id": "version-2.10.x/functions-package" + }, + { + "type": "doc", + "id": "version-2.10.x/functions-debug" + }, + { + "type": "doc", + "id": "version-2.10.x/functions-deploy" + }, + { + "type": "doc", + "id": "version-2.10.x/functions-cli" + }, + { + "type": "doc", + "id": "version-2.10.x/window-functions-context" + } + ] + }, + { + "type": "category", + "label": "Pulsar IO", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/io-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/io-quickstart" + }, + { + "type": "doc", + "id": "version-2.10.x/io-use" + }, + { + "type": "doc", + "id": "version-2.10.x/io-debug" + }, + { + "type": "doc", + "id": "version-2.10.x/io-connectors" + }, + { + "type": "doc", + "id": "version-2.10.x/io-cdc" + }, + { + "type": "doc", + "id": "version-2.10.x/io-develop" + } + ] + }, + { + "type": "category", + "label": "Pulsar SQL", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/sql-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/sql-getting-started" + }, + { + "type": "doc", + "id": "version-2.10.x/sql-deployment-configurations" + }, + { + "type": "doc", + "id": "version-2.10.x/sql-rest-api" + } + ] + }, + { + "type": "category", + "label": "Tiered Storage", + "items": [ + { + "type": "doc", 
+ "id": "version-2.10.x/tiered-storage-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/tiered-storage-aws" + }, + { + "type": "doc", + "id": "version-2.10.x/tiered-storage-gcs" + }, + { + "type": "doc", + "id": "version-2.10.x/tiered-storage-filesystem" + }, + { + "type": "doc", + "id": "version-2.10.x/tiered-storage-azure" + }, + { + "type": "doc", + "id": "version-2.10.x/tiered-storage-aliyun" + } + ] + }, + { + "type": "category", + "label": "Transactions", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/txn-why" + }, + { + "type": "doc", + "id": "version-2.10.x/txn-what" + }, + { + "type": "doc", + "id": "version-2.10.x/txn-how" + }, + { + "type": "doc", + "id": "version-2.10.x/txn-use" + }, + { + "type": "doc", + "id": "version-2.10.x/txn-monitor" + } + ] + }, + { + "type": "category", + "label": "Kubernetes (Helm)", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/helm-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/helm-prepare" + }, + { + "type": "doc", + "id": "version-2.10.x/helm-install" + }, + { + "type": "doc", + "id": "version-2.10.x/helm-deploy" + }, + { + "type": "doc", + "id": "version-2.10.x/helm-upgrade" + }, + { + "type": "doc", + "id": "version-2.10.x/helm-tools" + } + ] + }, + { + "type": "category", + "label": "Deployment", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/deploy-aws" + }, + { + "type": "doc", + "id": "version-2.10.x/deploy-kubernetes" + }, + { + "type": "doc", + "id": "version-2.10.x/deploy-bare-metal" + }, + { + "type": "doc", + "id": "version-2.10.x/deploy-bare-metal-multi-cluster" + }, + { + "type": "doc", + "id": "version-2.10.x/deploy-dcos" + }, + { + "type": "doc", + "id": "version-2.10.x/deploy-docker" + }, + { + "type": "doc", + "id": "version-2.10.x/deploy-monitoring" + } + ] + }, + { + "type": "category", + "label": "Administration", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/administration-zk-bk" + }, + { + "type": "doc", + "id": 
"version-2.10.x/administration-geo" + }, + { + "type": "doc", + "id": "version-2.10.x/administration-pulsar-manager" + }, + { + "type": "doc", + "id": "version-2.10.x/administration-stats" + }, + { + "type": "doc", + "id": "version-2.10.x/administration-load-balance" + }, + { + "type": "doc", + "id": "version-2.10.x/administration-proxy" + }, + { + "type": "doc", + "id": "version-2.10.x/administration-upgrade" + }, + { + "type": "doc", + "id": "version-2.10.x/administration-isolation" + } + ] + }, + { + "type": "category", + "label": "Security", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/security-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/security-policy-and-supported-versions" + }, + { + "type": "doc", + "id": "version-2.10.x/security-tls-transport" + }, + { + "type": "doc", + "id": "version-2.10.x/security-tls-authentication" + }, + { + "type": "doc", + "id": "version-2.10.x/security-tls-keystore" + }, + { + "type": "doc", + "id": "version-2.10.x/security-jwt" + }, + { + "type": "doc", + "id": "version-2.10.x/security-athenz" + }, + { + "type": "doc", + "id": "version-2.10.x/security-kerberos" + }, + { + "type": "doc", + "id": "version-2.10.x/security-oauth2" + }, + { + "type": "doc", + "id": "version-2.10.x/security-basic-auth" + }, + { + "type": "doc", + "id": "version-2.10.x/security-authorization" + }, + { + "type": "doc", + "id": "version-2.10.x/security-encryption" + }, + { + "type": "doc", + "id": "version-2.10.x/security-extending" + }, + { + "type": "doc", + "id": "version-2.10.x/security-bouncy-castle" + } + ] + }, + { + "type": "category", + "label": "Performance", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/performance-pulsar-perf" + } + ] + }, + { + "type": "category", + "label": "Client Libraries", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/client-libraries" + }, + { + "type": "doc", + "id": "version-2.10.x/client-libraries-java" + }, + { + "type": "doc", + "id": 
"version-2.10.x/client-libraries-go" + }, + { + "type": "doc", + "id": "version-2.10.x/client-libraries-python" + }, + { + "type": "doc", + "id": "version-2.10.x/client-libraries-cpp" + }, + { + "type": "doc", + "id": "version-2.10.x/client-libraries-node" + }, + { + "type": "doc", + "id": "version-2.10.x/client-libraries-websocket" + }, + { + "type": "doc", + "id": "version-2.10.x/client-libraries-dotnet" + }, + { + "type": "doc", + "id": "version-2.10.x/client-libraries-rest" + } + ] + }, + { + "type": "category", + "label": "Admin API", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/admin-api-overview" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-clusters" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-tenants" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-brokers" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-namespaces" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-permissions" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-topics" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-functions" + }, + { + "type": "doc", + "id": "version-2.10.x/admin-api-packages" + } + ] + }, + { + "type": "category", + "label": "Adaptors", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/adaptors-kafka" + }, + { + "type": "doc", + "id": "version-2.10.x/adaptors-spark" + }, + { + "type": "doc", + "id": "version-2.10.x/adaptors-storm" + } + ] + }, + { + "type": "category", + "label": "Cookbooks", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/cookbooks-compaction" + }, + { + "type": "doc", + "id": "version-2.10.x/cookbooks-deduplication" + }, + { + "type": "doc", + "id": "version-2.10.x/cookbooks-non-persistent" + }, + { + "type": "doc", + "id": "version-2.10.x/cookbooks-retention-expiry" + }, + { + "type": "doc", + "id": "version-2.10.x/cookbooks-encryption" + }, + { + "type": "doc", + "id": "version-2.10.x/cookbooks-message-queue" + }, + { + "type": "doc", 
+ "id": "version-2.10.x/cookbooks-bookkeepermetadata" + } + ] + }, + { + "type": "category", + "label": "Development", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/develop-tools" + }, + { + "type": "doc", + "id": "version-2.10.x/developing-binary-protocol" + }, + { + "type": "doc", + "id": "version-2.10.x/develop-schema" + }, + { + "type": "doc", + "id": "version-2.10.x/develop-load-manager" + }, + { + "type": "doc", + "id": "version-2.10.x/develop-plugin" + } + ] + }, + { + "type": "category", + "label": "Reference", + "items": [ + { + "type": "doc", + "id": "version-2.10.x/reference-terminology" + }, + { + "type": "doc", + "id": "version-2.10.x/reference-cli-tools" + }, + { + "type": "doc", + "id": "version-2.10.x/reference-configuration" + }, + { + "type": "doc", + "id": "version-2.10.x/reference-metrics" + }, + { + "type": "doc", + "id": "version-2.10.x/reference-rest-api-overview" + } + ] + } + ] +} \ No newline at end of file diff --git a/site2/website/versioned_sidebars/version-2.8.x-sidebars.json b/site2/website/versioned_sidebars/version-2.8.x-sidebars.json new file mode 100644 index 0000000000000..9dfb92bd232ea --- /dev/null +++ b/site2/website/versioned_sidebars/version-2.8.x-sidebars.json @@ -0,0 +1,598 @@ +{ + "version-2.8.x/docsSidebar": [ + { + "type": "doc", + "id": "version-2.8.x/about" + }, + { + "type": "category", + "label": "Get Started", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/getting-started-standalone" + }, + { + "type": "doc", + "id": "version-2.8.x/getting-started-docker" + }, + { + "type": "doc", + "id": "version-2.8.x/getting-started-helm" + } + ] + }, + { + "type": "category", + "label": "Concepts and Architecture", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/concepts-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-messaging" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-architecture-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-clients" + 
}, + { + "type": "doc", + "id": "version-2.8.x/concepts-replication" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-multi-tenancy" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-authentication" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-topic-compaction" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-proxy-sni-routing" + }, + { + "type": "doc", + "id": "version-2.8.x/concepts-multiple-advertised-listeners" + } + ] + }, + { + "type": "category", + "label": "Pulsar Schema", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/schema-get-started" + }, + { + "type": "doc", + "id": "version-2.8.x/schema-understand" + }, + { + "type": "doc", + "id": "version-2.8.x/schema-evolution-compatibility" + }, + { + "type": "doc", + "id": "version-2.8.x/schema-manage" + } + ] + }, + { + "type": "category", + "label": "Pulsar Functions", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/functions-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/functions-runtime" + }, + { + "type": "doc", + "id": "version-2.8.x/functions-worker" + }, + { + "type": "doc", + "id": "version-2.8.x/functions-develop" + }, + { + "type": "doc", + "id": "version-2.8.x/functions-package" + }, + { + "type": "doc", + "id": "version-2.8.x/functions-debug" + }, + { + "type": "doc", + "id": "version-2.8.x/functions-deploy" + }, + { + "type": "doc", + "id": "version-2.8.x/functions-cli" + }, + { + "type": "doc", + "id": "version-2.8.x/window-functions-context" + } + ] + }, + { + "type": "category", + "label": "Pulsar IO", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/io-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/io-quickstart" + }, + { + "type": "doc", + "id": "version-2.8.x/io-use" + }, + { + "type": "doc", + "id": "version-2.8.x/io-debug" + }, + { + "type": "doc", + "id": "version-2.8.x/io-connectors" + }, + { + "type": "doc", + "id": "version-2.8.x/io-cdc" + }, + { + "type": "doc", + "id": "version-2.8.x/io-develop" 
+ }, + { + "type": "doc", + "id": "version-2.8.x/io-cli" + } + ] + }, + { + "type": "category", + "label": "Pulsar SQL", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/sql-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/sql-getting-started" + }, + { + "type": "doc", + "id": "version-2.8.x/sql-deployment-configurations" + }, + { + "type": "doc", + "id": "version-2.8.x/sql-rest-api" + } + ] + }, + { + "type": "category", + "label": "Tiered Storage", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/tiered-storage-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/tiered-storage-aws" + }, + { + "type": "doc", + "id": "version-2.8.x/tiered-storage-gcs" + }, + { + "type": "doc", + "id": "version-2.8.x/tiered-storage-filesystem" + }, + { + "type": "doc", + "id": "version-2.8.x/tiered-storage-azure" + }, + { + "type": "doc", + "id": "version-2.8.x/tiered-storage-aliyun" + } + ] + }, + { + "type": "category", + "label": "Transactions", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/txn-why" + }, + { + "type": "doc", + "id": "version-2.8.x/txn-what" + }, + { + "type": "doc", + "id": "version-2.8.x/txn-how" + }, + { + "type": "doc", + "id": "version-2.8.x/txn-use" + }, + { + "type": "doc", + "id": "version-2.8.x/txn-monitor" + } + ] + }, + { + "type": "category", + "label": "Kubernetes (Helm)", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/helm-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/helm-prepare" + }, + { + "type": "doc", + "id": "version-2.8.x/helm-install" + }, + { + "type": "doc", + "id": "version-2.8.x/helm-deploy" + }, + { + "type": "doc", + "id": "version-2.8.x/helm-upgrade" + }, + { + "type": "doc", + "id": "version-2.8.x/helm-tools" + } + ] + }, + { + "type": "category", + "label": "Deployment", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/deploy-aws" + }, + { + "type": "doc", + "id": "version-2.8.x/deploy-kubernetes" + }, + { + "type": "doc", + "id": 
"version-2.8.x/deploy-bare-metal" + }, + { + "type": "doc", + "id": "version-2.8.x/deploy-bare-metal-multi-cluster" + }, + { + "type": "doc", + "id": "version-2.8.x/deploy-docker" + }, + { + "type": "doc", + "id": "version-2.8.x/deploy-monitoring" + } + ] + }, + { + "type": "category", + "label": "Administration", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/administration-zk-bk" + }, + { + "type": "doc", + "id": "version-2.8.x/administration-geo" + }, + { + "type": "doc", + "id": "version-2.8.x/administration-pulsar-manager" + }, + { + "type": "doc", + "id": "version-2.8.x/administration-stats" + }, + { + "type": "doc", + "id": "version-2.8.x/administration-load-balance" + }, + { + "type": "doc", + "id": "version-2.8.x/administration-proxy" + }, + { + "type": "doc", + "id": "version-2.8.x/administration-upgrade" + }, + { + "type": "doc", + "id": "version-2.8.x/administration-isolation" + } + ] + }, + { + "type": "category", + "label": "Security", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/security-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/security-tls-transport" + }, + { + "type": "doc", + "id": "version-2.8.x/security-tls-authentication" + }, + { + "type": "doc", + "id": "version-2.8.x/security-tls-keystore" + }, + { + "type": "doc", + "id": "version-2.8.x/security-jwt" + }, + { + "type": "doc", + "id": "version-2.8.x/security-athenz" + }, + { + "type": "doc", + "id": "version-2.8.x/security-kerberos" + }, + { + "type": "doc", + "id": "version-2.8.x/security-oauth2" + }, + { + "type": "doc", + "id": "version-2.8.x/security-basic-auth" + }, + { + "type": "doc", + "id": "version-2.8.x/security-authorization" + }, + { + "type": "doc", + "id": "version-2.8.x/security-encryption" + }, + { + "type": "doc", + "id": "version-2.8.x/security-extending" + }, + { + "type": "doc", + "id": "version-2.8.x/security-bouncy-castle" + } + ] + }, + { + "type": "category", + "label": "Performance", + "items": [ + { + "type": "doc", + "id": 
"version-2.8.x/performance-pulsar-perf" + } + ] + }, + { + "type": "category", + "label": "Client Libraries", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/client-libraries" + }, + { + "type": "doc", + "id": "version-2.8.x/client-libraries-java" + }, + { + "type": "doc", + "id": "version-2.8.x/client-libraries-go" + }, + { + "type": "doc", + "id": "version-2.8.x/client-libraries-python" + }, + { + "type": "doc", + "id": "version-2.8.x/client-libraries-cpp" + }, + { + "type": "doc", + "id": "version-2.8.x/client-libraries-node" + }, + { + "type": "doc", + "id": "version-2.8.x/client-libraries-websocket" + }, + { + "type": "doc", + "id": "version-2.8.x/client-libraries-dotnet" + } + ] + }, + { + "type": "category", + "label": "Admin API", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/admin-api-overview" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-clusters" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-tenants" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-brokers" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-namespaces" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-permissions" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-topics" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-functions" + }, + { + "type": "doc", + "id": "version-2.8.x/admin-api-packages" + } + ] + }, + { + "type": "category", + "label": "Adaptors", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/adaptors-kafka" + }, + { + "type": "doc", + "id": "version-2.8.x/adaptors-spark" + }, + { + "type": "doc", + "id": "version-2.8.x/adaptors-storm" + } + ] + }, + { + "type": "category", + "label": "Cookbooks", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/cookbooks-compaction" + }, + { + "type": "doc", + "id": "version-2.8.x/cookbooks-deduplication" + }, + { + "type": "doc", + "id": "version-2.8.x/cookbooks-non-persistent" + }, + { + "type": "doc", + "id": 
"version-2.8.x/cookbooks-retention-expiry" + }, + { + "type": "doc", + "id": "version-2.8.x/cookbooks-encryption" + }, + { + "type": "doc", + "id": "version-2.8.x/cookbooks-message-queue" + }, + { + "type": "doc", + "id": "version-2.8.x/cookbooks-bookkeepermetadata" + } + ] + }, + { + "type": "category", + "label": "Development", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/develop-tools" + }, + { + "type": "doc", + "id": "version-2.8.x/developing-binary-protocol" + }, + { + "type": "doc", + "id": "version-2.8.x/develop-schema" + }, + { + "type": "doc", + "id": "version-2.8.x/develop-load-manager" + } + ] + }, + { + "type": "category", + "label": "Reference", + "items": [ + { + "type": "doc", + "id": "version-2.8.x/reference-terminology" + }, + { + "type": "doc", + "id": "version-2.8.x/reference-cli-tools" + }, + { + "type": "doc", + "id": "version-2.8.x/reference-configuration" + }, + { + "type": "doc", + "id": "version-2.8.x/reference-metrics" + }, + { + "type": "doc", + "id": "version-2.8.x/reference-rest-api-overview" + } + ] + } + ] +} \ No newline at end of file diff --git a/site2/website/versioned_sidebars/version-2.9.x-sidebars.json b/site2/website/versioned_sidebars/version-2.9.x-sidebars.json new file mode 100644 index 0000000000000..14041a83ae944 --- /dev/null +++ b/site2/website/versioned_sidebars/version-2.9.x-sidebars.json @@ -0,0 +1,598 @@ +{ + "version-2.9.x/docsSidebar": [ + { + "type": "doc", + "id": "version-2.9.x/about" + }, + { + "type": "category", + "label": "Get Started", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/getting-started-standalone" + }, + { + "type": "doc", + "id": "version-2.9.x/getting-started-docker" + }, + { + "type": "doc", + "id": "version-2.9.x/getting-started-helm" + } + ] + }, + { + "type": "category", + "label": "Concepts and Architecture", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/concepts-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-messaging" + }, + { + 
"type": "doc", + "id": "version-2.9.x/concepts-architecture-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-clients" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-replication" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-multi-tenancy" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-authentication" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-topic-compaction" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-proxy-sni-routing" + }, + { + "type": "doc", + "id": "version-2.9.x/concepts-multiple-advertised-listeners" + } + ] + }, + { + "type": "category", + "label": "Pulsar Schema", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/schema-get-started" + }, + { + "type": "doc", + "id": "version-2.9.x/schema-understand" + }, + { + "type": "doc", + "id": "version-2.9.x/schema-evolution-compatibility" + }, + { + "type": "doc", + "id": "version-2.9.x/schema-manage" + } + ] + }, + { + "type": "category", + "label": "Pulsar Functions", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/functions-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/functions-runtime" + }, + { + "type": "doc", + "id": "version-2.9.x/functions-worker" + }, + { + "type": "doc", + "id": "version-2.9.x/functions-develop" + }, + { + "type": "doc", + "id": "version-2.9.x/functions-package" + }, + { + "type": "doc", + "id": "version-2.9.x/functions-debug" + }, + { + "type": "doc", + "id": "version-2.9.x/functions-deploy" + }, + { + "type": "doc", + "id": "version-2.9.x/functions-cli" + }, + { + "type": "doc", + "id": "version-2.9.x/window-functions-context" + } + ] + }, + { + "type": "category", + "label": "Pulsar IO", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/io-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/io-quickstart" + }, + { + "type": "doc", + "id": "version-2.9.x/io-use" + }, + { + "type": "doc", + "id": "version-2.9.x/io-debug" + }, + { + "type": "doc", + "id": 
"version-2.9.x/io-connectors" + }, + { + "type": "doc", + "id": "version-2.9.x/io-cdc" + }, + { + "type": "doc", + "id": "version-2.9.x/io-develop" + }, + { + "type": "doc", + "id": "version-2.9.x/io-cli" + } + ] + }, + { + "type": "category", + "label": "Pulsar SQL", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/sql-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/sql-getting-started" + }, + { + "type": "doc", + "id": "version-2.9.x/sql-deployment-configurations" + }, + { + "type": "doc", + "id": "version-2.9.x/sql-rest-api" + } + ] + }, + { + "type": "category", + "label": "Tiered Storage", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/tiered-storage-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/tiered-storage-aws" + }, + { + "type": "doc", + "id": "version-2.9.x/tiered-storage-gcs" + }, + { + "type": "doc", + "id": "version-2.9.x/tiered-storage-filesystem" + }, + { + "type": "doc", + "id": "version-2.9.x/tiered-storage-azure" + }, + { + "type": "doc", + "id": "version-2.9.x/tiered-storage-aliyun" + } + ] + }, + { + "type": "category", + "label": "Transactions", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/txn-why" + }, + { + "type": "doc", + "id": "version-2.9.x/txn-what" + }, + { + "type": "doc", + "id": "version-2.9.x/txn-how" + }, + { + "type": "doc", + "id": "version-2.9.x/txn-use" + }, + { + "type": "doc", + "id": "version-2.9.x/txn-monitor" + } + ] + }, + { + "type": "category", + "label": "Kubernetes (Helm)", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/helm-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/helm-prepare" + }, + { + "type": "doc", + "id": "version-2.9.x/helm-install" + }, + { + "type": "doc", + "id": "version-2.9.x/helm-deploy" + }, + { + "type": "doc", + "id": "version-2.9.x/helm-upgrade" + }, + { + "type": "doc", + "id": "version-2.9.x/helm-tools" + } + ] + }, + { + "type": "category", + "label": "Deployment", + "items": [ + { + "type": "doc", + "id": 
"version-2.9.x/deploy-aws" + }, + { + "type": "doc", + "id": "version-2.9.x/deploy-kubernetes" + }, + { + "type": "doc", + "id": "version-2.9.x/deploy-bare-metal" + }, + { + "type": "doc", + "id": "version-2.9.x/deploy-bare-metal-multi-cluster" + }, + { + "type": "doc", + "id": "version-2.9.x/deploy-docker" + }, + { + "type": "doc", + "id": "version-2.9.x/deploy-monitoring" + } + ] + }, + { + "type": "category", + "label": "Administration", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/administration-zk-bk" + }, + { + "type": "doc", + "id": "version-2.9.x/administration-geo" + }, + { + "type": "doc", + "id": "version-2.9.x/administration-pulsar-manager" + }, + { + "type": "doc", + "id": "version-2.9.x/administration-stats" + }, + { + "type": "doc", + "id": "version-2.9.x/administration-load-balance" + }, + { + "type": "doc", + "id": "version-2.9.x/administration-proxy" + }, + { + "type": "doc", + "id": "version-2.9.x/administration-upgrade" + }, + { + "type": "doc", + "id": "version-2.9.x/administration-isolation" + } + ] + }, + { + "type": "category", + "label": "Security", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/security-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/security-tls-transport" + }, + { + "type": "doc", + "id": "version-2.9.x/security-tls-authentication" + }, + { + "type": "doc", + "id": "version-2.9.x/security-tls-keystore" + }, + { + "type": "doc", + "id": "version-2.9.x/security-jwt" + }, + { + "type": "doc", + "id": "version-2.9.x/security-athenz" + }, + { + "type": "doc", + "id": "version-2.9.x/security-kerberos" + }, + { + "type": "doc", + "id": "version-2.9.x/security-oauth2" + }, + { + "type": "doc", + "id": "version-2.9.x/security-basic-auth" + }, + { + "type": "doc", + "id": "version-2.9.x/security-authorization" + }, + { + "type": "doc", + "id": "version-2.9.x/security-encryption" + }, + { + "type": "doc", + "id": "version-2.9.x/security-extending" + }, + { + "type": "doc", + "id": 
"version-2.9.x/security-bouncy-castle" + } + ] + }, + { + "type": "category", + "label": "Performance", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/performance-pulsar-perf" + } + ] + }, + { + "type": "category", + "label": "Client Libraries", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/client-libraries" + }, + { + "type": "doc", + "id": "version-2.9.x/client-libraries-java" + }, + { + "type": "doc", + "id": "version-2.9.x/client-libraries-go" + }, + { + "type": "doc", + "id": "version-2.9.x/client-libraries-python" + }, + { + "type": "doc", + "id": "version-2.9.x/client-libraries-cpp" + }, + { + "type": "doc", + "id": "version-2.9.x/client-libraries-node" + }, + { + "type": "doc", + "id": "version-2.9.x/client-libraries-websocket" + }, + { + "type": "doc", + "id": "version-2.9.x/client-libraries-dotnet" + } + ] + }, + { + "type": "category", + "label": "Admin API", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/admin-api-overview" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-clusters" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-tenants" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-brokers" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-namespaces" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-permissions" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-topics" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-functions" + }, + { + "type": "doc", + "id": "version-2.9.x/admin-api-packages" + } + ] + }, + { + "type": "category", + "label": "Adaptors", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/adaptors-kafka" + }, + { + "type": "doc", + "id": "version-2.9.x/adaptors-spark" + }, + { + "type": "doc", + "id": "version-2.9.x/adaptors-storm" + } + ] + }, + { + "type": "category", + "label": "Cookbooks", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/cookbooks-compaction" + }, + { + "type": "doc", + "id": 
"version-2.9.x/cookbooks-deduplication" + }, + { + "type": "doc", + "id": "version-2.9.x/cookbooks-non-persistent" + }, + { + "type": "doc", + "id": "version-2.9.x/cookbooks-retention-expiry" + }, + { + "type": "doc", + "id": "version-2.9.x/cookbooks-encryption" + }, + { + "type": "doc", + "id": "version-2.9.x/cookbooks-message-queue" + }, + { + "type": "doc", + "id": "version-2.9.x/cookbooks-bookkeepermetadata" + } + ] + }, + { + "type": "category", + "label": "Development", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/develop-tools" + }, + { + "type": "doc", + "id": "version-2.9.x/developing-binary-protocol" + }, + { + "type": "doc", + "id": "version-2.9.x/develop-schema" + }, + { + "type": "doc", + "id": "version-2.9.x/develop-load-manager" + } + ] + }, + { + "type": "category", + "label": "Reference", + "items": [ + { + "type": "doc", + "id": "version-2.9.x/reference-terminology" + }, + { + "type": "doc", + "id": "version-2.9.x/reference-cli-tools" + }, + { + "type": "doc", + "id": "version-2.9.x/reference-configuration" + }, + { + "type": "doc", + "id": "version-2.9.x/reference-metrics" + }, + { + "type": "doc", + "id": "version-2.9.x/reference-rest-api-overview" + } + ] + } + ] +} \ No newline at end of file diff --git a/site2/website/versions.json b/site2/website/versions.json index 2484e5d0a80c1..5abf7d0247905 100644 --- a/site2/website/versions.json +++ b/site2/website/versions.json @@ -1,14 +1,7 @@ [ - "2.10.1", - "2.10.0", - "2.9.3", - "2.9.2", - "2.9.1", - "2.9.0", - "2.8.3", - "2.8.2", - "2.8.1", - "2.8.0", + "2.10.x", + "2.9.x", + "2.8.x", "2.7.4", "2.7.3", "2.7.2",