From e353c9b5dbf7efacd6aac36179a5a89d3b47ec5c Mon Sep 17 00:00:00 2001 From: woile Date: Wed, 27 Nov 2024 13:34:17 +0000 Subject: [PATCH] =?UTF-8?q?Deploy=20preview=20for=20PR=20248=20?= =?UTF-8?q?=F0=9F=9B=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pr-preview/pr-248/middleware/index.html | 56 ++++++++++++---------- pr-preview/pr-248/search/search_index.json | 2 +- pr-preview/pr-248/stream/index.html | 10 ++-- 3 files changed, 38 insertions(+), 30 deletions(-) diff --git a/pr-preview/pr-248/middleware/index.html b/pr-preview/pr-248/middleware/index.html index fffeee98..94f48899 100644 --- a/pr-preview/pr-248/middleware/index.html +++ b/pr-preview/pr-248/middleware/index.html @@ -816,7 +816,15 @@

Middleware

25 26 27 -28
class MiddlewareProtocol(typing.Protocol):
+28
+29
+30
+31
+32
class MiddlewareProtocol(typing.Protocol):
+    next_call: types.NextMiddlewareCall
+    send: types.Send
+    stream: "Stream"
+
     def __init__(
         self,
         *,
@@ -946,15 +954,7 @@ 

Default Middleware

Source code in kstreams/middleware/middleware.py -
 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
+                
 71
  72
  73
  74
@@ -1030,7 +1030,15 @@ 

Default Middleware

144 145 146 -147
class ExceptionMiddleware(BaseMiddleware):
+147
+148
+149
+150
+151
+152
+153
+154
+155
class ExceptionMiddleware(BaseMiddleware):
     """
     This is always the first Middleware in the middleware stack
     to catch any exception that might occur. Any exception raised
@@ -1062,7 +1070,7 @@ 

Default Middleware

async def cleanup_policy(self, exc: Exception) -> None: """ - Execute clenup policicy according to the Stream configuration. + Execute cleanup policy according to the Stream configuration. At this point we are inside the asyncio.Lock `is_processing` as an event is being processed and an exeption has occured. @@ -1145,7 +1153,7 @@

-

Execute clenup policicy according to the Stream configuration.

+

Execute cleanup policy according to the Stream configuration.

At this point we are inside the asyncio.Lock is_processing as an event is being processed and an exeption has occured. The Lock must be released to stop the Stream @@ -1224,15 +1232,7 @@

Source code in kstreams/middleware/middleware.py -
 93
- 94
- 95
- 96
- 97
- 98
- 99
-100
-101
+              
101
 102
 103
 104
@@ -1278,9 +1278,17 @@ 

144 145 146 -147

async def cleanup_policy(self, exc: Exception) -> None:
+147
+148
+149
+150
+151
+152
+153
+154
+155
async def cleanup_policy(self, exc: Exception) -> None:
     """
-    Execute clenup policicy according to the Stream configuration.
+    Execute cleanup policy according to the Stream configuration.
 
     At this point we are inside the asyncio.Lock `is_processing`
     as an event is being processed and an exeption has occured.
diff --git a/pr-preview/pr-248/search/search_index.json b/pr-preview/pr-248/search/search_index.json
index 1f10bbfd..8e628fd7 100644
--- a/pr-preview/pr-248/search/search_index.json
+++ b/pr-preview/pr-248/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Kstreams","text":"

kstreams is a library/micro framework to use with kafka. It has a simple kafka streams implementation that gives certain guarantees; see below.

"},{"location":"#requirements","title":"Requirements","text":"

python 3.8+

"},{"location":"#installation","title":"Installation","text":"
pip install kstreams\n

You will need a worker; we recommend aiorun

pip install aiorun\n
"},{"location":"#usage","title":"Usage","text":"
import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n@stream_engine.stream(\"local--kstream\")\nasync def consume(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for i in range(5):\n        metadata = await stream_engine.send(\"local--kstreams\", value=payload)\n        print(f\"Message sent: {metadata}\")\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n
"},{"location":"#kafka-configuration","title":"Kafka configuration","text":"

Configure kafka using the kafka backend provided.
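
A minimal sketch of what that looks like, reusing the PLAINTEXT example from the backends docs (the broker address is only a local placeholder):

from kstreams.backends.kafka import Kafka\nfrom kstreams import create_engine\n\n# assumption: a local broker listening on the default PLAINTEXT port\nbackend = Kafka(bootstrap_servers=[\"localhost:9092\"])\nstream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n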

"},{"location":"#development","title":"Development","text":"

This repo requires the use of poetry instead of pip. Note: If you want to have the virtualenv in the same path as the project, you should first run poetry config --local virtualenvs.in-project true

To install the dependencies just execute:

poetry install\n

Then you can activate the virtualenv with

poetry shell\n

Run the tests:

./scripts/test\n

Run code linting (black and isort)

./scripts/lint\n
"},{"location":"#commit-messages","title":"Commit messages","text":"

The use of commitizen is recommended. Commitizen is part of the dev dependencies.

cz commit\n
"},{"location":"backends/","title":"Backends","text":"

The main idea of a backend is to supply the necessary configuration to create a connection with the backend.

kstreams currently has support for Kafka as a backend.

"},{"location":"backends/#kstreams.backends.kafka.Kafka","title":"kstreams.backends.kafka.Kafka","text":"

The Kafka backend validates the given attributes.

It uses pydantic internally.

Attributes:

Name Type Description bootstrap_servers List[str]

kafka list of hostname:port

security_protocol SecurityProtocol

Protocol used to communicate with brokers

ssl_context Optional[SSLContext]

a python std ssl.SSLContext instance; you can generate it with create_ssl_context or create_ssl_context_from_mem

sasl_mechanism SaslMechanism

Authentication mechanism when security_protocol is configured for SASL_PLAINTEXT or SASL_SSL

sasl_plain_username Optional[str]

username for sasl PLAIN authentication

sasl_plain_password Optional[str]

password for sasl PLAIN authentication

sasl_oauth_token_provider Optional[str]

smth

Raises:

Type Description ValidationError

a pydantic.ValidationError exception

"},{"location":"backends/#kstreams.backends.kafka.Kafka--plaintext","title":"PLAINTEXT","text":"

Example

from kstreams.backends.kafka import Kafka\nfrom kstreams import create_engine, Stream\n\nbackend = Kafka(bootstrap_servers=[\"localhost:9092\"])\nstream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n
"},{"location":"backends/#kstreams.backends.kafka.Kafka--ssl","title":"SSL","text":"

Example

Create SSL context
import ssl\n\nfrom kstreams.backends.kafka import Kafka\nfrom kstreams import create_engine, utils, Stream\n\n\ndef get_ssl_context() -> ssl.SSLContext:\n    return utils.create_ssl_context(\n        cafile=\"certificate-authority-file-path\",\n        capath=\"points-to-directory-with-several-ca-certificates\",\n        cadata=\"same-as-cafile-but-ASCII-or-bytes-format\",\n        certfile=\"client-certificate-file-name\",\n        keyfile=\"client-private-key-file-name\",\n        password=\"password-to-load-certificate-chain\",\n    )\n\nbackend = Kafka(\n    bootstrap_servers=[\"localhost:9094\"],\n    security_protocol=\"SSL\",\n    ssl_context=get_ssl_context(),\n)\n\nstream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n

Note

Check create ssl context util

Example

Create SSL context from memory
import ssl\n\nfrom kstreams.backends.kafka import Kafka\nfrom kstreams import create_engine, utils, Stream\n\n\ndef get_ssl_context() -> ssl.SSLContext:\n    return utils.create_ssl_context_from_mem(\n        cadata=\"ca-certificates-as-unicode\",\n        certdata=\"client-certificate-as-unicode\",\n        keydata=\"client-private-key-as-unicode\",\n        password=\"optional-password-to-load-certificate-chain\",\n    )\n\nbackend = Kafka(\n    bootstrap_servers=[\"localhost:9094\"],\n    security_protocol=\"SSL\",\n    ssl_context=get_ssl_context(),\n)\n\nstream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n

Note

Check create ssl context from memory util

Source code in kstreams/backends/kafka.py
class Kafka(BaseModel):\n    \"\"\"\n    The `Kafka` backend validates the given attributes.\n\n    It uses pydantic internally.\n\n    Attributes:\n        bootstrap_servers: kafka list of `hostname:port`\n        security_protocol: Protocol used to communicate with brokers\n        ssl_context: a python std `ssl.SSLContext` instance, you can generate\n            it with `create_ssl_context`\n            or `create_ssl_context_from_mem`\n        sasl_mechanism: Authentication mechanism when `security_protocol` is configured\n            for `SASL_PLAINTEXT` or `SASL_SSL`\n        sasl_plain_username: username for sasl PLAIN authentication\n        sasl_plain_password: password for sasl PLAIN authentication\n        sasl_oauth_token_provider: smth\n\n    Raises:\n        ValidationError: a `pydantic.ValidationError` exception\n\n    ## PLAINTEXT\n\n    !!! Example\n        ```python\n        from kstreams.backends.kafka import Kafka\n        from kstreams import create_engine, Stream\n\n        backend = Kafka(bootstrap_servers=[\"localhost:9092\"])\n        stream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n        ```\n\n    ## SSL\n\n    !!! Example\n        ```python title=\"Create SSL context\"\n        import ssl\n\n        from kstreams.backends.kafka import Kafka\n        from kstreams import create_engine, utils, Stream\n\n\n        def get_ssl_context() -> ssl.SSLContext:\n            return utils.create_ssl_context(\n                cafile=\"certificate-authority-file-path\",\n                capath=\"points-to-directory-with-several-ca-certificates\",\n                cadata=\"same-as-cafile-but-ASCII-or-bytes-format\",\n                certfile=\"client-certificate-file-name\",\n                keyfile=\"client-private-key-file-name\",\n                password=\"password-to-load-certificate-chain\",\n            )\n\n        backend = Kafka(\n            bootstrap_servers=[\"localhost:9094\"],\n            security_protocol=\"SSL\",\n            ssl_context=get_ssl_context(),\n        )\n\n        stream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n        ```\n\n        !!! note\n            Check [create ssl context util](https://kpn.github.io/kstreams/utils/#kstreams.utils.create_ssl_context)\n\n    !!! Example\n        ```python title=\"Create SSL context from memory\"\n        import ssl\n\n        from kstreams.backends.kafka import Kafka\n        from kstreams import create_engine, utils, Stream\n\n\n        def get_ssl_context() -> ssl.SSLContext:\n            return utils.create_ssl_context_from_mem(\n                cadata=\"ca-certificates-as-unicode\",\n                certdata=\"client-certificate-as-unicode\",\n                keydata=\"client-private-key-as-unicode\",\n                password=\"optional-password-to-load-certificate-chain\",\n            )\n\n        backend = Kafka(\n            bootstrap_servers=[\"localhost:9094\"],\n            security_protocol=\"SSL\",\n            ssl_context=get_ssl_context(),\n        )\n\n        stream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n        ```\n\n        !!! 
note\n            Check [create ssl context from memerory util](https://kpn.github.io/kstreams/utils/#kstreams.utils.create_ssl_context_from_mem)\n    \"\"\"\n\n    bootstrap_servers: List[str] = [\"localhost:9092\"]\n    security_protocol: SecurityProtocol = SecurityProtocol.PLAINTEXT\n\n    ssl_context: Optional[ssl.SSLContext] = None\n\n    sasl_mechanism: SaslMechanism = SaslMechanism.PLAIN\n    sasl_plain_username: Optional[str] = None\n    sasl_plain_password: Optional[str] = None\n    sasl_oauth_token_provider: Optional[str] = None\n    model_config = ConfigDict(arbitrary_types_allowed=True, use_enum_values=True)\n\n    @model_validator(mode=\"after\")\n    @classmethod\n    def protocols_validation(cls, values):\n        security_protocol = values.security_protocol\n\n        if security_protocol == SecurityProtocol.PLAINTEXT:\n            return values\n        elif security_protocol == SecurityProtocol.SSL:\n            if values.ssl_context is None:\n                raise ValueError(\"`ssl_context` is required\")\n            return values\n        elif security_protocol == SecurityProtocol.SASL_PLAINTEXT:\n            if values.sasl_mechanism is SaslMechanism.OAUTHBEARER:\n                # We don't perform a username and password check if OAUTHBEARER\n                return values\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_username is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_username` is required when using SASL_PLAIN\"\n                )\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_password is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_password` is required when using SASL_PLAIN\"\n                )\n            return values\n        elif security_protocol == SecurityProtocol.SASL_SSL:\n            if values.ssl_context is None:\n                raise ValueError(\"`ssl_context` is required\")\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_username is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_username` is required when using SASL_PLAIN\"\n                )\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_password is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_password` is required when using SASL_PLAIN\"\n                )\n            return values\n
"},{"location":"engine/","title":"StreamEngine","text":""},{"location":"engine/#kstreams.engine.StreamEngine","title":"kstreams.engine.StreamEngine","text":"

Attributes:

Name Type Description backend Kafka

Backend to connect. Default Kafka

consumer_class Consumer

The consumer class to use when instantiating a consumer. Default kstreams.Consumer

producer_class Producer

The producer class to use when instantiating the producer. Default kstreams.Producer

monitor PrometheusMonitor

Prometheus monitor that holds the metrics

title str | None

Engine name

serializer Serializer | None

Serializer to use when an event is produced.

deserializer Deserializer | None

Deserializer to be used when an event is consumed. If provided, it will be used in all Stream instances as a general one. To override it, you can provide one per Stream.

Example

Usage
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@kstreams.stream(\"local--hello-world\", group_id=\"example-group\")\nasync def consume(cr: kstreams.ConsumerRecord) -> None:\n    print(f\"showing bytes: {cr.value}\")\n\n\nawait stream_engine.start()\n
Source code in kstreams/engine.py
class StreamEngine:\n    \"\"\"\n    Attributes:\n        backend kstreams.backends.Kafka: Backend to connect. Default `Kafka`\n        consumer_class kstreams.Consumer: The consumer class to use when\n            instanciate a consumer. Default kstreams.Consumer\n        producer_class kstreams.Producer: The producer class to use when\n            instanciate the producer. Default kstreams.Producer\n        monitor kstreams.PrometheusMonitor: Prometheus monitor that holds\n            the [metrics](https://kpn.github.io/kstreams/metrics/)\n        title str | None: Engine name\n        serializer kstreams.serializers.Serializer | None: Serializer to\n            use when an event is produced.\n        deserializer kstreams.serializers.Deserializer | None: Deserializer\n            to be used when an event is consumed.\n            If provided it will be used in all Streams instances as a general one.\n            To override it per Stream, you can provide one per Stream\n\n    !!! Example\n        ```python title=\"Usage\"\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @kstreams.stream(\"local--hello-world\", group_id=\"example-group\")\n        async def consume(stream: kstreams.ConsumerRecord) -> None:\n            print(f\"showing bytes: {cr.value}\")\n\n\n        await stream_engine.start()\n        ```\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        backend: Kafka,\n        consumer_class: typing.Type[Consumer],\n        producer_class: typing.Type[Producer],\n        monitor: PrometheusMonitor,\n        title: typing.Optional[str] = None,\n        deserializer: Deprecated[typing.Optional[Deserializer]] = None,\n        serializer: typing.Optional[Serializer] = None,\n        on_startup: typing.Optional[EngineHooks] = None,\n        on_stop: typing.Optional[EngineHooks] = None,\n        after_startup: typing.Optional[EngineHooks] = None,\n        after_stop: typing.Optional[EngineHooks] = None,\n    ) -> None:\n        self.title = title\n        self.backend = backend\n        self.consumer_class = consumer_class\n        self.producer_class = producer_class\n        self.deserializer = deserializer\n        self.serializer = serializer\n        self.monitor = monitor\n        self._producer: typing.Optional[typing.Type[Producer]] = None\n        self._streams: typing.List[Stream] = []\n        self._on_startup = [] if on_startup is None else list(on_startup)\n        self._on_stop = [] if on_stop is None else list(on_stop)\n        self._after_startup = [] if after_startup is None else list(after_startup)\n        self._after_stop = [] if after_stop is None else list(after_stop)\n\n    async def send(\n        self,\n        topic: str,\n        value: typing.Any = None,\n        key: typing.Any = None,\n        partition: typing.Optional[int] = None,\n        timestamp_ms: typing.Optional[int] = None,\n        headers: typing.Optional[Headers] = None,\n        serializer: typing.Optional[Serializer] = None,\n        serializer_kwargs: typing.Optional[typing.Dict] = None,\n    ):\n        \"\"\"\n        Attributes:\n            topic str: Topic name to send the event to\n            value Any: Event value\n            key str | None: Event key\n            partition int | None: Topic partition\n            timestamp_ms int | None: Event timestamp in miliseconds\n            headers Dict[str, str] | None: Event headers\n            serializer 
kstreams.serializers.Serializer | None: Serializer to\n                encode the event\n            serializer_kwargs Dict[str, Any] | None: Serializer kwargs\n        \"\"\"\n        if self._producer is None:\n            raise EngineNotStartedException()\n\n        serializer = serializer or self.serializer\n\n        # serialize only when value and serializer are present\n        if value is not None and serializer is not None:\n            value = await serializer.serialize(\n                value, headers=headers, serializer_kwargs=serializer_kwargs\n            )\n\n        encoded_headers = None\n        if headers is not None:\n            encoded_headers = encode_headers(headers)\n\n        fut = await self._producer.send(\n            topic,\n            value=value,\n            key=key,\n            partition=partition,\n            timestamp_ms=timestamp_ms,\n            headers=encoded_headers,\n        )\n        metadata: RecordMetadata = await fut\n        self.monitor.add_topic_partition_offset(\n            topic, metadata.partition, metadata.offset\n        )\n\n        return metadata\n\n    async def start(self) -> None:\n        # Execute on_startup hooks\n        await execute_hooks(self._on_startup)\n\n        # add the producer and streams to the Monitor\n        self.monitor.add_producer(self._producer)\n        self.monitor.add_streams(self._streams)\n\n        await self.start_producer()\n        await self.start_streams()\n\n        # Execute after_startup hooks\n        await execute_hooks(self._after_startup)\n\n    def on_startup(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run before the engine starts.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable before engine starts\n\n        !!! Example\n            ```python title=\"Engine before startup\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.on_startup\n            async def init_db() -> None:\n                print(\"Initializing Database Connections\")\n                await init_db()\n\n\n            @stream_engine.on_startup\n            async def start_background_task() -> None:\n                print(\"Some background task\")\n            ```\n        \"\"\"\n        self._on_startup.append(func)\n        return func\n\n    def on_stop(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run before the engine stops.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable before engine stops\n\n        !!! 
Example\n            ```python title=\"Engine before stops\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.on_stop\n            async def close_db() -> None:\n                print(\"Closing Database Connections\")\n                await db_close()\n            ```\n        \"\"\"\n        self._on_stop.append(func)\n        return func\n\n    def after_startup(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run after the engine starts.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable after engine starts\n\n        !!! Example\n            ```python title=\"Engine after startup\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.after_startup\n            async def after_startup() -> None:\n                print(\"Set pod as healthy\")\n                await mark_healthy_pod()\n            ```\n        \"\"\"\n        self._after_startup.append(func)\n        return func\n\n    def after_stop(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run after the engine stops.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable after engine stops\n\n        !!! 
Example\n            ```python title=\"Engine after stops\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.after_stop\n            async def after_stop() -> None:\n                print(\"Finishing backgrpund tasks\")\n            ```\n        \"\"\"\n        self._after_stop.append(func)\n        return func\n\n    async def stop(self) -> None:\n        # Execute on_startup hooks\n        await execute_hooks(self._on_stop)\n\n        await self.monitor.stop()\n        await self.stop_producer()\n        await self.stop_streams()\n\n        # Execute after_startup hooks\n        await execute_hooks(self._after_stop)\n\n    async def stop_producer(self):\n        if self._producer is not None:\n            await self._producer.stop()\n        logger.info(\"Producer has STOPPED....\")\n\n    async def start_producer(self, **kwargs) -> None:\n        if self.producer_class is None:\n            return None\n        config = {**self.backend.model_dump(), **kwargs}\n        self._producer = self.producer_class(**config)\n        if self._producer is None:\n            return None\n        await self._producer.start()\n\n    async def start_streams(self) -> None:\n        # Only start the Streams that are not async_generators\n        streams = [\n            stream\n            for stream in self._streams\n            if not inspect.isasyncgenfunction(stream.func)\n        ]\n\n        await self._start_streams_on_background_mode(streams)\n\n    async def _start_streams_on_background_mode(\n        self, streams: typing.List[Stream]\n    ) -> None:\n        # start all the streams\n        for stream in streams:\n            asyncio.create_task(stream.start())\n\n        # start monitoring\n        asyncio.create_task(self.monitor.start())\n\n    async def stop_streams(self) -> None:\n        for stream in self._streams:\n            await stream.stop()\n        logger.info(\"Streams have STOPPED....\")\n\n    async def clean_streams(self):\n        await self.stop_streams()\n        self._streams = []\n\n    def exist_stream(self, name: str) -> bool:\n        stream = self.get_stream(name)\n        return True if stream is not None else False\n\n    def get_stream(self, name: str) -> typing.Optional[Stream]:\n        stream = next((stream for stream in self._streams if stream.name == name), None)\n\n        return stream\n\n    def add_stream(\n        self, stream: Stream, error_policy: typing.Optional[StreamErrorPolicy] = None\n    ) -> None:\n        \"\"\"\n        Add a stream to the engine.\n\n        This method registers a new stream with the engine, setting up necessary\n        configurations and handlers. 
If a stream with the same name already exists,\n        a DuplicateStreamException is raised.\n\n        Args:\n            stream: The stream to be added.\n            error_policy: An optional error policy to be applied to the stream.\n                You should probably set directly when instanciating a Stream, not here.\n\n        Raises:\n            DuplicateStreamException: If a stream with the same name already exists.\n\n        Notes:\n            - If the stream does not have a deserializer, the engine's deserializer\n              is assigned to it.\n            - If the stream does not have a rebalance listener, a default\n              MetricsRebalanceListener is assigned.\n            - The stream's UDF handler is set up with the provided function and\n              engine's send method.\n            - If the stream's UDF handler type is not NO_TYPING, a middleware stack\n              is built for the stream's function.\n        \"\"\"\n        if self.exist_stream(stream.name):\n            raise DuplicateStreamException(name=stream.name)\n\n        if error_policy is not None:\n            stream.error_policy = error_policy\n\n        stream.backend = self.backend\n        if stream.deserializer is None:\n            stream.deserializer = self.deserializer\n        self._streams.append(stream)\n\n        if stream.rebalance_listener is None:\n            # set the stream to the listener to it will be available\n            # when the callbacks are called\n            stream.rebalance_listener = MetricsRebalanceListener()\n\n        stream.rebalance_listener.stream = stream\n        stream.rebalance_listener.engine = self\n\n        stream.udf_handler = UdfHandler(\n            next_call=stream.func,\n            send=self.send,\n            stream=stream,\n        )\n\n        # NOTE: When `no typing` support is deprecated this check can\n        # be removed\n        if stream.udf_handler.type != UDFType.NO_TYPING:\n            stream.func = self._build_stream_middleware_stack(stream=stream)\n\n    def _build_stream_middleware_stack(self, *, stream: Stream) -> NextMiddlewareCall:\n        assert stream.udf_handler, \"UdfHandler can not be None\"\n\n        middlewares = stream.get_middlewares(self)\n        next_call = stream.udf_handler\n        for middleware, options in reversed(middlewares):\n            next_call = middleware(\n                next_call=next_call, send=self.send, stream=stream, **options\n            )\n        return next_call\n\n    async def remove_stream(self, stream: Stream) -> None:\n        consumer = stream.consumer\n        self._streams.remove(stream)\n        await stream.stop()\n\n        if consumer is not None:\n            self.monitor.clean_stream_consumer_metrics(consumer=consumer)\n\n    def stream(\n        self,\n        topics: typing.Union[typing.List[str], str],\n        *,\n        name: typing.Optional[str] = None,\n        deserializer: Deprecated[typing.Optional[Deserializer]] = None,\n        initial_offsets: typing.Optional[typing.List[TopicPartitionOffset]] = None,\n        rebalance_listener: typing.Optional[RebalanceListener] = None,\n        middlewares: typing.Optional[typing.List[Middleware]] = None,\n        subscribe_by_pattern: bool = False,\n        error_policy: StreamErrorPolicy = StreamErrorPolicy.STOP,\n        **kwargs,\n    ) -> typing.Callable[[StreamFunc], Stream]:\n        def decorator(func: StreamFunc) -> Stream:\n            stream_from_func = stream_func(\n                topics,\n                
name=name,\n                deserializer=deserializer,\n                initial_offsets=initial_offsets,\n                rebalance_listener=rebalance_listener,\n                middlewares=middlewares,\n                subscribe_by_pattern=subscribe_by_pattern,\n                **kwargs,\n            )(func)\n            self.add_stream(stream_from_func, error_policy=error_policy)\n\n            return stream_from_func\n\n        return decorator\n
"},{"location":"engine/#kstreams.engine.StreamEngine.send","title":"send(topic, value=None, key=None, partition=None, timestamp_ms=None, headers=None, serializer=None, serializer_kwargs=None) async","text":"

Attributes:

Name Type Description topic str

Topic name to send the event to

value Any

Event value

key str | None

Event key

partition int | None

Topic partition

timestamp_ms int | None

Event timestamp in milliseconds

headers Dict[str, str] | None

Event headers

serializer Serializer | None

Serializer to encode the event

serializer_kwargs Dict[str, Any] | None

Serializer kwargs

Source code in kstreams/engine.py
async def send(\n    self,\n    topic: str,\n    value: typing.Any = None,\n    key: typing.Any = None,\n    partition: typing.Optional[int] = None,\n    timestamp_ms: typing.Optional[int] = None,\n    headers: typing.Optional[Headers] = None,\n    serializer: typing.Optional[Serializer] = None,\n    serializer_kwargs: typing.Optional[typing.Dict] = None,\n):\n    \"\"\"\n    Attributes:\n        topic str: Topic name to send the event to\n        value Any: Event value\n        key str | None: Event key\n        partition int | None: Topic partition\n        timestamp_ms int | None: Event timestamp in miliseconds\n        headers Dict[str, str] | None: Event headers\n        serializer kstreams.serializers.Serializer | None: Serializer to\n            encode the event\n        serializer_kwargs Dict[str, Any] | None: Serializer kwargs\n    \"\"\"\n    if self._producer is None:\n        raise EngineNotStartedException()\n\n    serializer = serializer or self.serializer\n\n    # serialize only when value and serializer are present\n    if value is not None and serializer is not None:\n        value = await serializer.serialize(\n            value, headers=headers, serializer_kwargs=serializer_kwargs\n        )\n\n    encoded_headers = None\n    if headers is not None:\n        encoded_headers = encode_headers(headers)\n\n    fut = await self._producer.send(\n        topic,\n        value=value,\n        key=key,\n        partition=partition,\n        timestamp_ms=timestamp_ms,\n        headers=encoded_headers,\n    )\n    metadata: RecordMetadata = await fut\n    self.monitor.add_topic_partition_offset(\n        topic, metadata.partition, metadata.offset\n    )\n\n    return metadata\n
"},{"location":"engine/#kstreams.engine.StreamEngine.on_startup","title":"on_startup(func)","text":"

A list of callables to run before the engine starts. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Function to call before the engine starts

Example

Engine before startup
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.on_startup\nasync def init_db() -> None:\n    print(\"Initializing Database Connections\")\n    await init_db()\n\n\n@stream_engine.on_startup\nasync def start_background_task() -> None:\n    print(\"Some background task\")\n
Source code in kstreams/engine.py
def on_startup(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run before the engine starts.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable before engine starts\n\n    !!! Example\n        ```python title=\"Engine before startup\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.on_startup\n        async def init_db() -> None:\n            print(\"Initializing Database Connections\")\n            await init_db()\n\n\n        @stream_engine.on_startup\n        async def start_background_task() -> None:\n            print(\"Some background task\")\n        ```\n    \"\"\"\n    self._on_startup.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.on_stop","title":"on_stop(func)","text":"

A list of callables to run before the engine stops. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Function to call before the engine stops

Example

Engine before stops
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.on_stop\nasync def close_db() -> None:\n    print(\"Closing Database Connections\")\n    await db_close()\n
Source code in kstreams/engine.py
def on_stop(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run before the engine stops.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable before engine stops\n\n    !!! Example\n        ```python title=\"Engine before stops\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.on_stop\n        async def close_db() -> None:\n            print(\"Closing Database Connections\")\n            await db_close()\n        ```\n    \"\"\"\n    self._on_stop.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.after_startup","title":"after_startup(func)","text":"

A list of callables to run after the engine starts. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Function to call after the engine starts

Example

Engine after startup
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.after_startup\nasync def after_startup() -> None:\n    print(\"Set pod as healthy\")\n    await mark_healthy_pod()\n
Source code in kstreams/engine.py
def after_startup(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run after the engine starts.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable after engine starts\n\n    !!! Example\n        ```python title=\"Engine after startup\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.after_startup\n        async def after_startup() -> None:\n            print(\"Set pod as healthy\")\n            await mark_healthy_pod()\n        ```\n    \"\"\"\n    self._after_startup.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.after_stop","title":"after_stop(func)","text":"

A list of callables to run after the engine stops. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Function to call after the engine stops

Example

Engine after stops
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.after_stop\nasync def after_stop() -> None:\n    print(\"Finishing background tasks\")\n
Source code in kstreams/engine.py
def after_stop(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run after the engine stops.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable after engine stops\n\n    !!! Example\n        ```python title=\"Engine after stops\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.after_stop\n        async def after_stop() -> None:\n            print(\"Finishing backgrpund tasks\")\n        ```\n    \"\"\"\n    self._after_stop.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.add_stream","title":"add_stream(stream, error_policy=None)","text":"

Add a stream to the engine.

This method registers a new stream with the engine, setting up necessary configurations and handlers. If a stream with the same name already exists, a DuplicateStreamException is raised.

Parameters:

Name Type Description Default stream Stream

The stream to be added.

required error_policy Optional[StreamErrorPolicy]

An optional error policy to be applied to the stream. You should probably set it directly when instantiating a Stream, not here.

None

Raises:

Type Description DuplicateStreamException

If a stream with the same name already exists.

Notes
  • If the stream does not have a deserializer, the engine's deserializer is assigned to it.
  • If the stream does not have a rebalance listener, a default MetricsRebalanceListener is assigned.
  • The stream's UDF handler is set up with the provided function and engine's send method.
  • If the stream's UDF handler type is not NO_TYPING, a middleware stack is built for the stream's function.
Source code in kstreams/engine.py
def add_stream(\n    self, stream: Stream, error_policy: typing.Optional[StreamErrorPolicy] = None\n) -> None:\n    \"\"\"\n    Add a stream to the engine.\n\n    This method registers a new stream with the engine, setting up necessary\n    configurations and handlers. If a stream with the same name already exists,\n    a DuplicateStreamException is raised.\n\n    Args:\n        stream: The stream to be added.\n        error_policy: An optional error policy to be applied to the stream.\n            You should probably set directly when instanciating a Stream, not here.\n\n    Raises:\n        DuplicateStreamException: If a stream with the same name already exists.\n\n    Notes:\n        - If the stream does not have a deserializer, the engine's deserializer\n          is assigned to it.\n        - If the stream does not have a rebalance listener, a default\n          MetricsRebalanceListener is assigned.\n        - The stream's UDF handler is set up with the provided function and\n          engine's send method.\n        - If the stream's UDF handler type is not NO_TYPING, a middleware stack\n          is built for the stream's function.\n    \"\"\"\n    if self.exist_stream(stream.name):\n        raise DuplicateStreamException(name=stream.name)\n\n    if error_policy is not None:\n        stream.error_policy = error_policy\n\n    stream.backend = self.backend\n    if stream.deserializer is None:\n        stream.deserializer = self.deserializer\n    self._streams.append(stream)\n\n    if stream.rebalance_listener is None:\n        # set the stream to the listener to it will be available\n        # when the callbacks are called\n        stream.rebalance_listener = MetricsRebalanceListener()\n\n    stream.rebalance_listener.stream = stream\n    stream.rebalance_listener.engine = self\n\n    stream.udf_handler = UdfHandler(\n        next_call=stream.func,\n        send=self.send,\n        stream=stream,\n    )\n\n    # NOTE: When `no typing` support is deprecated this check can\n    # be removed\n    if stream.udf_handler.type != UDFType.NO_TYPING:\n        stream.func = self._build_stream_middleware_stack(stream=stream)\n
"},{"location":"getting_started/","title":"Getting Started","text":"

You can start using kstreams with simple producers and consumers and/or integrate it with any async framework like FastAPI

"},{"location":"getting_started/#simple-consumer-and-producer","title":"Simple consumer and producer","text":"Simple use case
import asyncio\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\"local--py-stream\", group_id=\"de-my-partition\")\nasync def consume(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for i in range(5):\n        metadata = await stream_engine.send(\"local--py-stream\", value=payload, key=\"1\")\n        print(f\"Message sent: {metadata}\")\n        await asyncio.sleep(5)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown():\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    loop = asyncio.get_event_loop()\n    try:\n        loop.run_until_complete(start())\n        loop.run_forever()\n    finally:\n        loop.run_until_complete(shutdown())\n        loop.close()\n

(This script is complete, it should run \"as is\")

"},{"location":"getting_started/#recommended-usage","title":"Recommended usage","text":"

In the previous example you can see some boilerplate regarding how to start the program. We recommend using aiorun, so you don't have to worry about setting signal handlers, shutdown callbacks, graceful shutdown and closing the event loop.

Usage with aiorun
import asyncio\nimport aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\"local--py-stream\", group_id=\"de-my-partition\")\nasync def consume(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for i in range(5):\n        metadata = await stream_engine.send(\"local--py-stream\", value=payload, key=\"1\")\n        print(f\"Message sent: {metadata}\")\n        await asyncio.sleep(5)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n

(This script is complete, it should run \"as is\")

"},{"location":"getting_started/#fastapi","title":"FastAPI","text":"

The following code example shows how kstreams can be integrated with any async framework like FastAPI. The full example can be found here

First, we need to create an engine:

Create the StreamEngine
# streaming.engine.py\nfrom kstreams import create_engine\n\nstream_engine = create_engine(\n    title=\"my-stream-engine\",\n)\n

Define the streams:

Application stream
# streaming.streams.py\nfrom .engine import stream_engine\nfrom kstreams import ConsumerRecord\n\n\n@stream_engine.stream(\"local--kstream\")\nasync def stream(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n

Create the FastAPI application:

FastAPI
# app.py\nfrom fastapi import FastAPI\nfrom starlette.responses import Response\nfrom starlette_prometheus import PrometheusMiddleware, metrics\n\nfrom .streaming.streams import stream_engine\n\napp = FastAPI()\n\n@app.on_event(\"startup\")\nasync def startup_event():\n    await stream_engine.start()\n\n@app.on_event(\"shutdown\")\nasync def shutdown_event():\n    await stream_engine.stop()\n\n\n@app.get(\"/events\")\nasync def post_produce_event() -> Response:\n    payload = '{\"message\": \"hello world!\"}'\n\n    metadata = await stream_engine.send(\n        \"local--kstream\",\n        value=payload.encode(),\n    )\n    msg = (\n        f\"Produced event on topic: {metadata.topic}, \"\n        f\"part: {metadata.partition}, offset: {metadata.offset}\"\n    )\n\n    return Response(msg)\n\n\napp.add_middleware(PrometheusMiddleware, filter_unhandled_paths=True)\napp.add_api_route(\"/metrics\", metrics)\n
"},{"location":"getting_started/#changing-kafka-settings","title":"Changing Kafka settings","text":"

To modify the settings of a cluster, like the servers, refer to the backends docs

"},{"location":"large_project_structure/","title":"Large Projects","text":"

If you have a large project with multiple streams, we recommend the following project structure:

\u251c\u2500\u2500 my-project\n\u2502   \u251c\u2500\u2500 my_project\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 __init__.py\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 app.py\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 resources.py\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 streams.py\n\u2502   \u2502\u00a0\u00a0 \u2514\u2500\u2500 streams_roster.py\n\u2502   \u2502\u2500\u2500 tests\n\u2502   \u2502   \u251c\u2500\u2500 __init__.py\n\u2502   \u2502   \u251c\u2500\u2500 conftest.py\n\u2502   \u2502\u2500\u2500 pyproject.toml\n\u2502   \u2502\u2500\u2500 README.md\n
  • The file my_project/resources.py contains the creation of the StreamEngine
  • The file my_project/app.py contains the entrypoint of your program
  • The file my_project/streams.py contains all the Streams

A full project example ready to use can be found here

Note

This is just a recommendation; there are many ways to structure your project

"},{"location":"large_project_structure/#resources","title":"Resources","text":"

This python module contains any global resource that will be used later in the application, for example DB connections or the StreamEngine. Typically we will have the following:

from kstreams import backends, create_engine\n\nbackend = backends.Kafka(\n    bootstrap_servers=[\"localhost:9092\"],\n    security_protocol=backends.kafka.SecurityProtocol.PLAINTEXT,\n)\n\nstream_engine = create_engine(\n    title=\"my-stream-engine\",\n    backend=backend,\n)\n

Then later stream_engine can be reused to start the application.

"},{"location":"large_project_structure/#streams","title":"Streams","text":"

When starting your project you can have any number of Streams, each with its handler, let's say in the streams.py module. All of the Streams will run next to each other, and because they are in the same project it is easy to share common code. However, this comes with a scalability downside, as it is not possible to take advantage of kafka and scale up Streams individually. In future versions the StreamEngine will be able to select which Stream(s) should run to mitigate this issue. Typically, your streams.py will look like:

from kstreams import Stream\n\nfrom .streams_roster import stream_roster, stream_two_roster\n\n\nmy_stream = Stream(\n    \"local--hello-world\",\n    func=stream_roster,\n    config={\n        \"group_id\": \"example-group\",\n    },\n    ...\n)\n\nmy_second_stream = Stream(\n    \"local--hello-world-2\",\n    func=stream_two_roster,\n    config={\n        \"group_id\": \"example-group-2\",\n    },\n    ...\n)\n\n...\n

and streams_roster.py contains all the coroutines that will be executed when an event arrives

import logging\n\nfrom kstreams import ConsumerRecord, Send, Stream\n\nlogger = logging.getLogger(__name__)\n\n\nasync def stream_roster(cr: ConsumerRecord, send: Send) -> None:\n    logger.info(f\"showing bytes: {cr.value}\")\n    value = f\"Event confirmed. {cr.value}\"\n\n    await send(\n        \"another-topic-to-wink\",\n        value=value.encode(),\n        key=\"1\",\n    )\n\n\nasync def stream_two_roster(cr: ConsumerRecord, send: Send, stream: Stream) -> None:\n    ...\n

It is worth noting three things:

  • We separate the Stream with its coroutine to be able to test the business logic easily
  • If you need to produce events inside a Stream add the send coroutine using dependency-injection
  • We are not using StreamEngine at all to avoid circular import errors
"},{"location":"large_project_structure/#application","title":"Application","text":"

The entrypoint is usually in app.py. The module contains the import of stream_engine, its hooks and the streams to be added to the engine:

import aiorun\nimport asyncio\nimport logging\n\nfrom kstreams.stream_utils import StreamErrorPolicy\n\nfrom .resources import stream_engine\nfrom .streams import my_stream, my_second_stream\n\nlogger = logging.getLogger(__name__)\n\n\n# hooks\n@stream_engine.after_startup\nasync def init_events():\n    await stream_engine.send(\"local--hello-world\", value=\"Hi Kstreams!\")\n\n\n# add the stream to the stream_engine\nstream_engine.add_stream(my_stream, error_policy=StreamErrorPolicy.RESTART)\nstream_engine.add_stream(my_second_stream, error_policy=StreamErrorPolicy.STOP_ENGINE)\n\n\nasync def start():\n    await stream_engine.start()\n\n\nasync def stop(loop: asyncio.AbstractEventLoop):\n    await stream_engine.stop()\n\n\ndef main():\n    logging.basicConfig(level=logging.INFO)\n    logger.info(\"Starting application...\")\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=stop)\n

To run it we recommend aiorun. It can also be run with asyncio directly, but aiorun does all the boilerplate for us.

"},{"location":"large_project_structure/#tests","title":"Tests","text":"

In this module you test your application using the TestStreamClient, usually provided as a fixture thanks to pytest. The package pytest-asyncio is also needed to test async code.

# conftest.py\nimport pytest\n\nfrom kstreams.test_utils import TestStreamClient\n\nfrom my_project.resources import stream_engine\n\n\n@pytest.fixture\ndef stream_client():\n    return TestStreamClient(stream_engine=stream_engine)\n

then you can test your streams

# test_app.py\nimport pytest\n\n\n@pytest.mark.asyncio\nasync def test_my_stream(stream_client):\n    topic = \"local--hello-world\"  # Use the same topic as the stream\n    event = b'{\"message\": \"Hello world!\"}'\n\n    async with stream_client:\n        metadata = await stream_client.send(topic, value=event, key=\"1\")\n        assert metadata.topic == topic\n
"},{"location":"metrics/","title":"Metrics","text":"

Metrics are generated by prometheus_client. You are responsible for setting up a webserver to expose the metrics.
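
For instance, a minimal sketch of a standalone worker (assuming kstreams registers its metrics in prometheus_client's default registry; the port is an arbitrary choice) can expose them with prometheus_client's built-in HTTP server:

import aiorun\nfrom prometheus_client import start_http_server\n\nfrom kstreams import create_engine\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\nasync def start():\n    # serve the default prometheus registry on http://localhost:8000/metrics\n    start_http_server(8000)\n    await stream_engine.start()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n

In a web application you can instead mount a /metrics route, as the FastAPI example in Getting Started does with starlette_prometheus.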

"},{"location":"metrics/#metrics","title":"Metrics","text":""},{"location":"metrics/#producer","title":"Producer","text":"
  • topic_partition_offsets: Gauge of offsets per topic/partition
"},{"location":"metrics/#consumer","title":"Consumer","text":"
  • consumer_committed: Gauge of the consumer committed offset per topic/partition in a consumer group
  • consumer_position: Gauge of consumer current position per topic/partition in a consumer group
  • consumer_highwater: Gauge of consumer highwater per topic/partition in a consumer group
  • consumer_lag: Gauge of current consumer lag per topic/partition in a consumer group calculated with the last committed offset
  • position_lag: Gauge of current consumer position_lag per topic/partition in a consumer group calculated using the consumer position
"},{"location":"middleware/","title":"Middleware","text":"

Kstreams allows you to include middlewares for adding behavior to streams.

A middleware is a callable that works with every ConsumerRecord (CR) before and after it is processed by a specific stream. Middlewares also have access to the stream and send function.

  • It takes each CR that arrives at a kafka topic.
  • Then it can do something to the CR or run any needed code.
  • Then it passes the CR to be processed by another callable (other middleware or stream).
  • Once the CR is processed by the stream, the chain is \"completed\".
  • If there is code after the self.next_call(cr) then it will be executed.

Kstreams middlewares have the following protocol:

Bases: Protocol

Source code in kstreams/middleware/middleware.py
class MiddlewareProtocol(typing.Protocol):\n    def __init__(\n        self,\n        *,\n        next_call: types.NextMiddlewareCall,\n        send: types.Send,\n        stream: \"Stream\",\n        **kwargs: typing.Any,\n    ) -> None: ...  #  pragma: no cover\n\n    async def __call__(\n        self, cr: types.ConsumerRecord\n    ) -> typing.Any: ...  #  pragma: no cover\n

Note

The __call__ method can return anything, so previous calls can use the returned value. Make sure that the line return await self.next_call(cr) is in your method.
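
For instance, a sketch of a middleware that times the rest of the chain illustrates both points: code after self.next_call(cr) runs once processing finishes, and the returned value is passed back up the chain (TimingMiddleware is an illustrative name, not part of kstreams):

import time\nimport typing\n\nfrom kstreams import ConsumerRecord, middleware\n\n\nclass TimingMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord) -> typing.Any:\n        start = time.monotonic()\n        # the result comes from the next middleware or the stream itself\n        result = await self.next_call(cr)\n        print(f\"Processing took {time.monotonic() - start:.3f}s\")\n        return result\n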

Warning

Middlewares only work with the new Dependency Injection approach
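
In practice this means the stream handler must use typed parameters so kstreams can inject them, as in this sketch (the topic name is illustrative; the signature mirrors the roster example in the Large Projects section):

from kstreams import ConsumerRecord, Send, Stream\n\nfrom .engine import stream_engine\n\n\n@stream_engine.stream(\"kstreams-topic\")\nasync def processor(cr: ConsumerRecord, send: Send, stream: Stream) -> None:\n    # cr, send and stream are injected based on the type annotations\n    print(f\"Event consumed: {cr.value}\")\n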

"},{"location":"middleware/#creating-a-middleware","title":"Creating a middleware","text":"

To create a middleware you have to create a class that inherits from BaseMiddleware and define the async def __call__ method. Let's consider that we want to save the CR to elastic before it is processed:

import typing\n\nfrom kstreams import ConsumerRecord, middleware\n\nasync def save_to_elastic(cr: ConsumerRecord) -> None:\n    ...\n\n\nclass ElasticMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord) -> typing.Any:\n        # save to elastic before calling the next\n        await save_to_elastic(cr)\n\n        # the next call could be another middleware\n        return await self.next_call(cr)\n

Then, we have to include the middleware:

from kstreams import ConsumerRecord, middleware\n\nfrom .engine import stream_engine\n\n\nmiddlewares = [middleware.Middleware(ElasticMiddleware)]\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    ...\n

Note

The Middleware concept also applies to async generators (yield from a stream)

"},{"location":"middleware/#adding-extra-configuration-to-middlewares","title":"Adding extra configuration to middlewares","text":"

If you want to provide extra configuration to a middleware, you should override the init method with the extra options as keyword arguments and then call super().__init__(**kwargs)

Let's consider that we want to send an event to a specific topic when a ValueError is raised inside a stream (Dead Letter Queue)

from kstreams import ConsumerRecord, types, Stream, middleware\n\n\nclass DLQMiddleware(middleware.BaseMiddleware):\n    def __init__(self, *, topic: str, **kwargs) -> None:\n        super().__init__(**kwargs)\n        self.topic = topic\n\n    async def __call__(self, cr: ConsumerRecord):\n        try:\n            return await self.next_call(cr)\n        except ValueError:\n            await self.send(self.topic, key=cr.key, value=cr.value)\n\n\n# Create the middlewares\nmiddlewares = [\n    middleware.Middleware(\n        DLQMiddleware, topic=\"kstreams-dlq-topic\"\n    )\n]\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    if cr.value == b\"joker\":\n        raise ValueError(\"Joker received...\")\n
"},{"location":"middleware/#default-middleware","title":"Default Middleware","text":"

This is always the first Middleware in the middleware stack to catch any exception that might occur. Any exception raised when consuming events that is not handled by the end user will be handled by this ExceptionMiddleware, executing the error_policy that was established.

Source code in kstreams/middleware/middleware.py
class ExceptionMiddleware(BaseMiddleware):\n    \"\"\"\n    This is always the first Middleware in the middleware stack\n    to catch any exception that might occur. Any exception raised\n    when consuming events that is not handled by the end user\n    will be handled by this ExceptionMiddleware executing the\n    policy_error that was stablished.\n    \"\"\"\n\n    def __init__(\n        self, *, engine: \"StreamEngine\", error_policy: StreamErrorPolicy, **kwargs\n    ) -> None:\n        super().__init__(**kwargs)\n        self.engine = engine\n        self.error_policy = error_policy\n\n    async def __call__(self, cr: types.ConsumerRecord) -> typing.Any:\n        try:\n            return await self.next_call(cr)\n        except Exception as exc:\n            logger.exception(\n                \"Unhandled error occurred while listening to the stream. \"\n                f\"Stream consuming from topics {self.stream.topics} CRASHED!!! \\n\\n \"\n            )\n            if sys.version_info >= (3, 11):\n                exc.add_note(f\"Handler: {self.stream.func}\")\n                exc.add_note(f\"Topics: {self.stream.topics}\")\n\n            await self.cleanup_policy(exc)\n\n    async def cleanup_policy(self, exc: Exception) -> None:\n        \"\"\"\n        Execute clenup policicy according to the Stream configuration.\n\n        At this point we are inside the asyncio.Lock `is_processing`\n        as an event is being processed and an exeption has occured.\n        The Lock must be released to stop the Stream\n        (which must happen for any policy), then before re-raising\n        the exception the Lock must be acquire again to continue the processing\n\n        Exception and policies:\n\n            - STOP: The exception is re-raised as the Stream will be stopped\n              and the end user will deal with it\n\n            - STOP_ENGINE: The exception is re-raised as the Engine will be stopped\n              (all Streams and Producer) and the end user will deal with it\n\n            - RESTART: The exception is not re-raised as the Stream\n              will recover and continue the processing. The logger.exception\n              from __call__ will record that something went wrong\n\n            - STOP_APPLICATION: The exception is not re-raised as the entire\n              application will be stopped. This is only useful when using kstreams\n              with another library like FastAPI. 
The logger.exception\n              from __call__ will record that something went wrong\n\n        Args:\n            exc (Exception): Any Exception that causes the Stream to crash\n\n        Raises:\n            exc: Exception is the policy is `STOP` or `STOP_ENGINE`\n        \"\"\"\n        self.stream.is_processing.release()\n\n        if self.error_policy == StreamErrorPolicy.RESTART:\n            await self.stream.stop()\n            await self.stream.start()\n        elif self.error_policy == StreamErrorPolicy.STOP:\n            await self.stream.stop()\n            # acquire `is_processing` Lock again to resume processing\n            # and avoid `RuntimeError: Lock is not acquired.`\n            await self.stream.is_processing.acquire()\n            raise exc\n        elif self.error_policy == StreamErrorPolicy.STOP_ENGINE:\n            await self.engine.stop()\n            # acquire `is_processing` Lock again to resume processing\n            # and avoid `RuntimeError: Lock is not acquired.`\n            await self.stream.is_processing.acquire()\n            raise exc\n        else:\n            # STOP_APPLICATION\n            await self.engine.stop()\n            await self.stream.is_processing.acquire()\n            signal.raise_signal(signal.SIGTERM)\n
"},{"location":"middleware/#kstreams.middleware.middleware.ExceptionMiddleware.cleanup_policy","title":"cleanup_policy(exc) async","text":"

Execute cleanup policy according to the Stream configuration.

At this point we are inside the asyncio.Lock is_processing, as an event is being processed and an exception has occurred. The Lock must be released to stop the Stream (which must happen for any policy); then, before re-raising the exception, the Lock must be acquired again to continue processing.

Exception and policies:

- STOP: The exception is re-raised as the Stream will be stopped\n  and the end user will deal with it\n\n- STOP_ENGINE: The exception is re-raised as the Engine will be stopped\n  (all Streams and Producer) and the end user will deal with it\n\n- RESTART: The exception is not re-raised as the Stream\n  will recover and continue the processing. The logger.exception\n  from __call__ will record that something went wrong\n\n- STOP_APPLICATION: The exception is not re-raised as the entire\n  application will be stopped. This is only useful when using kstreams\n  with another library like FastAPI. The logger.exception\n  from __call__ will record that something went wrong\n

Parameters:

Name Type Description Default exc Exception

Any Exception that causes the Stream to crash

required

Raises:

Type Description exc

Exception if the policy is STOP or STOP_ENGINE

Source code in kstreams/middleware/middleware.py
async def cleanup_policy(self, exc: Exception) -> None:\n    \"\"\"\n    Execute clenup policicy according to the Stream configuration.\n\n    At this point we are inside the asyncio.Lock `is_processing`\n    as an event is being processed and an exeption has occured.\n    The Lock must be released to stop the Stream\n    (which must happen for any policy), then before re-raising\n    the exception the Lock must be acquire again to continue the processing\n\n    Exception and policies:\n\n        - STOP: The exception is re-raised as the Stream will be stopped\n          and the end user will deal with it\n\n        - STOP_ENGINE: The exception is re-raised as the Engine will be stopped\n          (all Streams and Producer) and the end user will deal with it\n\n        - RESTART: The exception is not re-raised as the Stream\n          will recover and continue the processing. The logger.exception\n          from __call__ will record that something went wrong\n\n        - STOP_APPLICATION: The exception is not re-raised as the entire\n          application will be stopped. This is only useful when using kstreams\n          with another library like FastAPI. The logger.exception\n          from __call__ will record that something went wrong\n\n    Args:\n        exc (Exception): Any Exception that causes the Stream to crash\n\n    Raises:\n        exc: Exception is the policy is `STOP` or `STOP_ENGINE`\n    \"\"\"\n    self.stream.is_processing.release()\n\n    if self.error_policy == StreamErrorPolicy.RESTART:\n        await self.stream.stop()\n        await self.stream.start()\n    elif self.error_policy == StreamErrorPolicy.STOP:\n        await self.stream.stop()\n        # acquire `is_processing` Lock again to resume processing\n        # and avoid `RuntimeError: Lock is not acquired.`\n        await self.stream.is_processing.acquire()\n        raise exc\n    elif self.error_policy == StreamErrorPolicy.STOP_ENGINE:\n        await self.engine.stop()\n        # acquire `is_processing` Lock again to resume processing\n        # and avoid `RuntimeError: Lock is not acquired.`\n        await self.stream.is_processing.acquire()\n        raise exc\n    else:\n        # STOP_APPLICATION\n        await self.engine.stop()\n        await self.stream.is_processing.acquire()\n        signal.raise_signal(signal.SIGTERM)\n
"},{"location":"middleware/#middleware-chain","title":"Middleware chain","text":"

It is possible to add as many middlewares as you want in order to split and reuse business logic; the downside is extra complexity and the code might become slower. The middleware order is important, as they are evaluated in the order in which they were placed in the stream.

In the following example we are adding three middlewares in this order: DLQMiddleware, ElasticMiddleware, and S3Middleware. The execution chain will be:

sequenceDiagram\n    autonumber\n    ExceptionMiddleware->>DLQMiddleware: \n    Note left of ExceptionMiddleware: Event received\n    alt No Processing Error\n    DLQMiddleware->>ElasticMiddleware: \n    Note right of ElasticMiddleware: Store CR on Elastic\n    ElasticMiddleware->>S3Middleware: \n    Note right of S3Middleware: Store CR on S3\n    S3Middleware->>Stream: \n    Note right of Stream: CR processed\n    Stream-->>S3Middleware: \n    S3Middleware-->>ElasticMiddleware: \n    ElasticMiddleware-->>DLQMiddleware: \n    DLQMiddleware-->>ExceptionMiddleware: \n    end
Multiple middlewares example
from kstreams import ConsumerRecord, Stream, middleware\n\n\nclass DLQMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        try:\n            return await self.next_call(cr)\n        except ValueError:\n            await dlq(cr.value)\n\n\nclass ElasticMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        await save_to_elastic(cr.value)\n        return await self.next_call(cr)\n\n\nclass S3Middleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        await backup_to_s3(cr.value)\n        return await self.next_call(cr)\n\n\nmiddlewares = [\n    middleware.Middleware(DLQMiddleware),\n    middleware.Middleware(ElasticMiddleware),\n    middleware.Middleware(S3Middleware),\n]\n\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    if cr.value == event_2:\n        raise ValueError(\"Error from stream...\")\n    await save_to_db(cr.value)\n

Note

In the example we can see that the cr will always be saved into elastic and s3, regardless of whether an error occurs

"},{"location":"middleware/#executing-code-after-the-cr-was-processed","title":"Executing Code after the CR was processed","text":"

As mentioned in the introduction, it is possible to execute code after the CR is handled. To do this, we need to place code after next_call is called:

Execute code after CR is handled
from kstreams import ConsumerRecord, Stream, middleware\n\n\nclass DLQMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        try:\n            return await self.next_call(cr)\n        except ValueError:\n            await dlq(cr.value)\n\n\nclass ElasticMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        result = await self.next_call(cr)\n        # This will be called after the whole chain has finished\n        await save_to_elastic(cr.value)\n        return result\n\n\nmiddlewares = [\n    middleware.Middleware(DLQMiddleware),\n    middleware.Middleware(ElasticMiddleware),\n]\n\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    if cr.value == event_2:\n        raise ValueError(\"Error from stream...\")\n    await save_to_db(cr.value)\n

Note

In the example we can see that the event is saved to elastic only if there is no error

"},{"location":"middleware/#deserialization","title":"Deserialization","text":"

To deserialize bytes into a different structure like a dict, middlewares are the preferred way to do it. Examples:

Source code in examples/dataclasses-avroschema-example/dataclasses_avroschema_example/middlewares.py
class AvroDeserializerMiddleware(middleware.BaseMiddleware):\n    def __init__(self, *, model: AvroModel, **kwargs) -> None:\n        super().__init__(**kwargs)\n        self.model = model\n\n    async def __call__(self, cr: ConsumerRecord):\n        \"\"\"\n        Deserialize a payload to an AvroModel\n        \"\"\"\n        if cr.value is not None:\n            data = self.model.deserialize(cr.value)\n            cr.value = data\n        return await self.next_call(cr)\n
Source code in examples/confluent-example/confluent_example/middlewares.py
class ConfluentMiddlewareDeserializer(\n    middleware.BaseMiddleware, AsyncAvroMessageSerializer\n):\n    def __init__(\n        self,\n        *,\n        schema_registry_client: AsyncSchemaRegistryClient,\n        reader_schema: Optional[schema.AvroSchema] = None,\n        return_record_name: bool = False,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        self.schemaregistry_client = schema_registry_client\n        self.reader_schema = reader_schema\n        self.return_record_name = return_record_name\n        self.id_to_decoder_func: Dict = {}\n        self.id_to_writers: Dict = {}\n\n    async def __call__(self, cr: ConsumerRecord):\n        \"\"\"\n        Deserialize the event to a dict\n        \"\"\"\n        data = await self.decode_message(cr.value)\n        cr.value = data\n        return await self.next_call(cr)\n
"},{"location":"monitoring/","title":"Monitoring","text":"

This page discusses how to monitor your application using the Kafka metrics that are accessible in Prometheus.

Before we begin, it's crucial to note that Kafka itself makes a number of useful metrics available, covering the cluster, brokers, and clients (producers and consumers).

This means that we can quickly add some graphs to our dashboards by utilizing the already-exposed metrics.

Kstreams includes a collection of metrics. See Metrics Docs for more information.

"},{"location":"monitoring/#kstreams.PrometheusMonitor","title":"kstreams.PrometheusMonitor","text":"

Metrics monitor to keep track of Producers and Consumers.

Attributes: metrics_scrape_time (float): Number of seconds that the monitor will wait until the next scrape iteration

Source code in kstreams/prometheus/monitor.py
class PrometheusMonitor:\n    \"\"\"\n    Metrics monitor to keep track of Producers and Consumers.\n\n     Attributes:\n        metrics_scrape_time float: Amount of seconds that the monitor\n            will wait until next scrape iteration\n    \"\"\"\n\n    # Producer metrics\n    MET_OFFSETS = Gauge(\n        \"topic_partition_offsets\", \"help producer offsets\", [\"topic\", \"partition\"]\n    )\n\n    # Consumer metrics\n    MET_COMMITTED = Gauge(\n        \"consumer_committed\",\n        \"help consumer committed\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_POSITION = Gauge(\n        \"consumer_position\",\n        \"help consumer position\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_HIGHWATER = Gauge(\n        \"consumer_highwater\",\n        \"help consumer highwater\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_LAG = Gauge(\n        \"consumer_lag\",\n        \"help consumer lag calculated using the last commited offset\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_POSITION_LAG = Gauge(\n        \"position_lag\",\n        \"help consumer position lag calculated using the consumer position\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n\n    def __init__(self, metrics_scrape_time: float = 3):\n        self.metrics_scrape_time = metrics_scrape_time\n        self.running = False\n        self._producer = None\n        self._streams: List[Stream] = []\n\n    async def start(self) -> None:\n        self.running = True\n        logger.info(\"Starting Prometheus Monitoring started...\")\n        await self._metrics_task()\n\n    async def stop(self) -> None:\n        self.running = False\n        self._clean_consumer_metrics()\n        logger.info(\"Prometheus Monitoring stopped...\")\n\n    def add_topic_partition_offset(\n        self, topic: str, partition: int, offset: int\n    ) -> None:\n        self.MET_OFFSETS.labels(topic=topic, partition=partition).set(offset)\n\n    def _add_consumer_metrics(self, metrics_dict: MetricsType):\n        for topic_partition, partitions_metadata in metrics_dict.items():\n            group_id = partitions_metadata[\"group_id\"]\n            position = partitions_metadata[\"position\"]\n            committed = partitions_metadata[\"committed\"]\n            highwater = partitions_metadata[\"highwater\"]\n            lag = partitions_metadata[\"lag\"]\n            position_lag = partitions_metadata[\"position_lag\"]\n\n            self.MET_COMMITTED.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(committed or 0)\n            self.MET_POSITION.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(position or -1)\n            self.MET_HIGHWATER.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(highwater or 0)\n            self.MET_LAG.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(lag or 0)\n            self.MET_POSITION_LAG.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n            
    consumer_group=group_id,\n            ).set(position_lag or 0)\n\n    def _clean_consumer_metrics(self) -> None:\n        \"\"\"\n        This method should be called when a rebalance takes place\n        to clean all consumers metrics. When the rebalance finishes\n        new metrics will be generated per consumer based on the\n        consumer assigments\n        \"\"\"\n        self.MET_LAG.clear()\n        self.MET_POSITION_LAG.clear()\n        self.MET_COMMITTED.clear()\n        self.MET_POSITION.clear()\n        self.MET_HIGHWATER.clear()\n\n    def clean_stream_consumer_metrics(self, consumer: Consumer) -> None:\n        topic_partitions = consumer.assignment()\n        group_id = consumer._group_id\n        for topic_partition in topic_partitions:\n            topic = topic_partition.topic\n            partition = topic_partition.partition\n\n            metrics_found = False\n            for sample in list(self.MET_LAG.collect())[0].samples:\n                if {\n                    \"topic\": topic,\n                    \"partition\": str(partition),\n                    \"consumer_group\": group_id,\n                } == sample.labels:\n                    metrics_found = True\n\n            if metrics_found:\n                self.MET_LAG.remove(topic, partition, group_id)\n                self.MET_POSITION_LAG.remove(topic, partition, group_id)\n                self.MET_COMMITTED.remove(topic, partition, group_id)\n                self.MET_POSITION.remove(topic, partition, group_id)\n                self.MET_HIGHWATER.remove(topic, partition, group_id)\n            else:\n                logger.debug(\n                    \"Metrics for consumer with group-id: \"\n                    f\"{consumer._group_id} not found\"\n                )\n\n    def add_producer(self, producer):\n        self._producer = producer\n\n    def add_streams(self, streams):\n        self._streams = streams\n\n    async def generate_consumer_metrics(self, consumer: Consumer):\n        \"\"\"\n        Generate Consumer Metrics for Prometheus\n\n        Format:\n            {\n                \"topic-1\": {\n                    \"1\": (\n                        [topic-1, partition-number, 'group-id-1'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                    \"2\": (\n                        [topic-1, partition-number, 'group-id-1'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                },\n                ...\n                \"topic-n\": {\n                    \"1\": (\n                        [topic-n, partition-number, 'group-id-n'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                    \"2\": (\n                        [topic-n, partition-number, 'group-id-n'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                }\n            }\n        \"\"\"\n        metrics: MetricsType = DefaultDict(dict)\n\n        topic_partitions = consumer.assignment()\n\n        for topic_partition in topic_partitions:\n            committed = await consumer.committed(topic_partition) or 0\n            position = await consumer.position(topic_partition)\n            highwater = consumer.highwater(topic_partition)\n\n            lag = position_lag = None\n            if highwater:\n                lag = highwater - committed\n                position_lag = highwater - 
position\n\n            metrics[topic_partition] = {\n                \"group_id\": consumer._group_id,\n                \"committed\": committed,\n                \"position\": position,\n                \"highwater\": highwater,\n                \"lag\": lag,\n                \"position_lag\": position_lag,\n            }\n\n        self._add_consumer_metrics(metrics)\n\n    async def _metrics_task(self) -> None:\n        \"\"\"\n        Task that runs in `backgroud` to generate\n        consumer metrics.\n\n        When self.running is False the task will finish and it\n        will be safe to stop consumers and producers.\n        \"\"\"\n        while self.running:\n            await asyncio.sleep(self.metrics_scrape_time)\n            for stream in self._streams:\n                if stream.consumer is not None:\n                    try:\n                        await self.generate_consumer_metrics(stream.consumer)\n                    except RuntimeError:\n                        logger.debug(\n                            f\"Metrics for stream {stream.name} can not be generated \"\n                            \"probably because it has been removed\"\n                        )\n
"},{"location":"monitoring/#kstreams.PrometheusMonitor.generate_consumer_metrics","title":"generate_consumer_metrics(consumer) async","text":"

Generate Consumer Metrics for Prometheus

Format

{ \"topic-1\": { \"1\": ( [topic-1, partition-number, 'group-id-1'], committed, position, highwater, lag, position_lag ) \"2\": ( [topic-1, partition-number, 'group-id-1'], committed, position, highwater, lag, position_lag ) }, ... \"topic-n\": { \"1\": ( [topic-n, partition-number, 'group-id-n'], committed, position, highwater, lag, position_lag ) \"2\": ( [topic-n, partition-number, 'group-id-n'], committed, position, highwater, lag, position_lag ) } }

Source code in kstreams/prometheus/monitor.py
async def generate_consumer_metrics(self, consumer: Consumer):\n    \"\"\"\n    Generate Consumer Metrics for Prometheus\n\n    Format:\n        {\n            \"topic-1\": {\n                \"1\": (\n                    [topic-1, partition-number, 'group-id-1'],\n                    committed, position, highwater, lag, position_lag\n                )\n                \"2\": (\n                    [topic-1, partition-number, 'group-id-1'],\n                    committed, position, highwater, lag, position_lag\n                )\n            },\n            ...\n            \"topic-n\": {\n                \"1\": (\n                    [topic-n, partition-number, 'group-id-n'],\n                    committed, position, highwater, lag, position_lag\n                )\n                \"2\": (\n                    [topic-n, partition-number, 'group-id-n'],\n                    committed, position, highwater, lag, position_lag\n                )\n            }\n        }\n    \"\"\"\n    metrics: MetricsType = DefaultDict(dict)\n\n    topic_partitions = consumer.assignment()\n\n    for topic_partition in topic_partitions:\n        committed = await consumer.committed(topic_partition) or 0\n        position = await consumer.position(topic_partition)\n        highwater = consumer.highwater(topic_partition)\n\n        lag = position_lag = None\n        if highwater:\n            lag = highwater - committed\n            position_lag = highwater - position\n\n        metrics[topic_partition] = {\n            \"group_id\": consumer._group_id,\n            \"committed\": committed,\n            \"position\": position,\n            \"highwater\": highwater,\n            \"lag\": lag,\n            \"position_lag\": position_lag,\n        }\n\n    self._add_consumer_metrics(metrics)\n
"},{"location":"monitoring/#consumer-metrics","title":"Consumer Metrics","text":"

We advise including the consumer_lag in your application's grafana dashboard.

consumer_lag will show you how far your consumers are lagging behind the published events in the topic they are reading. For instance, if you have a single consumer and another team is producing millions of events, the consumer might not be able to handle them in time (where \"in time\" is defined by you, e.g. \"a message should be consumed within an hour of being received\").

Based on the lag, you will have to develop your own alerts; for example, an alert could be pushed to Slack if the lag exceeds a particular threshold.

You will require your consumer_group name in order to design a basic dashboard using the consumer_lag.

We could add a query in Grafana like this:

sum(kafka_consumer_group_ConsumerLagMetrics_Value{topic =~ \"YOUR_OWN_TOPIC_NAME\", groupId =~\"YOUR_CONSUMER_GROUP\", name=\"SumOffsetLag\"}) by (topic)\n

Remember to replace YOUR_CONSUMER_GROUP and YOUR_OWN_TOPIC_NAME with your consumer_group and topic respectively \u2b06\ufe0f

"},{"location":"monitoring/#producer-metrics","title":"Producer Metrics","text":"

If you have producers, it's a good idea to monitor the growth of Log End Offset (LEO).

The increase in LEO indicates the number of events produced in the last N minutes.

If you know that events should be produced every N minutes, you can trigger an alert when none occur, since this metric tells you whether or not events were produced.

We could add a query in Grafana like this, where N is 10m:

sum(max(increase(kafka_log_Log_Value{name=\"LogEndOffset\", topic =~ \"TOPIC_NAME\"}[10m])) by (partition, topic)) by (topic)\n

Remember to modify TOPIC_NAME to the name of the topic you want to track \u2b06\ufe0f

"},{"location":"monitoring/#custom-business-metrics","title":"Custom Business Metrics","text":"

One benefit of Prometheus is that you can design your own custom metrics.

Scenario: Consider an event-based ordering system. Assume you receive X orders daily and ship Y orders daily. Most likely, you will create a dashboard using this data.

Fortunately, we can create our own custom metrics by using the Prometheus Python client.

You can construct a variety of metrics with prometheus:

  • Gauge
  • Counter
  • Histogram
  • Summary

You can read more about it in prometheus metric_types website.

In our scenario, we will most likely want a Counter for orders received and a Counter for orders shipped.

from prometheus_client import Counter\nfrom kstreams import PrometheusMonitor\n\nclass MyAppPrometheusMonitor(PrometheusMonitor):\n    def __init__(self):\n        super().__init__() # initialize kstream metrics\n        self.orders_received = Counter('orders_received', 'Amount of orders received')\n        self.orders_shipped = Counter('orders_shipped', 'Amount of orders shipped')\n\n    def increase_received(self, amount: int = 1):\n        self.orders_received.inc(amount)\n\n    def increase_shipped(self, amount: int = 1):\n        self.orders_shipped.inc(amount)\n

In our kstreams app, we can:

stream_engine = create_engine(title=\"my-engine\", monitor=MyAppPrometheusMonitor())\n\n@stream_engine.stream(\"my-special-orders\")\nasync def consume_orders_received(cr: ConsumerRecord):\n    if cr.value.status == \"NEW\":\n        stream_engine.monitor.increase_received()\n    elif cr.value.status == \"SHIPPED\":\n        stream_engine.monitor.increase_shipped()\n

Your app's prometheus would display this data, which you might utilize to build a stylish \u2728dashboard\u2728 interface.

For further details, see the Prometheus python client documentation.

"},{"location":"serialization/","title":"Serialization","text":"

Kafka's job is to move bytes from producer to consumers, through a topic.

By default, this is what kstreams does.

from kstreams import Stream\n\nfrom .streams_roster import stream_roster\n\nmy_stream = Stream(\n    \"local--hello-world\",\n    func=stream_roster,\n    config={\n        \"group_id\": \"example-group\",\n    },\n)\n

As you can see the ConsumerRecord's value is bytes.
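
For illustration, a handler consuming from such a stream receives raw bytes; a minimal sketch (my_handler is a hypothetical name, not the stream_roster used above):

from kstreams import ConsumerRecord\n\n\nasync def my_handler(cr: ConsumerRecord) -> None:\n    # without a deserializer or middleware, cr.value arrives as raw bytes\n    if cr.value is not None:\n        assert isinstance(cr.value, bytes)\n        print(cr.value.decode())\n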

In order to keep your code pythonic, we provide a mechanism to serialize/deserialize these bytes, into something more useful. This way, you can work with other data structures, like a dict or dataclasses.

Sometimes it is easier to work with a dict in your app, give it to kstreams, and let it transform it into bytes to be delivered to Kafka. For this situation, you need to implement kstreams.serializers.Serializer.

The other situation is when you consume from Kafka (or other brokers). Instead of dealing with bytes, you may want your function to receive a dict ready to be used. For those cases, we need to use a middleware. For example, we can implement a JsonDeserializerMiddleware:

import json\n\nfrom kstreams import middleware, ConsumerRecord\n\n\nclass JsonDeserializerMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        if cr.value is not None:\n            data = json.loads(cr.value.decode())\n            cr.value = data\n        return await self.next_call(cr)\n

It is also possible to use kstreams.serializers.Deserializer for deserialization, but this will be deprecated

Warning

kstreams.serializers.Deserializer will be deprecated, use middlewares instead

"},{"location":"serialization/#kstreams.serializers.Serializer","title":"kstreams.serializers.Serializer","text":"

Protocol used by the Stream to serialize.

A Protocol is similar to other languages features like an interface or a trait.

End users should provide their own class implementing this protocol.

For example a JsonSerializer

from typing import Optional, Dict\nimport json\n\nclass JsonSerializer:\n\n    async def serialize(\n        self,\n        payload: dict,\n        headers: Optional[Dict[str, str]] = None,\n        serializer_kwargs: Optional[Dict] = None,\n    ) -> bytes:\n        \"\"\"Return UTF-8 encoded payload\"\"\"\n        value = json.dumps(payload)\n        return value.encode()\n

Notice that you don't need to inherit anything, you just have to comply with the Protocol.

Source code in kstreams/serializers.py
class Serializer(Protocol):\n    \"\"\"Protocol used by the Stream to serialize.\n\n    A Protocol is similar to other languages features like an interface or a trait.\n\n    End users should provide their own class implementing this protocol.\n\n    For example a `JsonSerializer`\n\n    ```python\n    from typing import Optional, Dict\n    import json\n\n    class JsonSerializer:\n\n        async def serialize(\n            self,\n            payload: dict,\n            headers: Optional[Dict[str, str]] = None,\n            serializer_kwargs: Optional[Dict] = None,\n        ) -> bytes:\n            \\\"\"\"Return UTF-8 encoded payload\\\"\"\"\n            value = json.dumps(payload)\n            return value.encode()\n    ```\n\n    Notice that you don't need to inherit anything,\n    you just have to comply with the Protocol.\n    \"\"\"\n\n    async def serialize(\n        self,\n        payload: Any,\n        headers: Optional[Headers] = None,\n        serializer_kwargs: Optional[Dict] = None,\n    ) -> bytes:\n        \"\"\"\n        Implement this method to deserialize the data received from the topic.\n        \"\"\"\n        ...\n
"},{"location":"serialization/#kstreams.serializers.Serializer.serialize","title":"serialize(payload, headers=None, serializer_kwargs=None) async","text":"

Implement this method to deserialize the data received from the topic.

Source code in kstreams/serializers.py
async def serialize(\n    self,\n    payload: Any,\n    headers: Optional[Headers] = None,\n    serializer_kwargs: Optional[Dict] = None,\n) -> bytes:\n    \"\"\"\n    Implement this method to deserialize the data received from the topic.\n    \"\"\"\n    ...\n
"},{"location":"serialization/#kstreams.serializers.Deserializer","title":"kstreams.serializers.Deserializer","text":"

Protocol used by the Stream to deserialize.

A Protocol is similar to other languages features like an interface or a trait.

End users should provide their own class implementing this protocol.

For example a JsonDeserializer

import json\nfrom kstreams import ConsumerRecord\n\nclass JsonDeserializer:\n\n    async def deserialize(\n        self, consumer_record: ConsumerRecord, **kwargs\n    ) -> ConsumerRecord:\n        data = json.loads(consumer_record.value.decode())\n        consumer_record.value = data\n        return consumer_record\n
Source code in kstreams/serializers.py
class Deserializer(Protocol):\n    \"\"\"Protocol used by the Stream to deserialize.\n\n    A Protocol is similar to other languages features like an interface or a trait.\n\n    End users should provide their own class implementing this protocol.\n\n    For example a `JsonDeserializer`\n\n    ```python\n    import json\n    from kstreams import ConsumerRecord\n\n    class JsonDeserializer:\n\n        async def deserialize(\n            self, consumer_record: ConsumerRecord, **kwargs\n        ) -> ConsumerRecord:\n            data = json.loads(consumer_record.value.decode())\n            consumer_record.value = data\n            return consumer_record\n    ```\n    \"\"\"\n\n    async def deserialize(\n        self, consumer_record: ConsumerRecord, **kwargs\n    ) -> ConsumerRecord:\n        \"\"\"\n        Implement this method to deserialize the data received from the topic.\n        \"\"\"\n        ...\n
"},{"location":"serialization/#kstreams.serializers.Deserializer.deserialize","title":"deserialize(consumer_record, **kwargs) async","text":"

Implement this method to deserialize the data received from the topic.

Source code in kstreams/serializers.py
async def deserialize(\n    self, consumer_record: ConsumerRecord, **kwargs\n) -> ConsumerRecord:\n    \"\"\"\n    Implement this method to deserialize the data received from the topic.\n    \"\"\"\n    ...\n
"},{"location":"serialization/#usage","title":"Usage","text":"

Once you have written your serializer or deserializer, there are two ways of using them: in a generic fashion or per stream.

"},{"location":"serialization/#initialize-the-engine-with-your-serializers","title":"Initialize the engine with your serializers","text":"

By doing this all the streams will use these serializers by default.

stream_engine = create_engine(\n    title=\"my-stream-engine\",\n    serializer=JsonSerializer(),\n)\n
"},{"location":"serialization/#initilize-streams-with-a-deserializer-and-produce-events-with-serializers","title":"Initilize streams with a deserializer and produce events with serializers","text":"
from kstreams import middleware, ConsumerRecord\n\n\n@stream_engine.stream(topic, middlewares=[middleware.Middleware(JsonDeserializerMiddleware)])\nasync def hello_stream(cr: ConsumerRecord):\n    # remember event.value is now a dict\n    print(cr.value[\"message\"])\n    save_to_db(cr)\n
await stream_engine.send(\n    topic,\n    value={\"message\": \"test\"},\n    headers={\"content-type\": consts.APPLICATION_JSON},\n    key=\"1\",\n)\n
"},{"location":"stream/","title":"Streams","text":"

A Stream in kstreams is an extension of AIOKafkaConsumer

Consuming can be done using kstreams.Stream. You only need to decorate a coroutine with @stream_engine.stream. The decorator has the same aiokafka consumer API at initialization; in other words, it accepts the same args and kwargs that the aiokafka consumer accepts.

"},{"location":"stream/#kstreams.streams.Stream","title":"kstreams.streams.Stream","text":"

Attributes:

Name Type Description name Optional[str]

Stream name. Default is a generated uuid4

topics List[str]

List of topics to consume

subscribe_by_pattern bool

Whether to subscribe to topics by pattern

backend Kafka

backend kstreams.backends.kafka.Kafka: Backend to connect. Default Kafka

func Callable[[Stream], Awaitable[Any]]

Coroutine function or generator to be called when an event arrives

config Dict[str, Any]

Stream configuration. Here all the properties can be passed in the dictionary

deserializer Deserializer

Deserializer to be used when an event is consumed

initial_offsets List[TopicPartitionOffset]

List of TopicPartitionOffset that will seek the initial offsets to

rebalance_listener RebalanceListener

Listener callbacks for when partitions are assigned or revoked

"},{"location":"stream/#kstreams.streams.Stream--subscribe-to-a-topic","title":"Subscribe to a topic","text":"

Example

import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\"local--kstreams\", group_id=\"my-group-id\")\nasync def stream(cr: ConsumerRecord) -> None:\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def start():\n    await stream_engine.start()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(\n        start(),\n        stop_on_unhandled_errors=True,\n        shutdown_callback=shutdown\n    )\n
"},{"location":"stream/#kstreams.streams.Stream--subscribe-to-multiple-topics","title":"Subscribe to multiple topics","text":"

Consuming from multiple topics using one stream is possible. A List[str] of topics must be provided.

Example

import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\n    [\"local--kstreams\", \"local--hello-world\"],\n    group_id=\"my-group-id\",\n)\nasync def consume(cr: ConsumerRecord) -> None:\n    print(f\"Event from {cr.topic}: headers: {cr.headers}, payload: {cr.value}\")\n
"},{"location":"stream/#kstreams.streams.Stream--subscribe-to-topics-by-pattern","title":"Subscribe to topics by pattern","text":"

In the following example the stream will subscribe to any topic that matches the regex ^dev--customer-.*, for example dev--customer-invoice or dev--customer-profile. The subscribe_by_pattern flag must be set to True.

Example

import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\n    topics=\"^dev--customer-.*$\",\n    subscribe_by_pattern=True,\n    group_id=\"my-group-id\",\n)\nasync def stream(cr: ConsumerRecord) -> None:\n    if cr.topic == \"dev--customer-invoice\":\n        print(\"Event from topic dev--customer-invoice\")\n    elif cr.topic == \"dev--customer-profile\":\n        print(\"Event from topic dev--customer-profile\")\n    else:\n        raise ValueError(f\"Invalid topic {cr.topic}\")\n\n\nasync def start():\n    await stream_engine.start()\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(\n        start(),\n        stop_on_unhandled_errors=True,\n        shutdown_callback=shutdown\n    )\n
"},{"location":"stream/#dependency-injection","title":"Dependency Injection","text":"

The old way to iterate over a stream is with the async for _ in stream loop. The iterable approach works, but in most cases end users are interested only in the ConsumerRecord; for this reason it is possible to remove the async for loop using proper type hints. The available type hints are:

  • ConsumerRecord: The aiokafka ConsumerRecord that will be received every time that a new event is in the Stream
  • Stream: The Stream object that is subscribed to the topic/s. Useful when manual commit is enabled or when other Stream operations are needed
  • Send: Coroutine to produce events. The same as stream_engine.send(...)

If you use type hints, then every time a new event is in the stream, the coroutine function defined by the end user will be awaited with the specified types:

ConsumerRecord | ConsumerRecord and Stream | ConsumerRecord, Stream and Send | Old fashion
@stream_engine.stream(topic)\nasync def my_stream(cr: ConsumerRecord):\n    print(cr.value)\n
@stream_engine.stream(topic, enable_auto_commit=False)\nasync def my_stream(cr: ConsumerRecord, stream: Stream):\n    print(cr.value)\n    await stream.commit()\n
@stream_engine.stream(topic, enable_auto_commit=False)\nasync def my_stream(cr: ConsumerRecord, stream: Stream, send: Send):\n    print(cr.value)\n    await stream.commit()\n    await send(\"sink-to-elastic-topic\", value=cr.value)\n
@stream_engine.stream(topic)\nasync def consume(stream):  # you can specify the type but it will be the same result\n    async for cr in stream:\n        print(cr.value)\n        # you can do something with the stream as well!!\n

Note

The type arguments can be in any order. This might change in the future.

Warning

It is still possible to use the async for in loop, but it might be removed in the future. Migrate to the typing approach

"},{"location":"stream/#creating-a-stream-instance","title":"Creating a Stream instance","text":"

If for any reason you need to create Streams instances directly, you can do it without using the decorator stream_engine.stream.

Stream instance
import aiorun\nfrom kstreams import create_engine, Stream, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\nclass MyDeserializer:\n\n    async def deserialize(self, consumer_record: ConsumerRecord, **kwargs):\n        return consumer_record.value.decode()\n\n\nasync def stream(cr: ConsumerRecord) -> None:\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nstream = Stream(\n    \"local--kstreams\",\n    name=\"my-stream\",\n    func=stream,  # coroutine or async generator\n    deserializer=MyDeserializer(),\n)\n# add the stream to the engine\nstream_engine.add_stream(stream)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n
"},{"location":"stream/#removing-a-stream-from-the-engine","title":"Removing a stream from the engine","text":"Removing stream
stream_engine.remove_stream(stream)\n
"},{"location":"stream/#starting-the-stream-with-initial-offsets","title":"Starting the stream with initial offsets","text":"

If you want to start your consumption from certain offsets, you can include that in your stream instantiation.

Use case: This feature is useful if one wants to manage their own offsets, rather than committing consumed offsets to Kafka. When an application manages its own offsets and tries to start a stream, we start the stream using the initial offsets as defined in the database.

If you try to seek on a partition or topic that is not assigned to your stream, the code will ignore the seek and print out a warning. For example, if you have two consumers that are consuming from different partitions, and you try to seek for all of the partitions on each consumer, each consumer will seek for the partitions it has been assigned, and it will print out a warning log for the ones it was not assigned.

If you try to seek on offsets that are not yet present on your partition, the consumer will revert to the auto_offset_reset config. There will not be a warning, so be aware of this.

Also be aware that when your application restarts, it most likely will trigger the initial_offsets again. This means that setting initial_offsets to a hardcoded number might not give the results you expect.

Initial Offsets from Database
from kstreams import Stream, structs\n\n\ntopic_name = \"local--kstreams\"\ndb_table = ExampleDatabase()\ninitial_offset = structs.TopicPartitionOffset(topic=topic_name, partition=0, offset=db_table.offset)\n\n\nasync def my_stream(stream: Stream):\n    ...\n\n\nstream = Stream(\n    topic_name,\n    name=\"my-stream\",\n    func=my_stream,  # coroutine or async generator\n    deserializer=MyDeserializer(),\n    initial_offsets=[initial_offset],\n)\n
"},{"location":"stream/#stream-crashing","title":"Stream crashing","text":"

If your stream crashes for any reason, event consumption is stopped, meaning that no events will be consumed from the topic. However, it is possible to set different error policies per stream:

  • StreamErrorPolicy.STOP (default): Stop the Stream when an exception occurs. The exception is raised after the stream is properly stopped.
  • StreamErrorPolicy.RESTART: Stop and restart the Stream when an exception occurs. The event that caused the exception is skipped. The exception is NOT raised because the application should continue working; however, logger.exception() is used to alert the user.
  • StreamErrorPolicy.STOP_ENGINE: Stop the StreamEngine when an exception occurs. The exception is raised after ALL the Streams were properly stopped.
  • StreamErrorPolicy.STOP_APPLICATION: Stop the StreamEngine when an exception occurs and raise signal.SIGTERM. Useful when using kstreams with other libraries such as FastAPI.

In the following example, the StreamErrorPolicy.RESTART error policy is specified. If the Stream crashes with a ValueError exception, it is restarted:

from kstreams import create_engine, ConsumerRecord\nfrom kstreams.stream_utils import StreamErrorPolicy\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\n    \"local--hello-world\",\n    group_id=\"example-group\",\n    error_policy=StreamErrorPolicy.RESTART\n)\nasync def stream(cr: ConsumerRecord) -> None:\n    if cr.key == b\"error\":\n        # Stream will be restarted after the ValueError is raised\n        raise ValueError(\"error....\")\n\n    print(f\"Event consumed. Payload {cr.value}\")\n

We can see the logs:

ValueError: error....\nINFO:aiokafka.consumer.group_coordinator:LeaveGroup request succeeded\nINFO:aiokafka.consumer.consumer:Unsubscribed all topics or patterns and assigned partitions\nINFO:kstreams.streams:Stream consuming from topics ['local--hello-world'] has stopped!!! \n\n\nINFO:kstreams.middleware.middleware:Restarting stream <kstreams.streams.Stream object at 0x102d44050>\nINFO:aiokafka.consumer.subscription_state:Updating subscribed topics to: frozenset({'local--hello-world'})\n...\nINFO:aiokafka.consumer.group_coordinator:Setting newly assigned partitions {TopicPartition(topic='local--hello-world', partition=0)} for group example-group\n

Note

If you are using aiorun with stop_on_unhandled_errors=True and the error_policy is StreamErrorPolicy.RESTART then the application will NOT stop as the exception that caused the Stream to crash is not raised

"},{"location":"stream/#changing-consumer-behavior","title":"Changing consumer behavior","text":"

Most of the time you will only set the topic and the group_id for the consumer, but sometimes you might want more control over it, for example changing the policy for resetting offsets on OffsetOutOfRange errors or the session timeout. To do this, you have to use the same kwargs as the aiokafka consumer API:

# The consumer session times out after 500 ms without heartbeats\n# On OffsetOutOfRange errors, the offset will move to the oldest available message (\u2018earliest\u2019)\n\n@stream_engine.stream(\"local--kstream\", group_id=\"de-my-partition\", session_timeout_ms=500, auto_offset_reset=\"earliest\")\nasync def stream(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n
"},{"location":"stream/#manual-commit","title":"Manual commit","text":"

When processing more sensitive data, and you want to be sure that the kafka offset is committed once you have done your tasks, you can use the enable_auto_commit=False mode of the Consumer.

Manual commit example
@stream_engine.stream(\"local--kstream\", group_id=\"de-my-partition\", enable_auto_commit=False)\nasync def stream(cr: ConsumerRecord, stream: Stream):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n    # We need to make sure that the payload was stored before committing the kafka offset\n    await store_in_database(cr.value)\n    await stream.commit()  # You need to commit!!!\n

Note

This is a tradeoff from at-most-once to at-least-once delivery; to achieve exactly-once you will need to save offsets in the destination database and validate those yourself.
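
A rough sketch of that idea, assuming hypothetical get_stored_offset and store_in_database helpers that persist the payload together with its topic, partition and offset in a single transaction:

from kstreams import ConsumerRecord, Stream\n\n\n@stream_engine.stream(\"local--kstream\", group_id=\"de-my-partition\", enable_auto_commit=False)\nasync def stream(cr: ConsumerRecord, stream: Stream):\n    # skip events whose offset was already stored (e.g. redelivered after a rebalance)\n    last_offset = await get_stored_offset(cr.topic, cr.partition)\n    if last_offset is not None and cr.offset <= last_offset:\n        return\n\n    # store the payload and the offset atomically, then commit to Kafka\n    await store_in_database(cr.value, topic=cr.topic, partition=cr.partition, offset=cr.offset)\n    await stream.commit()\n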

"},{"location":"stream/#yield-from-stream","title":"Yield from stream","text":"

Sometimes it is useful to yield values from a stream so you can consume events at your own pace, or because you want to return results to the frontend (SSE example). If you use the yield keyword inside a coroutine, it will be \"transformed\" into an asynchronous generator function, meaning that inside there is an async generator that can be consumed.

Consuming an async generator is simple: you just use the async for clause. Because consuming events only happens inside the for loop, you have to make sure that the Stream has been started properly before entering it, and properly stopped after leaving the async for.

To facilitate this, we provide a context manager that takes care of the starting/stopping process.

Yield example
# Create your stream\n@stream_engine.stream(\"local--kstream\")\nasync def stream(cr: ConsumerRecord, stream: Stream):\n    yield cr.value\n\n\n# Consume the stream:\nasync with stream as stream_flow:  # Use the context manager\n    async for value in stream_flow:\n        ...\n        # do something with value (cr.value)\n

Note

If for some reason you interrupt the \"async for\" in the async generator, the Stream will stop consuming events, meaning that the lag will increase.

Note

Yield from a stream only works with the typing approach

"},{"location":"stream/#get-many","title":"Get many","text":"

Get a batch of events from the assigned TopicPartition.

Prefetched events are returned in batches by topic-partition. If messages are not available in the prefetched buffer, this method waits up to timeout_ms milliseconds.

Attributes:

Name Type Description partitions List[TopicPartition] | None

The partitions to fetch messages from. If no partition is specified, then all subscribed partitions will be used

timeout_ms int | None

milliseconds spent waiting if data is not available in the buffer. If 0, returns immediately with any records that are available currently in the buffer, else returns empty. Must not be negative.

max_records int | None

The number of records to fetch. If timeout_ms was defined and reached, and the fetched records have not reached max_records, then it returns immediately with any records that are currently available in the buffer

Returns:

Type Description Dict[TopicPartition, List[ConsumerRecord]]

Topic to list of records

Example

@stream_engine.stream(topic, ...)\nasync def stream(stream: Stream):\n    while True:\n        data = await stream.getmany(max_records=5)\n        print(data)\n
Source code in kstreams/streams.py
async def getmany(\n    self,\n    partitions: typing.Optional[typing.List[TopicPartition]] = None,\n    timeout_ms: int = 0,\n    max_records: typing.Optional[int] = None,\n) -> typing.Dict[TopicPartition, typing.List[ConsumerRecord]]:\n    \"\"\"\n    Get a batch of events from the assigned TopicPartition.\n\n    Prefetched events are returned in batches by topic-partition.\n    If messages is not available in the prefetched buffer this method waits\n    `timeout_ms` milliseconds.\n\n    Attributes:\n        partitions List[TopicPartition] | None: The partitions that need\n            fetching message. If no one partition specified then all\n            subscribed partitions will be used\n        timeout_ms int | None: milliseconds spent waiting if\n            data is not available in the buffer. If 0, returns immediately\n            with any records that are available currently in the buffer,\n            else returns empty. Must not be negative.\n        max_records int | None: The amount of records to fetch.\n            if `timeout_ms` was defined and reached and the fetched records\n            has not reach `max_records` then returns immediately\n            with any records that are available currently in the buffer\n\n    Returns:\n        Topic to list of records\n\n    !!! Example\n        ```python\n        @stream_engine.stream(topic, ...)\n        async def stream(stream: Stream):\n            while True:\n                data = await stream.getmany(max_records=5)\n                print(data)\n        ```\n    \"\"\"\n    partitions = partitions or []\n    return await self.consumer.getmany(  # type: ignore\n        *partitions, timeout_ms=timeout_ms, max_records=max_records\n    )\n

Warning

This approach does not work with Dependency Injection.

"},{"location":"stream/#rebalance-listener","title":"Rebalance Listener","text":"

For some cases you will need a RebalanceListener, so that when partitions are assigned to or revoked from the stream, different actions can be performed.

"},{"location":"stream/#use-cases","title":"Use cases","text":"
  • Cleanup or custom state save on the start of a rebalance operation
  • Saving offsets in a custom store when a partition is revoked
  • Load state or warm up a cache on completion of a successful partition re-assignment (a sketch of a custom listener follows this list).
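
A minimal sketch of a custom listener covering these use cases. It assumes that RebalanceListener and TopicPartition can be imported from kstreams and that the listener is attached through the rebalance_listener argument listed in the Stream attributes; the bodies only print what a real implementation would act on:

import typing\n\nfrom kstreams import ConsumerRecord, RebalanceListener, TopicPartition\n\n\nclass MyRebalanceListener(RebalanceListener):\n    async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n        # e.g. save offsets in a custom store before the partitions are taken away\n        print(f\"Partitions revoked from {self.stream}: {revoked}\")\n\n    async def on_partitions_assigned(self, assigned: typing.Set[TopicPartition]) -> None:\n        # e.g. warm up a cache for the newly assigned partitions\n        print(f\"Partitions assigned to {self.stream}: {assigned}\")\n\n\n@stream_engine.stream(\n    \"local--kstreams\",\n    group_id=\"my-group-id\",\n    rebalance_listener=MyRebalanceListener(),\n)\nasync def processor(cr: ConsumerRecord) -> None:\n    print(cr.value)\n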
"},{"location":"stream/#metrics-rebalance-listener","title":"Metrics Rebalance Listener","text":"

Kstreams uses a default listener for all the streams to clean the metrics after a rebalance takes place.

"},{"location":"stream/#kstreams.MetricsRebalanceListener","title":"kstreams.MetricsRebalanceListener","text":"Source code in kstreams/rebalance_listener.py
class MetricsRebalanceListener(RebalanceListener):\n    async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n        \"\"\"\n        Coroutine to be called *before* a rebalance operation starts and\n        *after* the consumer stops fetching data.\n\n        This will method will clean up the `Prometheus` metrics\n\n        Attributes:\n            revoked Set[TopicPartitions]: Partitions that were assigned\n                to the consumer on the last rebalance\n        \"\"\"\n        # lock all asyncio Tasks so no new metrics will be added to the Monitor\n        if revoked and self.engine is not None:\n            async with asyncio.Lock():\n                if self.stream is not None and self.stream.consumer is not None:\n                    self.engine.monitor.clean_stream_consumer_metrics(\n                        self.stream.consumer\n                    )\n\n    async def on_partitions_assigned(\n        self, assigned: typing.Set[TopicPartition]\n    ) -> None:\n        \"\"\"\n        Coroutine to be called *after* partition re-assignment completes\n        and *before* the consumer starts fetching data again.\n\n        This method will start the `Prometheus` metrics\n\n        Attributes:\n            assigned Set[TopicPartition]: Partitions assigned to the\n                consumer (may include partitions that were previously assigned)\n        \"\"\"\n        # lock all asyncio Tasks so no new metrics will be added to the Monitor\n        if assigned and self.engine is not None:\n            async with asyncio.Lock():\n                if self.stream is not None:\n                    self.stream.seek_to_initial_offsets()\n
"},{"location":"stream/#kstreams.MetricsRebalanceListener.on_partitions_assigned","title":"on_partitions_assigned(assigned) async","text":"

Coroutine to be called after partition re-assignment completes and before the consumer starts fetching data again.

This method will start the Prometheus metrics

Attributes:

Name Type Description assigned Set[TopicPartition]

Partitions assigned to the consumer (may include partitions that were previously assigned)

Source code in kstreams/rebalance_listener.py
async def on_partitions_assigned(\n    self, assigned: typing.Set[TopicPartition]\n) -> None:\n    \"\"\"\n    Coroutine to be called *after* partition re-assignment completes\n    and *before* the consumer starts fetching data again.\n\n    This method will start the `Prometheus` metrics\n\n    Attributes:\n        assigned Set[TopicPartition]: Partitions assigned to the\n            consumer (may include partitions that were previously assigned)\n    \"\"\"\n    # lock all asyncio Tasks so no new metrics will be added to the Monitor\n    if assigned and self.engine is not None:\n        async with asyncio.Lock():\n            if self.stream is not None:\n                self.stream.seek_to_initial_offsets()\n
"},{"location":"stream/#kstreams.MetricsRebalanceListener.on_partitions_revoked","title":"on_partitions_revoked(revoked) async","text":"

Coroutine to be called before a rebalance operation starts and after the consumer stops fetching data.

This method will clean up the Prometheus metrics

Attributes:

Name Type Description revoked Set[TopicPartitions]

Partitions that were assigned to the consumer on the last rebalance

Source code in kstreams/rebalance_listener.py
async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n    \"\"\"\n    Coroutine to be called *before* a rebalance operation starts and\n    *after* the consumer stops fetching data.\n\n    This will method will clean up the `Prometheus` metrics\n\n    Attributes:\n        revoked Set[TopicPartitions]: Partitions that were assigned\n            to the consumer on the last rebalance\n    \"\"\"\n    # lock all asyncio Tasks so no new metrics will be added to the Monitor\n    if revoked and self.engine is not None:\n        async with asyncio.Lock():\n            if self.stream is not None and self.stream.consumer is not None:\n                self.engine.monitor.clean_stream_consumer_metrics(\n                    self.stream.consumer\n                )\n
"},{"location":"stream/#manual-commit_1","title":"Manual Commit","text":"

If manual commit is enabled, you might want to use the ManualCommitRebalanceListener. This rebalance listener will call commit before the stream partitions are revoked, to avoid the error CommitFailedError and duplicate message delivery after a rebalance. See the code example with manual commit

Note

ManualCommitRebalanceListener also includes the MetricsRebalanceListener functionality.
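For illustration, a minimal sketch of wiring this listener into a stream with manual commit could look like the following (the engine instance, topic name and the assumption that consumer kwargs such as enable_auto_commit are forwarded to the underlying consumer are not part of the official example):

from kstreams import ConsumerRecord, ManualCommitRebalanceListener, Stream

# `stream_engine` is assumed to exist, e.g. created with kstreams.create_engine(...)
@stream_engine.stream(
    "local--kstreams",
    group_id="my-group",
    enable_auto_commit=False,  # manual commit; kwarg assumed to be passed to the consumer
    rebalance_listener=ManualCommitRebalanceListener(),
)
async def my_stream(cr: ConsumerRecord, stream: Stream):
    ...  # process the event
    await stream.commit()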

"},{"location":"stream/#kstreams.ManualCommitRebalanceListener","title":"kstreams.ManualCommitRebalanceListener","text":"Source code in kstreams/rebalance_listener.py
class ManualCommitRebalanceListener(MetricsRebalanceListener):\n    async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n        \"\"\"\n        Coroutine to be called *before* a rebalance operation starts and\n        *after* the consumer stops fetching data.\n\n        If manual commit is enabled, `commit` is called before the consumers\n        partitions are revoked to prevent the error `CommitFailedError`\n        and duplicate message delivery after a rebalance.\n\n        Attributes:\n            revoked Set[TopicPartitions]: Partitions that were assigned\n                to the consumer on the last rebalance\n        \"\"\"\n        if (\n            revoked\n            and self.stream is not None\n            and self.stream.consumer is not None\n            and not self.stream.consumer._enable_auto_commit\n        ):\n            logger.info(\n                f\"Manual commit enabled for stream {self.stream}. \"\n                \"Performing `commit` before revoking partitions\"\n            )\n            async with asyncio.Lock():\n                await self.stream.commit()\n\n            await super().on_partitions_revoked(revoked=revoked)\n
"},{"location":"stream/#kstreams.ManualCommitRebalanceListener.on_partitions_revoked","title":"on_partitions_revoked(revoked) async","text":"

Coroutine to be called before a rebalance operation starts and after the consumer stops fetching data.

If manual commit is enabled, commit is called before the consumers partitions are revoked to prevent the error CommitFailedError and duplicate message delivery after a rebalance.

Attributes:

Name Type Description revoked Set[TopicPartitions]

Partitions that were assigned to the consumer on the last rebalance

Source code in kstreams/rebalance_listener.py
async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n    \"\"\"\n    Coroutine to be called *before* a rebalance operation starts and\n    *after* the consumer stops fetching data.\n\n    If manual commit is enabled, `commit` is called before the consumers\n    partitions are revoked to prevent the error `CommitFailedError`\n    and duplicate message delivery after a rebalance.\n\n    Attributes:\n        revoked Set[TopicPartitions]: Partitions that were assigned\n            to the consumer on the last rebalance\n    \"\"\"\n    if (\n        revoked\n        and self.stream is not None\n        and self.stream.consumer is not None\n        and not self.stream.consumer._enable_auto_commit\n    ):\n        logger.info(\n            f\"Manual commit enabled for stream {self.stream}. \"\n            \"Performing `commit` before revoking partitions\"\n        )\n        async with asyncio.Lock():\n            await self.stream.commit()\n\n        await super().on_partitions_revoked(revoked=revoked)\n
"},{"location":"stream/#custom-rebalance-listener","title":"Custom Rebalance Listener","text":"

If you want to define a custom RebalanceListener, it has to inherit from kstreams.RebalanceListener.

Note

It is also possible to inherit from ManualCommitRebalanceListener and MetricsRebalanceListener
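For example, a sketch of such a subclass (the offset store helper below is purely hypothetical) can keep the built-in commit-before-revocation behaviour by delegating to super():

import typing

from kstreams import ManualCommitRebalanceListener, TopicPartition


class MyRebalanceListener(ManualCommitRebalanceListener):
    async def on_partitions_revoked(
        self, revoked: typing.Set[TopicPartition]
    ) -> None:
        # hypothetical helper: persist the revoked partitions/offsets in a custom store
        await save_offsets_to_custom_store(revoked)

        # keep the manual commit and metrics cleanup from the parent classes
        await super().on_partitions_revoked(revoked=revoked)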

"},{"location":"stream/#kstreams.RebalanceListener","title":"kstreams.RebalanceListener","text":"

A callback interface that the user can implement to trigger custom actions when partitions are assigned to or revoked from the Stream.

Example

from kstreams import RebalanceListener, TopicPartition\nfrom .resource import stream_engine\n\n\nclass MyRebalanceListener(RebalanceListener):\n\n    async def on_partitions_revoked(\n        self, revoked: Set[TopicPartition]\n    ) -> None:\n        # Do something with the revoked partitions\n        # or with the Stream\n        print(self.stream)\n\n    async def on_partitions_assigned(\n        self, assigned: Set[TopicPartition]\n    ) -> None:\n        # Do something with the assigned partitions\n        # or with the Stream\n        print(self.stream)\n\n\n@stream_engine.stream(topic, rebalance_listener=MyRebalanceListener())\nasync def my_stream(stream: Stream):\n    async for event in stream:\n        ...\n
Source code in kstreams/rebalance_listener.py
class RebalanceListener(ConsumerRebalanceListener):\n    \"\"\"\n    A callback interface that the user can implement to trigger custom actions\n    when the set of partitions are assigned or revoked to the `Stream`.\n\n    !!! Example\n        ```python\n        from kstreams import RebalanceListener, TopicPartition\n        from .resource import stream_engine\n\n\n        class MyRebalanceListener(RebalanceListener):\n\n            async def on_partitions_revoked(\n                self, revoked: Set[TopicPartition]\n            ) -> None:\n                # Do something with the revoked partitions\n                # or with the Stream\n                print(self.stream)\n\n            async def on_partitions_assigned(\n                self, assigned: Set[TopicPartition]\n            ) -> None:\n                # Do something with the assigned partitions\n                # or with the Stream\n                print(self.stream)\n\n\n        @stream_engine.stream(topic, rebalance_listener=MyRebalanceListener())\n        async def my_stream(stream: Stream):\n            async for event in stream:\n                ...\n        ```\n    \"\"\"\n\n    def __init__(self) -> None:\n        self.stream: typing.Optional[\"Stream\"] = None\n        # engine added so it can react on rebalance events\n        self.engine: typing.Optional[\"StreamEngine\"] = None\n\n    async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n        \"\"\"\n        Coroutine to be called *before* a rebalance operation starts and\n        *after* the consumer stops fetching data.\n\n        If you are using manual commit you have to commit all consumed offsets\n        here, to avoid duplicate message delivery after rebalance is finished.\n\n        Use cases:\n            - cleanup or custom state save on the start of a rebalance operation\n            - saving offsets in a custom store\n\n        Attributes:\n            revoked Set[TopicPartitions]: Partitions that were assigned\n                to the consumer on the last rebalance\n\n        !!! note\n            The `Stream` is available using `self.stream`\n        \"\"\"\n        ...  # pragma: no cover\n\n    async def on_partitions_assigned(\n        self, assigned: typing.Set[TopicPartition]\n    ) -> None:\n        \"\"\"\n        Coroutine to be called *after* partition re-assignment completes\n        and *before* the consumer starts fetching data again.\n\n        It is guaranteed that all the processes in a consumer group will\n        execute their `on_partitions_revoked` callback before any instance\n        executes its `on_partitions_assigned` callback.\n\n        Use cases:\n            - Load a state or cache warmup on completion of a successful\n            partition re-assignment.\n\n        Attributes:\n            assigned Set[TopicPartition]: Partitions assigned to the\n                consumer (may include partitions that were previously assigned)\n\n        !!! note\n            The `Stream` is available using `self.stream`\n        \"\"\"\n        ...  # pragma: no cover\n
"},{"location":"stream/#kstreams.RebalanceListener.on_partitions_assigned","title":"on_partitions_assigned(assigned) async","text":"

Coroutine to be called after partition re-assignment completes and before the consumer starts fetching data again.

It is guaranteed that all the processes in a consumer group will execute their on_partitions_revoked callback before any instance executes its on_partitions_assigned callback.

Use cases
  • Load a state or cache warmup on completion of a successful partition re-assignment.

Attributes:

Name Type Description assigned Set[TopicPartition]

Partitions assigned to the consumer (may include partitions that were previously assigned)

Note

The Stream is available using self.stream

Source code in kstreams/rebalance_listener.py
async def on_partitions_assigned(\n    self, assigned: typing.Set[TopicPartition]\n) -> None:\n    \"\"\"\n    Coroutine to be called *after* partition re-assignment completes\n    and *before* the consumer starts fetching data again.\n\n    It is guaranteed that all the processes in a consumer group will\n    execute their `on_partitions_revoked` callback before any instance\n    executes its `on_partitions_assigned` callback.\n\n    Use cases:\n        - Load a state or cache warmup on completion of a successful\n        partition re-assignment.\n\n    Attributes:\n        assigned Set[TopicPartition]: Partitions assigned to the\n            consumer (may include partitions that were previously assigned)\n\n    !!! note\n        The `Stream` is available using `self.stream`\n    \"\"\"\n    ...  # pragma: no cover\n
"},{"location":"stream/#kstreams.RebalanceListener.on_partitions_revoked","title":"on_partitions_revoked(revoked) async","text":"

Coroutine to be called before a rebalance operation starts and after the consumer stops fetching data.

If you are using manual commit you have to commit all consumed offsets here, to avoid duplicate message delivery after rebalance is finished.

Use cases
  • cleanup or custom state save on the start of a rebalance operation
  • saving offsets in a custom store

Attributes:

Name Type Description revoked Set[TopicPartitions]

Partitions that were assigned to the consumer on the last rebalance

Note

The Stream is available using self.stream

Source code in kstreams/rebalance_listener.py
async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n    \"\"\"\n    Coroutine to be called *before* a rebalance operation starts and\n    *after* the consumer stops fetching data.\n\n    If you are using manual commit you have to commit all consumed offsets\n    here, to avoid duplicate message delivery after rebalance is finished.\n\n    Use cases:\n        - cleanup or custom state save on the start of a rebalance operation\n        - saving offsets in a custom store\n\n    Attributes:\n        revoked Set[TopicPartitions]: Partitions that were assigned\n            to the consumer on the last rebalance\n\n    !!! note\n        The `Stream` is available using `self.stream`\n    \"\"\"\n    ...  # pragma: no cover\n
"},{"location":"test_client/","title":"Testing","text":"

To test streams and producers, or to perform e2e tests, you can make use of the test_utils.TestStreamClient.

The TestStreamClient aims to emulate the kafka behaviour as much as possible using asyncio.Queue. This is excellent because you can easily test your code without spinning up kafka, but it comes with some limitations. It is not possible to know beforehand how many topics exist, how many partitions per topic exist, the replication factor, current offsets, etc. So, the test client will create topics, partitions, assignments, etc. at runtime. Each Stream in your application will be assigned 3 partitions per topic by default (0, 1 and 2) in the test environment.

With the test client you can:

  • Send events so you won't need to mock the producer
  • Call the consumer code; the client will make sure that all the events are consumed before leaving the async context
"},{"location":"test_client/#using-teststreamclient","title":"Using TestStreamClient","text":"

Import TestStreamClient.

Create a TestStreamClient by passing the engine instance to it.

Create functions with a name that starts with test_ (this is the standard pytest convention).

Use the TestStreamClient object the same way as you do with the engine.

Write simple assert statements with the standard Python expressions that you need to check (again, standard pytest).

"},{"location":"test_client/#example","title":"Example","text":"

Let's assume that you have the following code example. The goal is to store all the consumed events in an EventStore for future analysis.

# example.py\nimport aiorun\nimport asyncio\nimport typing\nfrom dataclasses import dataclass, field\n\nfrom kstreams import ConsumerRecord, create_engine\nfrom kstreams.streams import Stream\n\ntopic = \"local--kstreams\"\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@dataclass\nclass EventStore:\n    \"\"\"\n    Store events in memory\n    \"\"\"\n    events: typing.List[ConsumerRecord] = field(default_factory=list)\n\n    def add(self, event: ConsumerRecord) -> None:\n        self.events.append(event)\n\n    @property\n    def total(self):\n        return len(self.events)\n\n\nevent_store = EventStore()\n\n\n@stream_engine.stream(topic, group_id=\"example-group\")\nasync def consume(cr: ConsumerRecord):\n    event_store.add(cr)\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for _ in range(5):\n        await stream_engine.send(topic, value=payload, key=\"1\")\n        await asyncio.sleep(2)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\ndef main():\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n

Then you could have a test_stream.py file to test the code; you need to instantiate the TestStreamClient with the engine:

# test_stream.py\nimport pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom example import stream_engine, event_store\n\nclient = TestStreamClient(stream_engine)\n\n\n@pytest.mark.asyncio\nasync def test_add_event_on_consume():\n    \"\"\"\n    Produce some events and check that the EventStore is updated.\n    \"\"\"\n    topic = \"local--kstreams\"  # Use the same topic as the stream\n    event = b'{\"message\": \"Hello world!\"}'\n\n    async with client:\n        metadata = await client.send(topic, value=event, key=\"1\")  # send the event with the test client\n        current_offset = metadata.offset\n        assert metadata.topic == topic\n\n        # send another event and check that the offset was incremented\n        metadata = await client.send(topic, value=b'{\"message\": \"Hello world!\"}', key=\"1\")\n        assert metadata.offset == current_offset + 1\n\n    # check that the event_store has 2 events stored\n    assert event_store.total == 2\n

Note

Notice that the produce coroutine is not used to send events in the test case. The TestStreamClient.send coroutine is used instead. This allows you to test streams without having producer code in your application.

"},{"location":"test_client/#testing-the-commit","title":"Testing the Commit","text":"

In some cases your stream will commit; in this situation, checking the committed partitions can be useful.

import pytest\nfrom kstreams.test_utils import TestStreamClient\nfrom kstreams import ConsumerRecord, Stream, TopicPartition\n\nfrom .example import produce, stream_engine\n\ntopic_name = \"local--kstreams-marcos\"\nvalue = b'{\"message\": \"Hello world!\"}'\nname = \"my-stream\"\nkey = \"1\"\npartition = 2\ntp = TopicPartition(\n    topic=topic_name,\n    partition=partition,\n)\ntotal_events = 10\n\n@stream_engine.stream(topic_name, name=name)\nasync def my_stream(cr: ConsumerRecord, stream: Stream):\n    # commit every time that an event arrives\n    await stream.commit({tp: cr.offset})\n\n\n# test the code\nclient = TestStreamClient(stream_engine)\n\n@pytest.mark.asyncio\nasync def test_consumer_commit(stream_engine: StreamEngine):\n    async with client:\n        for _ in range(0, total_events):\n            await client.send(topic_name, partition=partition, value=value, key=key)\n\n        # check that everything was commited\n        stream = stream_engine.get_stream(name)\n        assert (await stream.committed(tp)) == total_events\n
"},{"location":"test_client/#e2e-test","title":"E2E test","text":"

In the previous code example the application produces to and consumes from the same topic, so TestStreamClient.send is not needed because engine.send is already producing. In those situations you can just use your producer code and check that certain code was called.

# test_example.py\nimport pytest\nfrom unittest.mock import patch\n\nfrom kstreams.test_utils import TestStreamClient\n\nfrom .example import produce, stream_engine\n\nclient = TestStreamClient(stream_engine)\n\n\n@pytest.mark.asyncio\nasync def test_e2e_example():\n    \"\"\"\n    Test that events are produced by the engine and consumed by the streams\n    \"\"\"\n    with patch(\"example.on_consume\") as on_consume, patch(\"example.on_produce\") as on_produce:\n        async with client:\n            await produce()\n\n    assert on_produce.call_count == 5\n    assert on_consume.call_count == 5\n
"},{"location":"test_client/#producer-only","title":"Producer only","text":"

In some scenarios, your application will only produce events and other applications will consume them, but you want to make sure that the event was produced properly and that the topic contains that event.

# producer_example.py\nfrom kstreams import create_engine\nimport aiorun\nimport asyncio\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\nasync def produce(topic: str, value: bytes, key: str):\n    # This could be a complicated function or something like a FastAPI view\n    await stream_engine.send(topic, value=value, key=key)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\ndef main():\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n

Then you could have a test_producer_example.py file to test the code:

# test_producer_example.py\nimport pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom producer_example import stream_engine, produce\n\nclient = TestStreamClient(stream_engine)\n\n\n@pytest.mark.asyncio\nasync def test_event_produced():\n    topic_name = \"local--kstreams\"\n    value = b'{\"message\": \"Hello world!\"}'\n    key = \"1\"\n\n    async with client:\n        await produce(topic=topic_name ,value=value, key=key) # use the produce code to send events\n\n        # check that the event was placed in a topic in a proper way\n        consumer_record = await client.get_event(topic_name=topic_name)\n\n        assert consumer_record.value == value\n        assert consumer_record.key == key\n

Note

Even though the previous example uses a simple produce function, it shows what to do when the producer code is encapsulated in other functions, for example a FastAPI view. In that case you don't want to use client.send directly; just call the function that contains stream_engine.send(...)

"},{"location":"test_client/#defining-extra-topics","title":"Defining extra topics","text":"

For some use cases it is required to produce an event to a topic (target topic) after an event was consumed (source topic). We are in control of the source topic because it has a stream associated with it and we want to consume events from it; however, we might not be in control of the target topic.

How can we consume an event from the target topic, which has no stream associated with it and will only be created when a send is reached? The answer is to pre-define the extra topics before the test cycle starts. Let's take a look at an example:

Let's imagine that we have the following code:

from kstreams import ConsumerRecord\n\nfrom .engine import stream_engine\n\n\n@stream_engine.stream(\"source-topic\", name=name)\nasync def consume(cr: ConsumerRecord) -> None:\n    # do something, for example save to db\n    await save_to_db(cr)\n\n    # then produce the event to the `target topic`\n    await stream_engine.send(\"target-topic\", value=cr.value, key=cr.key, headers=cr.headers)\n

Here we can test two things:

  1. Sending an event to the source-topic and checking that the event has been consumed and saved to the DB
  2. Checking that the event was sent to the target-topic

Testing point 1 is straightforward:

import pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom .engine import stream_engine\n\n\nclient = TestStreamClient(stream_engine)\nvalue = b'{\"message\": \"Hello world!\"}'\nkey = \"my-key\"\n\nasync with client:\n    # produce to the topic that has a stream\n    await client.send(\"source-topic\", value=value, key=key)\n\n    # check that the event was saved to the DB\n    assert await db.get(...)\n

However, testing point 2 requires more effort, as the TestStreamClient is not aware of the target topic until it reaches the send inside the consume coroutine. If we try to get the target topic event inside the async with context we will get an error:

async with client:\n    # produce to the topic that has a stream\n    await client.send(\"source-topic\", value=value, key=key)\n\n    ...\n    # Let's check if it was received by the target topic\n    event = await client.get_event(topic_name=\"target-topic\")\n\n\nValueError: You might be trying to get the topic target-topic outside the `client async context` or trying to get an event from an empty topic target-topic. Make sure that the code is inside the async contextand the topic has events.\n

We could work around this with a delay (await asyncio.sleep(...)) inside the async with context to give the TestStreamClient time to create the topic; however, if the business logic inside the consume is slow we need to add more delay, and it then becomes a race condition.

To solve it properly, we can tell the TestStreamClient which extra topics we need during the test cycle.

import pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom .engine import stream_engine\n\n\n# tell the client to create the extra topics\nclient = TestStreamClient(stream_engine, topics=[\"target-topic\"])\nvalue = b'{\"message\": \"Hello world!\"}'\nkey = \"my-key\"\n\nasync with client:\n    # produce to the topic that has a stream\n    await client.send(\"source-topic\", value=value, key=key)\n\n    # check that the event was saved to the DB\n    assert await db.get(...)\n\n    # Let's check if it was received by the target topic\n    event = await client.get_event(topic_name=\"target-topic\")\n    assert event.value == value\n    assert event.key == key\n
"},{"location":"test_client/#topics-subscribed-by-pattern","title":"Topics subscribed by pattern","text":"

When a Stream uses pattern subscription it is not possible to know beforehand how many topics the Stream will consume from. To solve this problem, the topics must be pre-defined using the extra topics feature of the TestStreamClient:

In the following example we have a Stream that will consume from topics that match the regular expression ^dev--customer-.*$, for example dev--customer-invoice and dev--customer-profile.

# app.py\nfrom kstreams import ConsumerRecord, create_engine\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(topics=\"^dev--customer-.*$\", subscribe_by_pattern=True)\nasync def stream(cr: ConsumerRecord):\n    if cr.topic == customer_invoice_topic:\n        assert cr.value == invoice_event\n    elif cr.topic == customer_profile_topic:\n        assert cr.value == profile_event\n    else:\n        raise ValueError(f\"Invalid topic {cr.topic}\")\n

Then, to test our Stream, we need to pre-define the topics:

# test_stream.py\nimport asyncio\n\nimport pytest\nfrom kstreams.test_utils import TestStreamClient, TopicManager\n\nfrom app import stream_engine\n\n\n@pytest.mark.asyncio\nasync def test_consume_events_topics_by_pattern():\n    \"\"\"\n    This test shows the possibility to subscribe to multiple topics using a pattern\n    \"\"\"\n    customer_invoice_topic = \"dev--customer-invoice\"\n    customer_profile_topic = \"dev--customer-profile\"\n\n    client = TestStreamClient(\n        stream_engine, topics=[customer_invoice_topic, customer_profile_topic]\n    )\n\n    async with client:\n        await client.send(customer_invoice_topic, value=b\"invoice-1\", key=\"1\")\n        await client.send(customer_profile_topic, value=b\"profile-1\", key=\"1\")\n\n        # give some time to consume all the events\n        await asyncio.sleep(0.1)\n        assert TopicManager.all_messages_consumed()\n
"},{"location":"test_client/#disabling-monitoring-during-testing","title":"Disabling monitoring during testing","text":"

Monitoring streams and producers is vital for streaming applications, but it requires extra effort. Sometimes during testing, monitoring is not required as we only want to focus on testing the business logic. To disable monitoring during testing, use:

client = TestStreamClient(stream_engine, monitoring_enabled=False)\n
"},{"location":"utils/","title":"Utils","text":"

Utility functions

"},{"location":"utils/#kstreams.utils","title":"kstreams.utils","text":""},{"location":"utils/#kstreams.utils.create_ssl_context","title":"create_ssl_context(*, cafile=None, capath=None, cadata=None, certfile=None, keyfile=None, password=None, crlfile=None)","text":"

Wrapper of aiokafka.helpers.create_ssl_context with typehints.

Parameters:

Name Type Description Default cafile Optional[str]

Certificate Authority file path containing certificates used to sign broker certificates

None capath Optional[str]

Same as cafile, but points to a directory containing several CA certificates

None cadata Union[str, bytes, None]

Same as cafile, but instead contains already read data in either ASCII or bytes format

None certfile Optional[str]

optional filename of file in PEM format containing the client certificate, as well as any CA certificates needed to establish the certificate's authenticity

None keyfile Optional[str]

optional filename containing the client private key.

None password Optional[str]

optional password to be used when loading the certificate chain

None Source code in kstreams/utils.py
def create_ssl_context(\n    *,\n    cafile: Optional[str] = None,\n    capath: Optional[str] = None,\n    cadata: Union[str, bytes, None] = None,\n    certfile: Optional[str] = None,\n    keyfile: Optional[str] = None,\n    password: Optional[str] = None,\n    crlfile: Any = None,\n):\n    \"\"\"Wrapper of [aiokafka.helpers.create_ssl_context](\n        https://aiokafka.readthedocs.io/en/stable/api.html#helpers\n    )\n    with typehints.\n\n    Arguments:\n        cafile: Certificate Authority file path containing certificates\n            used to sign broker certificates\n        capath: Same as `cafile`, but points to a directory containing\n            several CA certificates\n        cadata: Same as `cafile`, but instead contains already\n            read data in either ASCII or bytes format\n        certfile: optional filename of file in PEM format containing\n            the client certificate, as well as any CA certificates needed to\n            establish the certificate's authenticity\n        keyfile: optional filename containing the client private key.\n        password: optional password to be used when loading the\n            certificate chain\n\n    \"\"\"\n    return aiokafka_create_ssl_context(\n        cafile=cafile,\n        capath=capath,\n        cadata=cadata,\n        certfile=certfile,\n        keyfile=keyfile,\n        password=password,\n        crlfile=crlfile,\n    )\n
"},{"location":"utils/#kstreams.utils.create_ssl_context_from_mem","title":"create_ssl_context_from_mem(*, certdata, keydata, password=None, cadata=None)","text":"

Create an SSL context from data in memory.

This makes it easy to read the certificates from environment variables; usually the data is loaded from env variables.

Parameters:

Name Type Description Default cadata Optional[str]

certificates used to sign broker certificates provided as unicode str

None certdata str

the client certificate, as well as any CA certificates needed to establish the certificate's authenticity provided as unicode str

required keydata str

the client private key provided as unicode str

required password Optional[str]

optional password to be used when loading the certificate chain

None Source code in kstreams/utils.py
def create_ssl_context_from_mem(\n    *,\n    certdata: str,\n    keydata: str,\n    password: Optional[str] = None,\n    cadata: Optional[str] = None,\n) -> Optional[ssl.SSLContext]:\n    \"\"\"Create a SSL context from data on memory.\n\n    This makes it easy to read the certificates from environmental variables\n    Usually the data is loaded from env variables.\n\n    Arguments:\n        cadata: certificates used to sign broker certificates provided as unicode str\n        certdata: the client certificate, as well as any CA certificates needed to\n            establish the certificate's authenticity provided as unicode str\n        keydata: the client private key provided as unicode str\n        password: optional password to be used when loading the\n            certificate chain\n    \"\"\"\n    with contextlib.ExitStack() as stack:\n        cert_file = stack.enter_context(NamedTemporaryFile(suffix=\".crt\"))\n        key_file = stack.enter_context(NamedTemporaryFile(suffix=\".key\"))\n\n        # expecting unicode data, writing it as bytes to files as utf-8\n        cert_file.write(certdata.encode(\"utf-8\"))\n        cert_file.flush()\n\n        key_file.write(keydata.encode(\"utf-8\"))\n        key_file.flush()\n\n        ssl_context = ssl.create_default_context(cadata=cadata)\n        ssl_context.load_cert_chain(\n            cert_file.name, keyfile=key_file.name, password=password\n        )\n        return ssl_context\n    return None\n
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Kstreams","text":"

kstreams is a library/micro framework to use with kafka. It has a simple kafka streams implementation that gives certain guarantees; see below.

"},{"location":"#requirements","title":"Requirements","text":"

python 3.8+

"},{"location":"#installation","title":"Installation","text":"
pip install kstreams\n

You will need a worker; we recommend aiorun

pip install aiorun\n
"},{"location":"#usage","title":"Usage","text":"
import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n@stream_engine.stream(\"local--kstreams\")\nasync def consume(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for i in range(5):\n        metadata = await stream_engine.send(\"local--kstreams\", value=payload)\n        print(f\"Message sent: {metadata}\")\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n
"},{"location":"#kafka-configuration","title":"Kafka configuration","text":"

Configure kafka using the kafka backend provided.

"},{"location":"#development","title":"Development","text":"

This repo requires the use of poetry instead of pip. Note: if you want to have the virtualenv in the same path as the project, you should first run poetry config --local virtualenvs.in-project true

To install the dependencies just execute:

poetry install\n

Then you can activate the virtualenv with

poetry shell\n

Run test:

./scripts/test\n

Run code linting (black and isort):

./scripts/lint\n
"},{"location":"#commit-messages","title":"Commit messages","text":"

The use of commitizen is recommended. Commitizen is part of the dev dependencies.

cz commit\n
"},{"location":"backends/","title":"Backends","text":"

The main idea of a backend is to supply the configuration needed to create a connection with the broker.

kstreams currently has support for Kafka as a backend.

"},{"location":"backends/#kstreams.backends.kafka.Kafka","title":"kstreams.backends.kafka.Kafka","text":"

The Kafka backend validates the given attributes.

It uses pydantic internally.

Attributes:

Name Type Description bootstrap_servers List[str]

kafka list of hostname:port

security_protocol SecurityProtocol

Protocol used to communicate with brokers

ssl_context Optional[SSLContext]

a python std ssl.SSLContext instance, you can generate it with create_ssl_context or create_ssl_context_from_mem

sasl_mechanism SaslMechanism

Authentication mechanism when security_protocol is configured for SASL_PLAINTEXT or SASL_SSL

sasl_plain_username Optional[str]

username for sasl PLAIN authentication

sasl_plain_password Optional[str]

password for sasl PLAIN authentication

sasl_oauth_token_provider Optional[str]

Token provider used with the OAUTHBEARER sasl_mechanism

Raises:

Type Description ValidationError

a pydantic.ValidationError exception

"},{"location":"backends/#kstreams.backends.kafka.Kafka--plaintext","title":"PLAINTEXT","text":"

Example

from kstreams.backends.kafka import Kafka\nfrom kstreams import create_engine, Stream\n\nbackend = Kafka(bootstrap_servers=[\"localhost:9092\"])\nstream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n
"},{"location":"backends/#kstreams.backends.kafka.Kafka--ssl","title":"SSL","text":"

Example

Create SSL context
import ssl\n\nfrom kstreams.backends.kafka import Kafka\nfrom kstreams import create_engine, utils, Stream\n\n\ndef get_ssl_context() -> ssl.SSLContext:\n    return utils.create_ssl_context(\n        cafile=\"certificate-authority-file-path\",\n        capath=\"points-to-directory-with-several-ca-certificates\",\n        cadata=\"same-as-cafile-but-ASCII-or-bytes-format\",\n        certfile=\"client-certificate-file-name\",\n        keyfile=\"client-private-key-file-name\",\n        password=\"password-to-load-certificate-chain\",\n    )\n\nbackend = Kafka(\n    bootstrap_servers=[\"localhost:9094\"],\n    security_protocol=\"SSL\",\n    ssl_context=get_ssl_context(),\n)\n\nstream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n

Note

Check create ssl context util

Example

Create SSL context from memory
import ssl\n\nfrom kstreams.backends.kafka import Kafka\nfrom kstreams import create_engine, utils, Stream\n\n\ndef get_ssl_context() -> ssl.SSLContext:\n    return utils.create_ssl_context_from_mem(\n        cadata=\"ca-certificates-as-unicode\",\n        certdata=\"client-certificate-as-unicode\",\n        keydata=\"client-private-key-as-unicode\",\n        password=\"optional-password-to-load-certificate-chain\",\n    )\n\nbackend = Kafka(\n    bootstrap_servers=[\"localhost:9094\"],\n    security_protocol=\"SSL\",\n    ssl_context=get_ssl_context(),\n)\n\nstream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n

Note

Check create ssl context from memory util
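The validator in the source below also covers the SASL protocols. A rough sketch for SASL_PLAINTEXT with the PLAIN mechanism (the broker address and credentials are placeholders; plain strings are assumed to coerce to the enums, as in the SSL examples above) could be:

from kstreams.backends.kafka import Kafka
from kstreams import create_engine

backend = Kafka(
    bootstrap_servers=["localhost:9093"],
    security_protocol="SASL_PLAINTEXT",
    sasl_mechanism="PLAIN",
    # both username and password are required for the PLAIN mechanism,
    # otherwise a ValidationError is raised
    sasl_plain_username="my-user",
    sasl_plain_password="my-password",
)

stream_engine = create_engine(title="my-stream-engine", backend=backend)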

Source code in kstreams/backends/kafka.py
class Kafka(BaseModel):\n    \"\"\"\n    The `Kafka` backend validates the given attributes.\n\n    It uses pydantic internally.\n\n    Attributes:\n        bootstrap_servers: kafka list of `hostname:port`\n        security_protocol: Protocol used to communicate with brokers\n        ssl_context: a python std `ssl.SSLContext` instance, you can generate\n            it with `create_ssl_context`\n            or `create_ssl_context_from_mem`\n        sasl_mechanism: Authentication mechanism when `security_protocol` is configured\n            for `SASL_PLAINTEXT` or `SASL_SSL`\n        sasl_plain_username: username for sasl PLAIN authentication\n        sasl_plain_password: password for sasl PLAIN authentication\n        sasl_oauth_token_provider: smth\n\n    Raises:\n        ValidationError: a `pydantic.ValidationError` exception\n\n    ## PLAINTEXT\n\n    !!! Example\n        ```python\n        from kstreams.backends.kafka import Kafka\n        from kstreams import create_engine, Stream\n\n        backend = Kafka(bootstrap_servers=[\"localhost:9092\"])\n        stream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n        ```\n\n    ## SSL\n\n    !!! Example\n        ```python title=\"Create SSL context\"\n        import ssl\n\n        from kstreams.backends.kafka import Kafka\n        from kstreams import create_engine, utils, Stream\n\n\n        def get_ssl_context() -> ssl.SSLContext:\n            return utils.create_ssl_context(\n                cafile=\"certificate-authority-file-path\",\n                capath=\"points-to-directory-with-several-ca-certificates\",\n                cadata=\"same-as-cafile-but-ASCII-or-bytes-format\",\n                certfile=\"client-certificate-file-name\",\n                keyfile=\"client-private-key-file-name\",\n                password=\"password-to-load-certificate-chain\",\n            )\n\n        backend = Kafka(\n            bootstrap_servers=[\"localhost:9094\"],\n            security_protocol=\"SSL\",\n            ssl_context=get_ssl_context(),\n        )\n\n        stream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n        ```\n\n        !!! note\n            Check [create ssl context util](https://kpn.github.io/kstreams/utils/#kstreams.utils.create_ssl_context)\n\n    !!! Example\n        ```python title=\"Create SSL context from memory\"\n        import ssl\n\n        from kstreams.backends.kafka import Kafka\n        from kstreams import create_engine, utils, Stream\n\n\n        def get_ssl_context() -> ssl.SSLContext:\n            return utils.create_ssl_context_from_mem(\n                cadata=\"ca-certificates-as-unicode\",\n                certdata=\"client-certificate-as-unicode\",\n                keydata=\"client-private-key-as-unicode\",\n                password=\"optional-password-to-load-certificate-chain\",\n            )\n\n        backend = Kafka(\n            bootstrap_servers=[\"localhost:9094\"],\n            security_protocol=\"SSL\",\n            ssl_context=get_ssl_context(),\n        )\n\n        stream_engine = create_engine(title=\"my-stream-engine\", backend=backend)\n        ```\n\n        !!! 
note\n            Check [create ssl context from memerory util](https://kpn.github.io/kstreams/utils/#kstreams.utils.create_ssl_context_from_mem)\n    \"\"\"\n\n    bootstrap_servers: List[str] = [\"localhost:9092\"]\n    security_protocol: SecurityProtocol = SecurityProtocol.PLAINTEXT\n\n    ssl_context: Optional[ssl.SSLContext] = None\n\n    sasl_mechanism: SaslMechanism = SaslMechanism.PLAIN\n    sasl_plain_username: Optional[str] = None\n    sasl_plain_password: Optional[str] = None\n    sasl_oauth_token_provider: Optional[str] = None\n    model_config = ConfigDict(arbitrary_types_allowed=True, use_enum_values=True)\n\n    @model_validator(mode=\"after\")\n    @classmethod\n    def protocols_validation(cls, values):\n        security_protocol = values.security_protocol\n\n        if security_protocol == SecurityProtocol.PLAINTEXT:\n            return values\n        elif security_protocol == SecurityProtocol.SSL:\n            if values.ssl_context is None:\n                raise ValueError(\"`ssl_context` is required\")\n            return values\n        elif security_protocol == SecurityProtocol.SASL_PLAINTEXT:\n            if values.sasl_mechanism is SaslMechanism.OAUTHBEARER:\n                # We don't perform a username and password check if OAUTHBEARER\n                return values\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_username is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_username` is required when using SASL_PLAIN\"\n                )\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_password is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_password` is required when using SASL_PLAIN\"\n                )\n            return values\n        elif security_protocol == SecurityProtocol.SASL_SSL:\n            if values.ssl_context is None:\n                raise ValueError(\"`ssl_context` is required\")\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_username is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_username` is required when using SASL_PLAIN\"\n                )\n            if (\n                values.sasl_mechanism is SaslMechanism.PLAIN\n                and values.sasl_plain_password is None\n            ):\n                raise ValueError(\n                    \"`sasl_plain_password` is required when using SASL_PLAIN\"\n                )\n            return values\n
"},{"location":"engine/","title":"StreamEngine","text":""},{"location":"engine/#kstreams.engine.StreamEngine","title":"kstreams.engine.StreamEngine","text":"

Attributes:

Name Type Description backend Kafka

Backend to connect to. Default Kafka

consumer_class Consumer

The consumer class to use when instantiating a consumer. Default kstreams.Consumer

producer_class Producer

The producer class to use when instantiating the producer. Default kstreams.Producer

monitor PrometheusMonitor

Prometheus monitor that holds the metrics

title str | None

Engine name

serializer Serializer | None

Serializer to use when an event is produced.

deserializer Deserializer | None

Deserializer to be used when an event is consumed. If provided, it will be used in all Stream instances as a general one. To override it, you can provide a different deserializer per Stream.

Example

Usage
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@kstreams.stream(\"local--hello-world\", group_id=\"example-group\")\nasync def consume(stream: kstreams.ConsumerRecord) -> None:\n    print(f\"showing bytes: {cr.value}\")\n\n\nawait stream_engine.start()\n
Source code in kstreams/engine.py
class StreamEngine:\n    \"\"\"\n    Attributes:\n        backend kstreams.backends.Kafka: Backend to connect. Default `Kafka`\n        consumer_class kstreams.Consumer: The consumer class to use when\n            instanciate a consumer. Default kstreams.Consumer\n        producer_class kstreams.Producer: The producer class to use when\n            instanciate the producer. Default kstreams.Producer\n        monitor kstreams.PrometheusMonitor: Prometheus monitor that holds\n            the [metrics](https://kpn.github.io/kstreams/metrics/)\n        title str | None: Engine name\n        serializer kstreams.serializers.Serializer | None: Serializer to\n            use when an event is produced.\n        deserializer kstreams.serializers.Deserializer | None: Deserializer\n            to be used when an event is consumed.\n            If provided it will be used in all Streams instances as a general one.\n            To override it per Stream, you can provide one per Stream\n\n    !!! Example\n        ```python title=\"Usage\"\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @kstreams.stream(\"local--hello-world\", group_id=\"example-group\")\n        async def consume(stream: kstreams.ConsumerRecord) -> None:\n            print(f\"showing bytes: {cr.value}\")\n\n\n        await stream_engine.start()\n        ```\n    \"\"\"\n\n    def __init__(\n        self,\n        *,\n        backend: Kafka,\n        consumer_class: typing.Type[Consumer],\n        producer_class: typing.Type[Producer],\n        monitor: PrometheusMonitor,\n        title: typing.Optional[str] = None,\n        deserializer: Deprecated[typing.Optional[Deserializer]] = None,\n        serializer: typing.Optional[Serializer] = None,\n        on_startup: typing.Optional[EngineHooks] = None,\n        on_stop: typing.Optional[EngineHooks] = None,\n        after_startup: typing.Optional[EngineHooks] = None,\n        after_stop: typing.Optional[EngineHooks] = None,\n    ) -> None:\n        self.title = title\n        self.backend = backend\n        self.consumer_class = consumer_class\n        self.producer_class = producer_class\n        self.deserializer = deserializer\n        self.serializer = serializer\n        self.monitor = monitor\n        self._producer: typing.Optional[typing.Type[Producer]] = None\n        self._streams: typing.List[Stream] = []\n        self._on_startup = [] if on_startup is None else list(on_startup)\n        self._on_stop = [] if on_stop is None else list(on_stop)\n        self._after_startup = [] if after_startup is None else list(after_startup)\n        self._after_stop = [] if after_stop is None else list(after_stop)\n\n    async def send(\n        self,\n        topic: str,\n        value: typing.Any = None,\n        key: typing.Any = None,\n        partition: typing.Optional[int] = None,\n        timestamp_ms: typing.Optional[int] = None,\n        headers: typing.Optional[Headers] = None,\n        serializer: typing.Optional[Serializer] = None,\n        serializer_kwargs: typing.Optional[typing.Dict] = None,\n    ):\n        \"\"\"\n        Attributes:\n            topic str: Topic name to send the event to\n            value Any: Event value\n            key str | None: Event key\n            partition int | None: Topic partition\n            timestamp_ms int | None: Event timestamp in miliseconds\n            headers Dict[str, str] | None: Event headers\n            serializer 
kstreams.serializers.Serializer | None: Serializer to\n                encode the event\n            serializer_kwargs Dict[str, Any] | None: Serializer kwargs\n        \"\"\"\n        if self._producer is None:\n            raise EngineNotStartedException()\n\n        serializer = serializer or self.serializer\n\n        # serialize only when value and serializer are present\n        if value is not None and serializer is not None:\n            value = await serializer.serialize(\n                value, headers=headers, serializer_kwargs=serializer_kwargs\n            )\n\n        encoded_headers = None\n        if headers is not None:\n            encoded_headers = encode_headers(headers)\n\n        fut = await self._producer.send(\n            topic,\n            value=value,\n            key=key,\n            partition=partition,\n            timestamp_ms=timestamp_ms,\n            headers=encoded_headers,\n        )\n        metadata: RecordMetadata = await fut\n        self.monitor.add_topic_partition_offset(\n            topic, metadata.partition, metadata.offset\n        )\n\n        return metadata\n\n    async def start(self) -> None:\n        # Execute on_startup hooks\n        await execute_hooks(self._on_startup)\n\n        # add the producer and streams to the Monitor\n        self.monitor.add_producer(self._producer)\n        self.monitor.add_streams(self._streams)\n\n        await self.start_producer()\n        await self.start_streams()\n\n        # Execute after_startup hooks\n        await execute_hooks(self._after_startup)\n\n    def on_startup(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run before the engine starts.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable before engine starts\n\n        !!! Example\n            ```python title=\"Engine before startup\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.on_startup\n            async def init_db() -> None:\n                print(\"Initializing Database Connections\")\n                await init_db()\n\n\n            @stream_engine.on_startup\n            async def start_background_task() -> None:\n                print(\"Some background task\")\n            ```\n        \"\"\"\n        self._on_startup.append(func)\n        return func\n\n    def on_stop(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run before the engine stops.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable before engine stops\n\n        !!! 
Example\n            ```python title=\"Engine before stops\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.on_stop\n            async def close_db() -> None:\n                print(\"Closing Database Connections\")\n                await db_close()\n            ```\n        \"\"\"\n        self._on_stop.append(func)\n        return func\n\n    def after_startup(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run after the engine starts.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable after engine starts\n\n        !!! Example\n            ```python title=\"Engine after startup\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.after_startup\n            async def after_startup() -> None:\n                print(\"Set pod as healthy\")\n                await mark_healthy_pod()\n            ```\n        \"\"\"\n        self._after_startup.append(func)\n        return func\n\n    def after_stop(\n        self,\n        func: typing.Callable[[], typing.Any],\n    ) -> typing.Callable[[], typing.Any]:\n        \"\"\"\n        A list of callables to run after the engine stops.\n        Handler are callables that do not take any arguments, and may be either\n        standard functions, or async functions.\n\n        Attributes:\n            func typing.Callable[[], typing.Any]: Func to callable after engine stops\n\n        !!! 
Example\n            ```python title=\"Engine after stops\"\n\n            import kstreams\n\n            stream_engine = kstreams.create_engine(\n                title=\"my-stream-engine\"\n            )\n\n            @stream_engine.after_stop\n            async def after_stop() -> None:\n                print(\"Finishing backgrpund tasks\")\n            ```\n        \"\"\"\n        self._after_stop.append(func)\n        return func\n\n    async def stop(self) -> None:\n        # Execute on_startup hooks\n        await execute_hooks(self._on_stop)\n\n        await self.monitor.stop()\n        await self.stop_producer()\n        await self.stop_streams()\n\n        # Execute after_startup hooks\n        await execute_hooks(self._after_stop)\n\n    async def stop_producer(self):\n        if self._producer is not None:\n            await self._producer.stop()\n        logger.info(\"Producer has STOPPED....\")\n\n    async def start_producer(self, **kwargs) -> None:\n        if self.producer_class is None:\n            return None\n        config = {**self.backend.model_dump(), **kwargs}\n        self._producer = self.producer_class(**config)\n        if self._producer is None:\n            return None\n        await self._producer.start()\n\n    async def start_streams(self) -> None:\n        # Only start the Streams that are not async_generators\n        streams = [\n            stream\n            for stream in self._streams\n            if not inspect.isasyncgenfunction(stream.func)\n        ]\n\n        await self._start_streams_on_background_mode(streams)\n\n    async def _start_streams_on_background_mode(\n        self, streams: typing.List[Stream]\n    ) -> None:\n        # start all the streams\n        for stream in streams:\n            asyncio.create_task(stream.start())\n\n        # start monitoring\n        asyncio.create_task(self.monitor.start())\n\n    async def stop_streams(self) -> None:\n        for stream in self._streams:\n            await stream.stop()\n        logger.info(\"Streams have STOPPED....\")\n\n    async def clean_streams(self):\n        await self.stop_streams()\n        self._streams = []\n\n    def exist_stream(self, name: str) -> bool:\n        stream = self.get_stream(name)\n        return True if stream is not None else False\n\n    def get_stream(self, name: str) -> typing.Optional[Stream]:\n        stream = next((stream for stream in self._streams if stream.name == name), None)\n\n        return stream\n\n    def add_stream(\n        self, stream: Stream, error_policy: typing.Optional[StreamErrorPolicy] = None\n    ) -> None:\n        \"\"\"\n        Add a stream to the engine.\n\n        This method registers a new stream with the engine, setting up necessary\n        configurations and handlers. 
If a stream with the same name already exists,\n        a DuplicateStreamException is raised.\n\n        Args:\n            stream: The stream to be added.\n            error_policy: An optional error policy to be applied to the stream.\n                You should probably set directly when instanciating a Stream, not here.\n\n        Raises:\n            DuplicateStreamException: If a stream with the same name already exists.\n\n        Notes:\n            - If the stream does not have a deserializer, the engine's deserializer\n              is assigned to it.\n            - If the stream does not have a rebalance listener, a default\n              MetricsRebalanceListener is assigned.\n            - The stream's UDF handler is set up with the provided function and\n              engine's send method.\n            - If the stream's UDF handler type is not NO_TYPING, a middleware stack\n              is built for the stream's function.\n        \"\"\"\n        if self.exist_stream(stream.name):\n            raise DuplicateStreamException(name=stream.name)\n\n        if error_policy is not None:\n            stream.error_policy = error_policy\n\n        stream.backend = self.backend\n        if stream.deserializer is None:\n            stream.deserializer = self.deserializer\n        self._streams.append(stream)\n\n        if stream.rebalance_listener is None:\n            # set the stream to the listener to it will be available\n            # when the callbacks are called\n            stream.rebalance_listener = MetricsRebalanceListener()\n\n        stream.rebalance_listener.stream = stream\n        stream.rebalance_listener.engine = self\n\n        stream.udf_handler = UdfHandler(\n            next_call=stream.func,\n            send=self.send,\n            stream=stream,\n        )\n\n        # NOTE: When `no typing` support is deprecated this check can\n        # be removed\n        if stream.udf_handler.type != UDFType.NO_TYPING:\n            stream.func = self._build_stream_middleware_stack(stream=stream)\n\n    def _build_stream_middleware_stack(self, *, stream: Stream) -> NextMiddlewareCall:\n        assert stream.udf_handler, \"UdfHandler can not be None\"\n\n        middlewares = stream.get_middlewares(self)\n        next_call = stream.udf_handler\n        for middleware, options in reversed(middlewares):\n            next_call = middleware(\n                next_call=next_call, send=self.send, stream=stream, **options\n            )\n        return next_call\n\n    async def remove_stream(self, stream: Stream) -> None:\n        consumer = stream.consumer\n        self._streams.remove(stream)\n        await stream.stop()\n\n        if consumer is not None:\n            self.monitor.clean_stream_consumer_metrics(consumer=consumer)\n\n    def stream(\n        self,\n        topics: typing.Union[typing.List[str], str],\n        *,\n        name: typing.Optional[str] = None,\n        deserializer: Deprecated[typing.Optional[Deserializer]] = None,\n        initial_offsets: typing.Optional[typing.List[TopicPartitionOffset]] = None,\n        rebalance_listener: typing.Optional[RebalanceListener] = None,\n        middlewares: typing.Optional[typing.List[Middleware]] = None,\n        subscribe_by_pattern: bool = False,\n        error_policy: StreamErrorPolicy = StreamErrorPolicy.STOP,\n        **kwargs,\n    ) -> typing.Callable[[StreamFunc], Stream]:\n        def decorator(func: StreamFunc) -> Stream:\n            stream_from_func = stream_func(\n                topics,\n                
name=name,\n                deserializer=deserializer,\n                initial_offsets=initial_offsets,\n                rebalance_listener=rebalance_listener,\n                middlewares=middlewares,\n                subscribe_by_pattern=subscribe_by_pattern,\n                **kwargs,\n            )(func)\n            self.add_stream(stream_from_func, error_policy=error_policy)\n\n            return stream_from_func\n\n        return decorator\n
"},{"location":"engine/#kstreams.engine.StreamEngine.send","title":"send(topic, value=None, key=None, partition=None, timestamp_ms=None, headers=None, serializer=None, serializer_kwargs=None) async","text":"

Attributes:

Name Type Description topic str

Topic name to send the event to

value Any

Event value

key str | None

Event key

partition int | None

Topic partition

timestamp_ms int | None

Event timestamp in milliseconds

headers Dict[str, str] | None

Event headers

serializer Serializer | None

Serializer to encode the event

serializer_kwargs Dict[str, Any] | None

Serializer kwargs

Source code in kstreams/engine.py
async def send(\n    self,\n    topic: str,\n    value: typing.Any = None,\n    key: typing.Any = None,\n    partition: typing.Optional[int] = None,\n    timestamp_ms: typing.Optional[int] = None,\n    headers: typing.Optional[Headers] = None,\n    serializer: typing.Optional[Serializer] = None,\n    serializer_kwargs: typing.Optional[typing.Dict] = None,\n):\n    \"\"\"\n    Attributes:\n        topic str: Topic name to send the event to\n        value Any: Event value\n        key str | None: Event key\n        partition int | None: Topic partition\n        timestamp_ms int | None: Event timestamp in miliseconds\n        headers Dict[str, str] | None: Event headers\n        serializer kstreams.serializers.Serializer | None: Serializer to\n            encode the event\n        serializer_kwargs Dict[str, Any] | None: Serializer kwargs\n    \"\"\"\n    if self._producer is None:\n        raise EngineNotStartedException()\n\n    serializer = serializer or self.serializer\n\n    # serialize only when value and serializer are present\n    if value is not None and serializer is not None:\n        value = await serializer.serialize(\n            value, headers=headers, serializer_kwargs=serializer_kwargs\n        )\n\n    encoded_headers = None\n    if headers is not None:\n        encoded_headers = encode_headers(headers)\n\n    fut = await self._producer.send(\n        topic,\n        value=value,\n        key=key,\n        partition=partition,\n        timestamp_ms=timestamp_ms,\n        headers=encoded_headers,\n    )\n    metadata: RecordMetadata = await fut\n    self.monitor.add_topic_partition_offset(\n        topic, metadata.partition, metadata.offset\n    )\n\n    return metadata\n
"},{"location":"engine/#kstreams.engine.StreamEngine.on_startup","title":"on_startup(func)","text":"

A list of callables to run before the engine starts. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Callable to run before the engine starts

Example

Engine before startup
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.on_startup\nasync def init_db() -> None:\n    print(\"Initializing Database Connections\")\n    await init_db()\n\n\n@stream_engine.on_startup\nasync def start_background_task() -> None:\n    print(\"Some background task\")\n
Source code in kstreams/engine.py
def on_startup(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run before the engine starts.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable before engine starts\n\n    !!! Example\n        ```python title=\"Engine before startup\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.on_startup\n        async def init_db() -> None:\n            print(\"Initializing Database Connections\")\n            await init_db()\n\n\n        @stream_engine.on_startup\n        async def start_background_task() -> None:\n            print(\"Some background task\")\n        ```\n    \"\"\"\n    self._on_startup.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.on_stop","title":"on_stop(func)","text":"

A list of callables to run before the engine stops. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Callable to run before the engine stops

Example

Engine before stops
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.on_stop\nasync def close_db() -> None:\n    print(\"Closing Database Connections\")\n    await db_close()\n
Source code in kstreams/engine.py
def on_stop(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run before the engine stops.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable before engine stops\n\n    !!! Example\n        ```python title=\"Engine before stops\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.on_stop\n        async def close_db() -> None:\n            print(\"Closing Database Connections\")\n            await db_close()\n        ```\n    \"\"\"\n    self._on_stop.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.after_startup","title":"after_startup(func)","text":"

A list of callables to run after the engine starts. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Callable to run after the engine starts

Example

Engine after startup
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.after_startup\nasync def after_startup() -> None:\n    print(\"Set pod as healthy\")\n    await mark_healthy_pod()\n
Source code in kstreams/engine.py
def after_startup(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run after the engine starts.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable after engine starts\n\n    !!! Example\n        ```python title=\"Engine after startup\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.after_startup\n        async def after_startup() -> None:\n            print(\"Set pod as healthy\")\n            await mark_healthy_pod()\n        ```\n    \"\"\"\n    self._after_startup.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.after_stop","title":"after_stop(func)","text":"

A list of callables to run after the engine stops. Handlers are callables that do not take any arguments and may be either standard functions or async functions.

Attributes:

Name Type Description func Callable[[], Any]

Callable to run after the engine stops

Example

Engine after stops
import kstreams\n\nstream_engine = kstreams.create_engine(\n    title=\"my-stream-engine\"\n)\n\n@stream_engine.after_stop\nasync def after_stop() -> None:\n    print(\"Finishing background tasks\")\n
Source code in kstreams/engine.py
def after_stop(\n    self,\n    func: typing.Callable[[], typing.Any],\n) -> typing.Callable[[], typing.Any]:\n    \"\"\"\n    A list of callables to run after the engine stops.\n    Handler are callables that do not take any arguments, and may be either\n    standard functions, or async functions.\n\n    Attributes:\n        func typing.Callable[[], typing.Any]: Func to callable after engine stops\n\n    !!! Example\n        ```python title=\"Engine after stops\"\n\n        import kstreams\n\n        stream_engine = kstreams.create_engine(\n            title=\"my-stream-engine\"\n        )\n\n        @stream_engine.after_stop\n        async def after_stop() -> None:\n            print(\"Finishing backgrpund tasks\")\n        ```\n    \"\"\"\n    self._after_stop.append(func)\n    return func\n
"},{"location":"engine/#kstreams.engine.StreamEngine.add_stream","title":"add_stream(stream, error_policy=None)","text":"

Add a stream to the engine.

This method registers a new stream with the engine, setting up necessary configurations and handlers. If a stream with the same name already exists, a DuplicateStreamException is raised.

Parameters:

Name Type Description Default stream Stream

The stream to be added.

required error_policy Optional[StreamErrorPolicy]

An optional error policy to be applied to the stream. You should probably set it directly when instantiating a Stream, not here.

None

Raises:

Type Description DuplicateStreamException

If a stream with the same name already exists.

Notes
  • If the stream does not have a deserializer, the engine's deserializer is assigned to it.
  • If the stream does not have a rebalance listener, a default MetricsRebalanceListener is assigned.
  • The stream's UDF handler is set up with the provided function and engine's send method.
  • If the stream's UDF handler type is not NO_TYPING, a middleware stack is built for the stream's function.
Source code in kstreams/engine.py
def add_stream(\n    self, stream: Stream, error_policy: typing.Optional[StreamErrorPolicy] = None\n) -> None:\n    \"\"\"\n    Add a stream to the engine.\n\n    This method registers a new stream with the engine, setting up necessary\n    configurations and handlers. If a stream with the same name already exists,\n    a DuplicateStreamException is raised.\n\n    Args:\n        stream: The stream to be added.\n        error_policy: An optional error policy to be applied to the stream.\n            You should probably set directly when instanciating a Stream, not here.\n\n    Raises:\n        DuplicateStreamException: If a stream with the same name already exists.\n\n    Notes:\n        - If the stream does not have a deserializer, the engine's deserializer\n          is assigned to it.\n        - If the stream does not have a rebalance listener, a default\n          MetricsRebalanceListener is assigned.\n        - The stream's UDF handler is set up with the provided function and\n          engine's send method.\n        - If the stream's UDF handler type is not NO_TYPING, a middleware stack\n          is built for the stream's function.\n    \"\"\"\n    if self.exist_stream(stream.name):\n        raise DuplicateStreamException(name=stream.name)\n\n    if error_policy is not None:\n        stream.error_policy = error_policy\n\n    stream.backend = self.backend\n    if stream.deserializer is None:\n        stream.deserializer = self.deserializer\n    self._streams.append(stream)\n\n    if stream.rebalance_listener is None:\n        # set the stream to the listener to it will be available\n        # when the callbacks are called\n        stream.rebalance_listener = MetricsRebalanceListener()\n\n    stream.rebalance_listener.stream = stream\n    stream.rebalance_listener.engine = self\n\n    stream.udf_handler = UdfHandler(\n        next_call=stream.func,\n        send=self.send,\n        stream=stream,\n    )\n\n    # NOTE: When `no typing` support is deprecated this check can\n    # be removed\n    if stream.udf_handler.type != UDFType.NO_TYPING:\n        stream.func = self._build_stream_middleware_stack(stream=stream)\n
"},{"location":"getting_started/","title":"Getting Started","text":"

You can start using kstreams with simple producers and consumers and/or integrate it with any async framework like FastAPI

"},{"location":"getting_started/#simple-consumer-and-producer","title":"Simple consumer and producer","text":"Simple use case
import asyncio\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\"local--py-stream\", group_id=\"de-my-partition\")\nasync def consume(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for i in range(5):\n        metadata = await stream_engine.send(\"local--py-stream\", value=payload, key=\"1\")\n        print(f\"Message sent: {metadata}\")\n        await asyncio.sleep(5)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown():\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    loop = asyncio.get_event_loop()\n    try:\n        loop.run_until_complete(start())\n        loop.run_forever()\n    finally:\n        loop.run_until_complete(shutdown())\n        loop.close()\n

(This script is complete, it should run \"as is\")

"},{"location":"getting_started/#recommended-usage","title":"Recommended usage","text":"

In the previous example you can see some boilerplate regarding how to start the program. We recommend using aiorun, so you won't have to worry about setting signal handlers, shutdown callbacks, graceful shutdown, and closing the event loop.

Usage with aiorun
import asyncio\nimport aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\"local--py-stream\", group_id=\"de-my-partition\")\nasync def consume(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for i in range(5):\n        metadata = await stream_engine.send(\"local--py-stream\", value=payload, key=\"1\")\n        print(f\"Message sent: {metadata}\")\n        await asyncio.sleep(5)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n

(This script is complete, it should run \"as is\")

"},{"location":"getting_started/#fastapi","title":"FastAPI","text":"

The following code example shows how kstreams can be integrated with any async framework like FastAPI. The full example can be found here

First, we need to create an engine:

Create the StreamEngine
# streaming.engine.py\nfrom kstreams import create_engine\n\nstream_engine = create_engine(\n    title=\"my-stream-engine\",\n)\n

Define the streams:

Application stream
# streaming.streams.py\nfrom .engine import stream_engine\nfrom kstreams import ConsumerRecord\n\n\n@stream_engine.stream(\"local--kstream\")\nasync def stream(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n

Create the FastAPI:

FastAPI
# app.py\nfrom fastapi import FastAPI\nfrom starlette.responses import Response\nfrom starlette_prometheus import PrometheusMiddleware, metrics\n\nfrom .streaming.streams import stream_engine\n\napp = FastAPI()\n\n@app.on_event(\"startup\")\nasync def startup_event():\n    await stream_engine.start()\n\n@app.on_event(\"shutdown\")\nasync def shutdown_event():\n    await stream_engine.stop()\n\n\n@app.get(\"/events\")\nasync def post_produce_event() -> Response:\n    payload = '{\"message\": \"hello world!\"}'\n\n    metadata = await stream_engine.send(\n        \"local--kstream\",\n        value=payload.encode(),\n    )\n    msg = (\n        f\"Produced event on topic: {metadata.topic}, \"\n        f\"part: {metadata.partition}, offset: {metadata.offset}\"\n    )\n\n    return Response(msg)\n\n\napp.add_middleware(PrometheusMiddleware, filter_unhandled_paths=True)\napp.add_api_route(\"/metrics\", metrics)\n
"},{"location":"getting_started/#changing-kafka-settings","title":"Changing Kafka settings","text":"

To modify the settings of a cluster, like the servers, refer to the backends docs
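As a minimal sketch (assuming a locally reachable cluster and leaving every other backend option at its default), a custom backend with your own servers can be passed to create_engine:

from kstreams import backends, create_engine

# hypothetical broker addresses; replace them with your own cluster
backend = backends.Kafka(bootstrap_servers=["kafka-1:9092", "kafka-2:9092"])

stream_engine = create_engine(
    title="my-stream-engine",
    backend=backend,
)

The backends docs cover the remaining options, such as the security settings.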

"},{"location":"large_project_structure/","title":"Large Projects","text":"

If you have a large project with multiple streams, we recommend the following project structure:

\u251c\u2500\u2500 my-project\n\u2502   \u251c\u2500\u2500 my_project\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 __init__.py\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 app.py\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 resources.py\n\u2502   \u2502\u00a0\u00a0 \u251c\u2500\u2500 streams.py\n\u2502   \u2502\u00a0\u00a0 \u2514\u2500\u2500 streams_roster.py\n\u2502   \u2502\u2500\u2500 tests\n\u2502   \u2502   \u251c\u2500\u2500 __init__.py\n\u2502   \u2502   \u251c\u2500\u2500 conftest.py\n\u2502   \u2502\u2500\u2500 pyproject.toml\n\u2502   \u2502\u2500\u2500 README.md\n
  • The file my_project/resources.py contains the creation of the StreamEngine
  • The file my_project/app.py contains the entrypoint of your program
  • The file my_project/streams.py contains all the Streams

A full project example ready to use can be found here

Note

This is just a recommendation; there are many ways to structure your project

"},{"location":"large_project_structure/#resources","title":"Resources","text":"

This Python module contains any global resources that will be used later in the application, for example DB connections or the StreamEngine. Typically we will have the following:

from kstreams import backends, create_engine\n\nbackend = backends.Kafka(\n    bootstrap_servers=[\"localhost:9092\"],\n    security_protocol=backends.kafka.SecurityProtocol.PLAINTEXT,\n)\n\nstream_engine = create_engine(\n    title=\"my-stream-engine\",\n    backend=backend,\n)\n

Then later stream_engine can be reused to start the application.

"},{"location":"large_project_structure/#streams","title":"Streams","text":"

When starting your project you can have any number of Streams, each with its handler, for example in a streams.py module. All of the Streams will run next to each other, and because they are in the same project it is easy to share common code. However, this comes with a scalability downside, as it is not possible to take advantage of Kafka and scale up Streams individually. In future versions the StreamEngine will be able to select which Stream(s) should run to mitigate this issue. Typically, your streams.py will look like:

from kstreams import Stream\n\nfrom .streams_roster import stream_roster, stream_two_roster\n\n\nmy_stream = Stream(\n    \"local--hello-world\",\n    func=stream_roster,\n    config={\n        \"group_id\": \"example-group\",\n    },\n    ...\n)\n\nmy_second_stream = Stream(\n    \"local--hello-world-2\",\n    func=stream_two_roster,\n    config={\n        \"group_id\": \"example-group-2\",\n    },\n    ...\n)\n\n...\n

and streams_roster.py contains all the coroutines that will be executed when an event arrives

import logging\n\nfrom kstreams import ConsumerRecord, Send, Stream\n\nlogger = logging.getLogger(__name__)\n\n\nasync def stream_roster(cr: ConsumerRecord, send: Send) -> None:\n    logger.info(f\"showing bytes: {cr.value}\")\n    value = f\"Event confirmed. {cr.value}\"\n\n    await send(\n        \"another-topic-to-wink\",\n        value=value.encode(),\n        key=\"1\",\n    )\n\n\nasync def stream_two_roster(cr: ConsumerRecord, send: Send, stream: Stream) -> None:\n    ...\n

It is worth noting three things:

  • We separate the Stream from its coroutine to be able to test the business logic easily
  • If you need to produce events inside a Stream add the send coroutine using dependency-injection
  • We are not using StreamEngine at all to avoid circular import errors
"},{"location":"large_project_structure/#application","title":"Application","text":"

The entrypoint is usually in app.py. The module contains the import of stream_engine, its hooks, and the streams to be added to the engine:

import aiorun\nimport asyncio\nimport logging\n\nfrom kstreams.stream_utils import StreamErrorPolicy\n\nfrom .resources import stream_engine\nfrom .streams import my_stream, my_second_stream\n\nlogger = logging.getLogger(__name__)\n\n\n# hooks\n@stream_engine.after_startup\nasync def init_events():\n    await stream_engine.send(\"local--hello-world\", value=\"Hi Kstreams!\")\n\n\n# add the stream to the stream_engine\nstream_engine.add_stream(my_stream, error_policy=StreamErrorPolicy.RESTART)\nstream_engine.add_stream(my_second_stream, error_policy=StreamErrorPolicy.STOP_ENGINE)\n\n\nasync def start():\n    await stream_engine.start()\n\n\nasync def stop(loop: asyncio.AbstractEventLoop):\n    await stream_engine.stop()\n\n\ndef main():\n    logging.basicConfig(level=logging.INFO)\n    logger.info(\"Starting application...\")\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=stop)\n

To run it we recommend aiorun. It can also be run with asyncio directly, but aiorun takes care of all the boilerplate for us.

"},{"location":"large_project_structure/#tests","title":"Tests","text":"

In this module you test your application using the TestStreamClient, usually provided as a fixture thanks to pytest. The package pytest-asyncio is also needed to test async code.

# conftest.py\nimport pytest\n\nfrom kstreams.test_utils import TestStreamClient\n\nfrom my_project.resources import stream_engine\n\n\n@pytest.fixture\ndef stream_client():\n    return TestStreamClient(stream_engine=stream_engine)\n

then you can test your streams

# test_app.py\nimport pytest\n\n\n@pytest.mark.asyncio\nasync def test_my_stream(stream_client):\n    topic = \"local--hello-world\"  # Use the same topic as the stream\n    event = b'{\"message\": \"Hello world!\"}'\n\n    async with stream_client:\n        metadata = await stream_client.send(topic, value=event, key=\"1\")\n        assert metadata.topic == topic\n
"},{"location":"metrics/","title":"Metrics","text":"

Metrics are generated by prometheus_client. You are responsible for setting up a webserver to expose them.
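For a worker without an HTTP framework, a minimal sketch could rely on the standalone server that prometheus_client ships with (the port is only an example):

from prometheus_client import start_http_server

# Expose the default prometheus_client registry over HTTP so Prometheus can
# scrape the gauges listed below; call this before starting the engine.
start_http_server(8080)

If you run kstreams inside an ASGI application, mounting a metrics route (as in the FastAPI example of the Getting Started section) works as well.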

"},{"location":"metrics/#metrics","title":"Metrics","text":""},{"location":"metrics/#producer","title":"Producer","text":"
  • topic_partition_offsets: Gauge of offsets per topic/partition
"},{"location":"metrics/#consumer","title":"Consumer","text":"
  • consumer_committed: Gauge of the consumer committed offset per topic/partition in a consumer group
  • consumer_position: Gauge of consumer current position per topic/partition in a consumer group
  • consumer_highwater: Gauge of consumer highwater per topic/partition in a consumer group
  • consumer_lag: Gauge of current consumer lag per topic/partition in a consumer group calculated with the last committed offset
  • position_lag: Gauge of current consumer position_lag per topic/partition in a consumer group calculated using the consumer position
"},{"location":"middleware/","title":"Middleware","text":"

Kstreams allows you to include middlewares for adding behavior to streams.

A middleware is a callable that works with every ConsumerRecord (CR) before and after it is processed by a specific stream. Middlewares also have access to the stream and send function.

  • It takes each CR that arrives at a Kafka topic.
  • Then it can do something to the CR or run any needed code.
  • Then it passes the CR to be processed by another callable (other middleware or stream).
  • Once the CR is processed by the stream, the chain is \"completed\".
  • If there is code after the self.next_call(cr) then it will be executed.

Kstreams middlewares follow this protocol:

Bases: Protocol

Source code in kstreams/middleware/middleware.py
class MiddlewareProtocol(typing.Protocol):\n    next_call: types.NextMiddlewareCall\n    send: types.Send\n    stream: \"Stream\"\n\n    def __init__(\n        self,\n        *,\n        next_call: types.NextMiddlewareCall,\n        send: types.Send,\n        stream: \"Stream\",\n        **kwargs: typing.Any,\n    ) -> None: ...  #  pragma: no cover\n\n    async def __call__(\n        self, cr: types.ConsumerRecord\n    ) -> typing.Any: ...  #  pragma: no cover\n

Note

The __call__ method can return anything, so previous calls can use the returned value. Make sure that the line return await self.next_call(cr) is in your method.

Warning

Middlewares only work with the new Dependency Injection approach

"},{"location":"middleware/#creating-a-middleware","title":"Creating a middleware","text":"

To create a middleware you have to create a class that inherits from BaseMiddleware. Then, the method async def __call__ must be defined. Let's consider that we want to save the CR to elastic before it is processed:

import typing\n\nfrom kstreams import ConsumerRecord, middleware\n\nasync def save_to_elastic(cr: ConsumerRecord) -> None:\n    ...\n\n\nclass ElasticMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord) -> typing.Any:\n        # save to elastic before calling the next\n        await save_to_elastic(cr)\n\n        # the next call could be another middleware\n        return await self.next_call(cr)\n

Then, we have to include the middleware:

from kstreams import ConsumerRecord, middleware\n\nfrom .engine import stream_engine\n\n\nmiddlewares = [middleware.Middleware(ElasticMiddleware)]\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    ...\n

Note

The Middleware concept also applies for async generators (yield from a stream)
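For instance, a rough sketch of such a stream, reusing the ElasticMiddleware defined above and assuming the async generator style in which the handler yields values instead of returning them:

from kstreams import ConsumerRecord, middleware

from .engine import stream_engine

# ElasticMiddleware is the class defined in the example above
middlewares = [middleware.Middleware(ElasticMiddleware)]


@stream_engine.stream("kstreams-topic", middlewares=middlewares)
async def processor(cr: ConsumerRecord):
    # the middleware chain still runs for every ConsumerRecord before it
    # reaches this handler; the handler simply yields instead of returning
    yield cr.value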

"},{"location":"middleware/#adding-extra-configuration-to-middlewares","title":"Adding extra configuration to middlewares","text":"

If you want to provide extra configuration to a middleware, you should override the __init__ method with the extra options as keyword arguments and then call super().__init__(**kwargs)

Let's consider that we want to send an event to a specific topic when a ValueError is raised inside a stream (Dead Letter Queue)

from kstreams import ConsumerRecord, types, Stream, middleware\n\n\nclass DLQMiddleware(middleware.BaseMiddleware):\n    def __init__(self, *, topic: str, **kwargs) -> None:\n        super().__init__(**kwargs)\n        self.topic = topic\n\n    async def __call__(self, cr: ConsumerRecord):\n        try:\n            return await self.next_call(cr)\n        except ValueError:\n            await self.send(self.topic, key=cr.key, value=cr.value)\n\n\n# Create the middlewares\nmiddlewares = [\n    middleware.Middleware(\n        DLQMiddleware, topic=\"kstreams-dlq-topic\"\n    )\n]\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    if cr.value == b\"joker\":\n        raise ValueError(\"Joker received...\")\n
"},{"location":"middleware/#default-middleware","title":"Default Middleware","text":"

This is always the first Middleware in the middleware stack to catch any exception that might occur. Any exception raised when consuming events that is not handled by the end user will be handled by this ExceptionMiddleware, which executes the error_policy that was established.

Source code in kstreams/middleware/middleware.py
class ExceptionMiddleware(BaseMiddleware):\n    \"\"\"\n    This is always the first Middleware in the middleware stack\n    to catch any exception that might occur. Any exception raised\n    when consuming events that is not handled by the end user\n    will be handled by this ExceptionMiddleware executing the\n    policy_error that was stablished.\n    \"\"\"\n\n    def __init__(\n        self, *, engine: \"StreamEngine\", error_policy: StreamErrorPolicy, **kwargs\n    ) -> None:\n        super().__init__(**kwargs)\n        self.engine = engine\n        self.error_policy = error_policy\n\n    async def __call__(self, cr: types.ConsumerRecord) -> typing.Any:\n        try:\n            return await self.next_call(cr)\n        except Exception as exc:\n            logger.exception(\n                \"Unhandled error occurred while listening to the stream. \"\n                f\"Stream consuming from topics {self.stream.topics} CRASHED!!! \\n\\n \"\n            )\n            if sys.version_info >= (3, 11):\n                exc.add_note(f\"Handler: {self.stream.func}\")\n                exc.add_note(f\"Topics: {self.stream.topics}\")\n\n            await self.cleanup_policy(exc)\n\n    async def cleanup_policy(self, exc: Exception) -> None:\n        \"\"\"\n        Execute cleanup policy according to the Stream configuration.\n\n        At this point we are inside the asyncio.Lock `is_processing`\n        as an event is being processed and an exeption has occured.\n        The Lock must be released to stop the Stream\n        (which must happen for any policy), then before re-raising\n        the exception the Lock must be acquire again to continue the processing\n\n        Exception and policies:\n\n            - STOP: The exception is re-raised as the Stream will be stopped\n              and the end user will deal with it\n\n            - STOP_ENGINE: The exception is re-raised as the Engine will be stopped\n              (all Streams and Producer) and the end user will deal with it\n\n            - RESTART: The exception is not re-raised as the Stream\n              will recover and continue the processing. The logger.exception\n              from __call__ will record that something went wrong\n\n            - STOP_APPLICATION: The exception is not re-raised as the entire\n              application will be stopped. This is only useful when using kstreams\n              with another library like FastAPI. 
The logger.exception\n              from __call__ will record that something went wrong\n\n        Args:\n            exc (Exception): Any Exception that causes the Stream to crash\n\n        Raises:\n            exc: Exception is the policy is `STOP` or `STOP_ENGINE`\n        \"\"\"\n        self.stream.is_processing.release()\n\n        if self.error_policy == StreamErrorPolicy.RESTART:\n            await self.stream.stop()\n            await self.stream.start()\n        elif self.error_policy == StreamErrorPolicy.STOP:\n            await self.stream.stop()\n            # acquire `is_processing` Lock again to resume processing\n            # and avoid `RuntimeError: Lock is not acquired.`\n            await self.stream.is_processing.acquire()\n            raise exc\n        elif self.error_policy == StreamErrorPolicy.STOP_ENGINE:\n            await self.engine.stop()\n            # acquire `is_processing` Lock again to resume processing\n            # and avoid `RuntimeError: Lock is not acquired.`\n            await self.stream.is_processing.acquire()\n            raise exc\n        else:\n            # STOP_APPLICATION\n            await self.engine.stop()\n            await self.stream.is_processing.acquire()\n            signal.raise_signal(signal.SIGTERM)\n
"},{"location":"middleware/#kstreams.middleware.middleware.ExceptionMiddleware.cleanup_policy","title":"cleanup_policy(exc) async","text":"

Execute cleanup policy according to the Stream configuration.

At this point we are inside the asyncio.Lock is_processing, as an event is being processed and an exception has occurred. The Lock must be released to stop the Stream (which must happen for any policy); then, before re-raising the exception, the Lock must be acquired again to continue processing.

Exception and policies:

- STOP: The exception is re-raised as the Stream will be stopped\n  and the end user will deal with it\n\n- STOP_ENGINE: The exception is re-raised as the Engine will be stopped\n  (all Streams and Producer) and the end user will deal with it\n\n- RESTART: The exception is not re-raised as the Stream\n  will recover and continue the processing. The logger.exception\n  from __call__ will record that something went wrong\n\n- STOP_APPLICATION: The exception is not re-raised as the entire\n  application will be stopped. This is only useful when using kstreams\n  with another library like FastAPI. The logger.exception\n  from __call__ will record that something went wrong\n

Parameters:

Name Type Description Default exc Exception

Any Exception that causes the Stream to crash

required

Raises:

Type Description exc

Exception if the policy is STOP or STOP_ENGINE

Source code in kstreams/middleware/middleware.py
async def cleanup_policy(self, exc: Exception) -> None:\n    \"\"\"\n    Execute cleanup policy according to the Stream configuration.\n\n    At this point we are inside the asyncio.Lock `is_processing`\n    as an event is being processed and an exeption has occured.\n    The Lock must be released to stop the Stream\n    (which must happen for any policy), then before re-raising\n    the exception the Lock must be acquire again to continue the processing\n\n    Exception and policies:\n\n        - STOP: The exception is re-raised as the Stream will be stopped\n          and the end user will deal with it\n\n        - STOP_ENGINE: The exception is re-raised as the Engine will be stopped\n          (all Streams and Producer) and the end user will deal with it\n\n        - RESTART: The exception is not re-raised as the Stream\n          will recover and continue the processing. The logger.exception\n          from __call__ will record that something went wrong\n\n        - STOP_APPLICATION: The exception is not re-raised as the entire\n          application will be stopped. This is only useful when using kstreams\n          with another library like FastAPI. The logger.exception\n          from __call__ will record that something went wrong\n\n    Args:\n        exc (Exception): Any Exception that causes the Stream to crash\n\n    Raises:\n        exc: Exception is the policy is `STOP` or `STOP_ENGINE`\n    \"\"\"\n    self.stream.is_processing.release()\n\n    if self.error_policy == StreamErrorPolicy.RESTART:\n        await self.stream.stop()\n        await self.stream.start()\n    elif self.error_policy == StreamErrorPolicy.STOP:\n        await self.stream.stop()\n        # acquire `is_processing` Lock again to resume processing\n        # and avoid `RuntimeError: Lock is not acquired.`\n        await self.stream.is_processing.acquire()\n        raise exc\n    elif self.error_policy == StreamErrorPolicy.STOP_ENGINE:\n        await self.engine.stop()\n        # acquire `is_processing` Lock again to resume processing\n        # and avoid `RuntimeError: Lock is not acquired.`\n        await self.stream.is_processing.acquire()\n        raise exc\n    else:\n        # STOP_APPLICATION\n        await self.engine.stop()\n        await self.stream.is_processing.acquire()\n        signal.raise_signal(signal.SIGTERM)\n
"},{"location":"middleware/#middleware-chain","title":"Middleware chain","text":"

It is possible to add as many middlewares as you want to split and reuse business logic; however, the downside is extra complexity and the code might become slower. The middleware order is important, as middlewares are evaluated in the order in which they were placed in the stream.

In the following example we are adding three middlewares in this order: DLQMiddleware, ElasticMiddleware, and S3Middleware. The chain execution will be:

sequenceDiagram\n    autonumber\n    ExceptionMiddleware->>DLQMiddleware: \n    Note left of ExceptionMiddleware: Event received\n    alt No Processing Error\n    DLQMiddleware->>ElasticMiddleware: \n    Note right of ElasticMiddleware: Store CR on Elastic\n    ElasticMiddleware->>S3Middleware: \n    Note right of S3Middleware: Store CR on S3\n    S3Middleware->>Stream: \n    Note right of Stream: CR processed\n    Stream-->>S3Middleware: \n    S3Middleware-->>ElasticMiddleware: \n    ElasticMiddleware-->>DLQMiddleware: \n    DLQMiddleware-->>ExceptionMiddleware: \n    end
Multiple middlewares example
from kstreams import ConsumerRecord, Stream, middleware\n\n\nclass DLQMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        try:\n            return await self.next_call(cr)\n        except ValueError:\n            await dlq(cr.value)\n\n\nclass ElasticMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        await save_to_elastic(cr.value)\n        return await self.next_call(cr)\n\n\nclass S3Middleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        await backup_to_s3(cr.value)\n        return await self.next_call(cr)\n\n\nmiddlewares = [\n    middleware.Middleware(DLQMiddleware),\n    middleware.Middleware(ElasticMiddleware),\n    middleware.Middleware(S3Middleware),\n]\n\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    if cr.value == event_2:\n        raise ValueError(\"Error from stream...\")\n    await save_to_db(cr.value)\n

Note

In the example we can see that the cr will always be saved to elastic and s3, regardless of any error

"},{"location":"middleware/#executing-code-after-the-cr-was-processed","title":"Executing Code after the CR was processed","text":"

As mentioned in the introduction, it is possible to execute code after the CR is handled. To do this, we need to place code after next_call is called:

Execute code after CR is handled
from kstreams import ConsumerRecord, Stream, middleware\n\n\nclass DLQMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        try:\n            return await self.next_call(cr)\n        except ValueError:\n            await dlq(cr.value)\n\n\nclass ElasticMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        result = await self.next_call(cr)\n        # This will be called after the whole chain has finished\n        await save_to_elastic(cr.value)\n        return result\n\n\nmiddlewares = [\n    middleware.Middleware(DLQMiddleware),\n    middleware.Middleware(ElasticMiddleware),\n]\n\n\n@stream_engine.stream(\"kstreams-topic\", middlewares=middlewares)\nasync def processor(cr: ConsumerRecord):\n    if cr.value == event_2:\n        raise ValueError(\"Error from stream...\")\n    await save_to_db(cr.value)\n

Note

In the example we can see that the event is saved to elastic only if there is no error

"},{"location":"middleware/#deserialization","title":"Deserialization","text":"

To deserialize bytes into a different structure, like a dict, middlewares are the preferred way to do it. Examples:

Source code in examples/dataclasses-avroschema-example/dataclasses_avroschema_example/middlewares.py
class AvroDeserializerMiddleware(middleware.BaseMiddleware):\n    def __init__(self, *, model: AvroModel, **kwargs) -> None:\n        super().__init__(**kwargs)\n        self.model = model\n\n    async def __call__(self, cr: ConsumerRecord):\n        \"\"\"\n        Deserialize a payload to an AvroModel\n        \"\"\"\n        if cr.value is not None:\n            data = self.model.deserialize(cr.value)\n            cr.value = data\n        return await self.next_call(cr)\n
Source code in examples/confluent-example/confluent_example/middlewares.py
class ConfluentMiddlewareDeserializer(\n    middleware.BaseMiddleware, AsyncAvroMessageSerializer\n):\n    def __init__(\n        self,\n        *,\n        schema_registry_client: AsyncSchemaRegistryClient,\n        reader_schema: Optional[schema.AvroSchema] = None,\n        return_record_name: bool = False,\n        **kwargs,\n    ):\n        super().__init__(**kwargs)\n        self.schemaregistry_client = schema_registry_client\n        self.reader_schema = reader_schema\n        self.return_record_name = return_record_name\n        self.id_to_decoder_func: Dict = {}\n        self.id_to_writers: Dict = {}\n\n    async def __call__(self, cr: ConsumerRecord):\n        \"\"\"\n        Deserialize the event to a dict\n        \"\"\"\n        data = await self.decode_message(cr.value)\n        cr.value = data\n        return await self.next_call(cr)\n
"},{"location":"monitoring/","title":"Monitoring","text":"

This page discusses how to monitor your application using the Kafka metrics that are accessible in Prometheus.

Before we begin, it's crucial to note that Kafka itself makes a number of useful metrics available, including metrics for the cluster, brokers, and clients (producers and consumers).

This means that we can quickly add some graphs to our dashboards by utilizing the already-exposed metrics.

Kstreams includes a collection of metrics. See Metrics Docs for more information.

"},{"location":"monitoring/#kstreams.PrometheusMonitor","title":"kstreams.PrometheusMonitor","text":"

Metrics monitor to keep track of Producers and Consumers.

Attributes: metrics_scrape_time float: Number of seconds that the monitor will wait until the next scrape iteration

Source code in kstreams/prometheus/monitor.py
class PrometheusMonitor:\n    \"\"\"\n    Metrics monitor to keep track of Producers and Consumers.\n\n     Attributes:\n        metrics_scrape_time float: Amount of seconds that the monitor\n            will wait until next scrape iteration\n    \"\"\"\n\n    # Producer metrics\n    MET_OFFSETS = Gauge(\n        \"topic_partition_offsets\", \"help producer offsets\", [\"topic\", \"partition\"]\n    )\n\n    # Consumer metrics\n    MET_COMMITTED = Gauge(\n        \"consumer_committed\",\n        \"help consumer committed\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_POSITION = Gauge(\n        \"consumer_position\",\n        \"help consumer position\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_HIGHWATER = Gauge(\n        \"consumer_highwater\",\n        \"help consumer highwater\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_LAG = Gauge(\n        \"consumer_lag\",\n        \"help consumer lag calculated using the last commited offset\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n    MET_POSITION_LAG = Gauge(\n        \"position_lag\",\n        \"help consumer position lag calculated using the consumer position\",\n        [\"topic\", \"partition\", \"consumer_group\"],\n    )\n\n    def __init__(self, metrics_scrape_time: float = 3):\n        self.metrics_scrape_time = metrics_scrape_time\n        self.running = False\n        self._producer = None\n        self._streams: List[Stream] = []\n\n    async def start(self) -> None:\n        self.running = True\n        logger.info(\"Starting Prometheus Monitoring started...\")\n        await self._metrics_task()\n\n    async def stop(self) -> None:\n        self.running = False\n        self._clean_consumer_metrics()\n        logger.info(\"Prometheus Monitoring stopped...\")\n\n    def add_topic_partition_offset(\n        self, topic: str, partition: int, offset: int\n    ) -> None:\n        self.MET_OFFSETS.labels(topic=topic, partition=partition).set(offset)\n\n    def _add_consumer_metrics(self, metrics_dict: MetricsType):\n        for topic_partition, partitions_metadata in metrics_dict.items():\n            group_id = partitions_metadata[\"group_id\"]\n            position = partitions_metadata[\"position\"]\n            committed = partitions_metadata[\"committed\"]\n            highwater = partitions_metadata[\"highwater\"]\n            lag = partitions_metadata[\"lag\"]\n            position_lag = partitions_metadata[\"position_lag\"]\n\n            self.MET_COMMITTED.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(committed or 0)\n            self.MET_POSITION.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(position or -1)\n            self.MET_HIGHWATER.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(highwater or 0)\n            self.MET_LAG.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n                consumer_group=group_id,\n            ).set(lag or 0)\n            self.MET_POSITION_LAG.labels(\n                topic=topic_partition.topic,\n                partition=topic_partition.partition,\n            
    consumer_group=group_id,\n            ).set(position_lag or 0)\n\n    def _clean_consumer_metrics(self) -> None:\n        \"\"\"\n        This method should be called when a rebalance takes place\n        to clean all consumers metrics. When the rebalance finishes\n        new metrics will be generated per consumer based on the\n        consumer assigments\n        \"\"\"\n        self.MET_LAG.clear()\n        self.MET_POSITION_LAG.clear()\n        self.MET_COMMITTED.clear()\n        self.MET_POSITION.clear()\n        self.MET_HIGHWATER.clear()\n\n    def clean_stream_consumer_metrics(self, consumer: Consumer) -> None:\n        topic_partitions = consumer.assignment()\n        group_id = consumer._group_id\n        for topic_partition in topic_partitions:\n            topic = topic_partition.topic\n            partition = topic_partition.partition\n\n            metrics_found = False\n            for sample in list(self.MET_LAG.collect())[0].samples:\n                if {\n                    \"topic\": topic,\n                    \"partition\": str(partition),\n                    \"consumer_group\": group_id,\n                } == sample.labels:\n                    metrics_found = True\n\n            if metrics_found:\n                self.MET_LAG.remove(topic, partition, group_id)\n                self.MET_POSITION_LAG.remove(topic, partition, group_id)\n                self.MET_COMMITTED.remove(topic, partition, group_id)\n                self.MET_POSITION.remove(topic, partition, group_id)\n                self.MET_HIGHWATER.remove(topic, partition, group_id)\n            else:\n                logger.debug(\n                    \"Metrics for consumer with group-id: \"\n                    f\"{consumer._group_id} not found\"\n                )\n\n    def add_producer(self, producer):\n        self._producer = producer\n\n    def add_streams(self, streams):\n        self._streams = streams\n\n    async def generate_consumer_metrics(self, consumer: Consumer):\n        \"\"\"\n        Generate Consumer Metrics for Prometheus\n\n        Format:\n            {\n                \"topic-1\": {\n                    \"1\": (\n                        [topic-1, partition-number, 'group-id-1'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                    \"2\": (\n                        [topic-1, partition-number, 'group-id-1'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                },\n                ...\n                \"topic-n\": {\n                    \"1\": (\n                        [topic-n, partition-number, 'group-id-n'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                    \"2\": (\n                        [topic-n, partition-number, 'group-id-n'],\n                        committed, position, highwater, lag, position_lag\n                    )\n                }\n            }\n        \"\"\"\n        metrics: MetricsType = DefaultDict(dict)\n\n        topic_partitions = consumer.assignment()\n\n        for topic_partition in topic_partitions:\n            committed = await consumer.committed(topic_partition) or 0\n            position = await consumer.position(topic_partition)\n            highwater = consumer.highwater(topic_partition)\n\n            lag = position_lag = None\n            if highwater:\n                lag = highwater - committed\n                position_lag = highwater - 
position\n\n            metrics[topic_partition] = {\n                \"group_id\": consumer._group_id,\n                \"committed\": committed,\n                \"position\": position,\n                \"highwater\": highwater,\n                \"lag\": lag,\n                \"position_lag\": position_lag,\n            }\n\n        self._add_consumer_metrics(metrics)\n\n    async def _metrics_task(self) -> None:\n        \"\"\"\n        Task that runs in `backgroud` to generate\n        consumer metrics.\n\n        When self.running is False the task will finish and it\n        will be safe to stop consumers and producers.\n        \"\"\"\n        while self.running:\n            await asyncio.sleep(self.metrics_scrape_time)\n            for stream in self._streams:\n                if stream.consumer is not None:\n                    try:\n                        await self.generate_consumer_metrics(stream.consumer)\n                    except RuntimeError:\n                        logger.debug(\n                            f\"Metrics for stream {stream.name} can not be generated \"\n                            \"probably because it has been removed\"\n                        )\n
"},{"location":"monitoring/#kstreams.PrometheusMonitor.generate_consumer_metrics","title":"generate_consumer_metrics(consumer) async","text":"

Generate Consumer Metrics for Prometheus

Format

{ \"topic-1\": { \"1\": ( [topic-1, partition-number, 'group-id-1'], committed, position, highwater, lag, position_lag ) \"2\": ( [topic-1, partition-number, 'group-id-1'], committed, position, highwater, lag, position_lag ) }, ... \"topic-n\": { \"1\": ( [topic-n, partition-number, 'group-id-n'], committed, position, highwater, lag, position_lag ) \"2\": ( [topic-n, partition-number, 'group-id-n'], committed, position, highwater, lag, position_lag ) } }

Source code in kstreams/prometheus/monitor.py
async def generate_consumer_metrics(self, consumer: Consumer):\n    \"\"\"\n    Generate Consumer Metrics for Prometheus\n\n    Format:\n        {\n            \"topic-1\": {\n                \"1\": (\n                    [topic-1, partition-number, 'group-id-1'],\n                    committed, position, highwater, lag, position_lag\n                )\n                \"2\": (\n                    [topic-1, partition-number, 'group-id-1'],\n                    committed, position, highwater, lag, position_lag\n                )\n            },\n            ...\n            \"topic-n\": {\n                \"1\": (\n                    [topic-n, partition-number, 'group-id-n'],\n                    committed, position, highwater, lag, position_lag\n                )\n                \"2\": (\n                    [topic-n, partition-number, 'group-id-n'],\n                    committed, position, highwater, lag, position_lag\n                )\n            }\n        }\n    \"\"\"\n    metrics: MetricsType = DefaultDict(dict)\n\n    topic_partitions = consumer.assignment()\n\n    for topic_partition in topic_partitions:\n        committed = await consumer.committed(topic_partition) or 0\n        position = await consumer.position(topic_partition)\n        highwater = consumer.highwater(topic_partition)\n\n        lag = position_lag = None\n        if highwater:\n            lag = highwater - committed\n            position_lag = highwater - position\n\n        metrics[topic_partition] = {\n            \"group_id\": consumer._group_id,\n            \"committed\": committed,\n            \"position\": position,\n            \"highwater\": highwater,\n            \"lag\": lag,\n            \"position_lag\": position_lag,\n        }\n\n    self._add_consumer_metrics(metrics)\n
"},{"location":"monitoring/#consumer-metrics","title":"Consumer Metrics","text":"

We advise including the consumer_lag in your application's grafana dashboard.

consumer_lag will show you how far your consumers are lagging behind the published events in the topic they are reading. For instance, if you have a single consumer and another team is producing millions of events, the consumer might not be able to handle them in time (where "in time" is defined by you, e.g. "a message should be consumed within an hour of being received").

Based on the lag, you will have to develop your own alerts. An alert should be pushed to Slack if you experience more than a particular amount of lag.

You will require your consumer_group name in order to design a basic dashboard using the consumer_lag.

We could add a query in Grafana like this:

sum(kafka_consumer_group_ConsumerLagMetrics_Value{topic =~ \"YOUR_OWN_TOPIC_NAME\", groupId =~\"YOUR_CONSUMER_GROUP\", name=\"SumOffsetLag\"}) by (topic)\n

Remember to replace YOUR_CONSUMER_GROUP and YOUR_OWN_TOPIC_NAME with your consumer_group and topic respectively \u2b06\ufe0f

"},{"location":"monitoring/#producer-metrics","title":"Producer Metrics","text":"

If you have producers, it's a good idea to monitor the growth of Log End Offset (LEO).

The increase in LEO indicates the number of events produced in the last N minutes.

If you know that events should occur every N minutes, you can trigger alerts if no events occur because this metric will tell you whether or not events occurred.

We could add a query in Grafana like this, where N is 10m:

sum(max(increase(kafka_log_Log_Value{name=\"LogEndOffset\", topic =~ \"TOPIC_NAME\"}[10m])) by (partition, topic)) by (topic)\n

Remember to modify TOPIC_NAME to the name of the topic you want to track \u2b06\ufe0f

"},{"location":"monitoring/#custom-business-metrics","title":"Custom Business Metrics","text":"

One benefit of Prometheus is that you can design your own custom metrics.

Scenario: Consider an event-based ordering system. Assume you receive X orders daily and ship Y orders daily. Most likely, you will create a dashboard using this data.

Fortunately, we can create our own custom metrics by using the Prometheus Python client.

You can construct a variety of metrics with Prometheus:

  • Gauge
  • Counter
  • Histogram
  • Summary

You can read more about them on the Prometheus metric types page.

In our scenario, we will most likely want a Counter for orders received and a Counter for orders shipped.

from prometheus_client import Counter\nfrom kstreams import PrometheusMonitor\n\nclass MyAppPrometheusMonitor(PrometheusMonitor):\n    def __init__(self):\n        super().__init__() # initialize kstream metrics\n        self.orders_received = Counter('orders_received', 'Amount of orders received')\n        self.orders_shipped = Counter('orders_shipped', 'Amount of orders shipped')\n\n    def increase_received(self, amount: int = 1):\n        self.orders_received.inc(amount)\n\n    def increase_shipped(self, amount: int = 1):\n        self.orders_shipped.inc(amount)\n

In our kstreams app, we can:

stream_engine = create_engine(title=\"my-engine\", monitor=MyAppPrometheusMonitor())\n\n@stream_engine.stream(\"my-special-orders\")\nasync def consume_orders_received(cr: ConsumerRecord):\n    if cr.value.status == \"NEW\":\n        stream_engine.monitor.increase_received()\n    elif cr.value.status == \"SHIPPED\":\n        stream_engine.monitor.increase_shipped()\n

Your app's prometheus would display this data, which you might utilize to build a stylish \u2728dashboard\u2728 interface.

For further details, see the Prometheus python client documentation.

"},{"location":"serialization/","title":"Serialization","text":"

Kafka's job is to move bytes from producer to consumers, through a topic.

By default, this is what kstreams does.

from kstreams import Stream\n\nfrom .streams_roster import stream_roster\n\nmy_stream = Stream(\n    \"local--hello-world\",\n    func=stream_roster,\n    config={\n        \"group_id\": \"example-group\",\n    },\n)\n

As you can see the ConsumerRecord's value is bytes.

In order to keep your code pythonic, we provide a mechanism to serialize/deserialize these bytes, into something more useful. This way, you can work with other data structures, like a dict or dataclasses.

Sometimes it is easier to work with a dict in your app, give it to kstreams, and let it transform it into bytes to be delivered to Kafka. For this situation, you need to implement kstreams.serializers.Serializer.
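As a minimal sketch (reusing the JsonSerializer shown in the Serializer protocol section below, and assuming a stream_engine created as in the previous examples), the serializer can be passed per call to send, which accepts a serializer argument:

import json
from typing import Dict, Optional


class JsonSerializer:
    async def serialize(
        self,
        payload: dict,
        headers: Optional[Dict[str, str]] = None,
        serializer_kwargs: Optional[Dict] = None,
    ) -> bytes:
        # Return the UTF-8 encoded JSON payload
        return json.dumps(payload).encode()


async def produce_event() -> None:
    # stream_engine is the engine created earlier; the dict is turned into
    # bytes by the serializer before being delivered to Kafka
    await stream_engine.send(
        "local--hello-world",
        value={"message": "Hello world!"},
        serializer=JsonSerializer(),
    )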

The other situation is when you consume from Kafka (or other brokers). Instead of dealing with bytes, you may want to receive in your function the dict ready to be used. For those cases, we need to use middleware. For example, we can implement a JsonMiddleware:

import json\n\nfrom kstreams import middleware, ConsumerRecord\n\n\nclass JsonDeserializerMiddleware(middleware.BaseMiddleware):\n    async def __call__(self, cr: ConsumerRecord):\n        if cr.value is not None:\n            data = json.loads(cr.value.decode())\n            cr.value = data\n        return await self.next_call(cr)\n

It is also possible to use kstreams.serializers.Deserializer for deserialization, but it will be deprecated.

Warning

kstreams.serializers.Deserializer will be deprecated, use middlewares instead

"},{"location":"serialization/#kstreams.serializers.Serializer","title":"kstreams.serializers.Serializer","text":"

Protocol used by the Stream to serialize.

A Protocol is similar to features in other languages, like an interface or a trait.

End users should provide their own class implementing this protocol.

For example a JsonSerializer

from typing import Optional, Dict\nimport json\n\nclass JsonSerializer:\n\n    async def serialize(\n        self,\n        payload: dict,\n        headers: Optional[Dict[str, str]] = None,\n        serializer_kwargs: Optional[Dict] = None,\n    ) -> bytes:\n        \"\"\"Return UTF-8 encoded payload\"\"\"\n        value = json.dumps(payload)\n        return value.encode()\n

Notice that you don't need to inherit anything, you just have to comply with the Protocol.

Source code in kstreams/serializers.py
class Serializer(Protocol):\n    \"\"\"Protocol used by the Stream to serialize.\n\n    A Protocol is similar to other languages features like an interface or a trait.\n\n    End users should provide their own class implementing this protocol.\n\n    For example a `JsonSerializer`\n\n    ```python\n    from typing import Optional, Dict\n    import json\n\n    class JsonSerializer:\n\n        async def serialize(\n            self,\n            payload: dict,\n            headers: Optional[Dict[str, str]] = None,\n            serializer_kwargs: Optional[Dict] = None,\n        ) -> bytes:\n            \\\"\"\"Return UTF-8 encoded payload\\\"\"\"\n            value = json.dumps(payload)\n            return value.encode()\n    ```\n\n    Notice that you don't need to inherit anything,\n    you just have to comply with the Protocol.\n    \"\"\"\n\n    async def serialize(\n        self,\n        payload: Any,\n        headers: Optional[Headers] = None,\n        serializer_kwargs: Optional[Dict] = None,\n    ) -> bytes:\n        \"\"\"\n        Implement this method to deserialize the data received from the topic.\n        \"\"\"\n        ...\n
"},{"location":"serialization/#kstreams.serializers.Serializer.serialize","title":"serialize(payload, headers=None, serializer_kwargs=None) async","text":"

Implement this method to serialize the payload into the bytes that will be sent to the topic.

Source code in kstreams/serializers.py
async def serialize(\n    self,\n    payload: Any,\n    headers: Optional[Headers] = None,\n    serializer_kwargs: Optional[Dict] = None,\n) -> bytes:\n    \"\"\"\n    Implement this method to deserialize the data received from the topic.\n    \"\"\"\n    ...\n
"},{"location":"serialization/#kstreams.serializers.Deserializer","title":"kstreams.serializers.Deserializer","text":"

Protocol used by the Stream to deserialize.

A Protocol is similar to features in other languages, like an interface or a trait.

End users should provide their own class implementing this protocol.

For example a JsonDeserializer

import json\nfrom kstreams import ConsumerRecord\n\nclass JsonDeserializer:\n\n    async def deserialize(\n        self, consumer_record: ConsumerRecord, **kwargs\n    ) -> ConsumerRecord:\n        data = json.loads(consumer_record.value.decode())\n        consumer_record.value = data\n        return consumer_record\n
Source code in kstreams/serializers.py
class Deserializer(Protocol):\n    \"\"\"Protocol used by the Stream to deserialize.\n\n    A Protocol is similar to other languages features like an interface or a trait.\n\n    End users should provide their own class implementing this protocol.\n\n    For example a `JsonDeserializer`\n\n    ```python\n    import json\n    from kstreams import ConsumerRecord\n\n    class JsonDeserializer:\n\n        async def deserialize(\n            self, consumer_record: ConsumerRecord, **kwargs\n        ) -> ConsumerRecord:\n            data = json.loads(consumer_record.value.decode())\n            consumer_record.value = data\n            return consumer_record\n    ```\n    \"\"\"\n\n    async def deserialize(\n        self, consumer_record: ConsumerRecord, **kwargs\n    ) -> ConsumerRecord:\n        \"\"\"\n        Implement this method to deserialize the data received from the topic.\n        \"\"\"\n        ...\n
"},{"location":"serialization/#kstreams.serializers.Deserializer.deserialize","title":"deserialize(consumer_record, **kwargs) async","text":"

Implement this method to deserialize the data received from the topic.

Source code in kstreams/serializers.py
async def deserialize(\n    self, consumer_record: ConsumerRecord, **kwargs\n) -> ConsumerRecord:\n    \"\"\"\n    Implement this method to deserialize the data received from the topic.\n    \"\"\"\n    ...\n
"},{"location":"serialization/#usage","title":"Usage","text":"

Once you have written your serializer or deserializer, there are two ways of using them: in a generic fashion (engine level) or per stream.

"},{"location":"serialization/#initialize-the-engine-with-your-serializers","title":"Initialize the engine with your serializers","text":"

By doing this, all the streams will use these serializers by default.

stream_engine = create_engine(\n    title=\"my-stream-engine\",\n    serializer=JsonSerializer(),\n)\n
"},{"location":"serialization/#initilize-streams-with-a-deserializer-and-produce-events-with-serializers","title":"Initilize streams with a deserializer and produce events with serializers","text":"
from kstreams import middleware, ConsumerRecord\n\n\n@stream_engine.stream(topic, middlewares=[middleware.Middleware(JsonDeserializerMiddleware)])\nasync def hello_stream(cr: ConsumerRecord):\n    # remember event.value is now a dict\n    print(cr.value[\"message\"])\n    save_to_db(cr)\n
await stream_engine.send(\n    topic,\n    value={\"message\": \"test\"},\n    headers={\"content-type\": consts.APPLICATION_JSON},\n    key=\"1\",\n)\n
"},{"location":"stream/","title":"Streams","text":"

A Stream in kstreams is an extension of AIOKafkaConsumer

Consuming can be done using kstreams.Stream. You only need to decorate a coroutine with @stream_engine.stream. The decorator has the same API as the aiokafka consumer at initialization; in other words, it accepts the same args and kwargs that the aiokafka consumer accepts.

"},{"location":"stream/#kstreams.streams.Stream","title":"kstreams.streams.Stream","text":"

Attributes:

Name Type Description name Optional[str]

Stream name. Default is a generated uuid4

topics List[str]

List of topics to consume

subscribe_by_pattern bool

Whether to subscribe to topics by pattern

backend Kafka

backend kstreams.backends.kafka.Kafka: Backend to connect. Default Kafka

func Callable[[Stream], Awaitable[Any]]

Coroutine function or generator to be called when an event arrives

config Dict[str, Any]

Stream configuration. Here all the properties can be passed in the dictionary

deserializer Deserializer

Deserializer to be used when an event is consumed

initial_offsets List[TopicPartitionOffset]

List of TopicPartitionOffset that will seek the initial offsets to

rebalance_listener RebalanceListener

Listener callbacks for when partitions are assigned or revoked

"},{"location":"stream/#kstreams.streams.Stream--subscribe-to-a-topic","title":"Subscribe to a topic","text":"

Example

import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\"local--kstreams\", group_id=\"my-group-id\")\nasync def stream(cr: ConsumerRecord) -> None:\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nasync def start():\n    await stream_engine.start()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(\n        start(),\n        stop_on_unhandled_errors=True,\n        shutdown_callback=shutdown\n    )\n
"},{"location":"stream/#kstreams.streams.Stream--subscribe-to-multiple-topics","title":"Subscribe to multiple topics","text":"

Consuming from multiple topics using one stream is possible. A List[str] of topics must be provided.

Example

import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\n    [\"local--kstreams\", \"local--hello-world\"],\n    group_id=\"my-group-id\",\n)\nasync def consume(cr: ConsumerRecord) -> None:\n    print(f\"Event from {cr.topic}: headers: {cr.headers}, payload: {cr.value}\")\n
"},{"location":"stream/#kstreams.streams.Stream--subscribe-to-topics-by-pattern","title":"Subscribe to topics by pattern","text":"

In the following example the stream will subscribe to any topic that matches the regex ^dev--customer-.*, for example dev--customer-invoice or dev--customer-profile. The subscribe_by_pattern flag must be set to True.

Example

import aiorun\nfrom kstreams import create_engine, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\n    topics=\"^dev--customer-.*$\",\n    subscribe_by_pattern=True,\n    group_id=\"my-group-id\",\n)\nasync def stream(cr: ConsumerRecord) -> None:\n    if cr.topic == \"dev--customer-invoice\":\n        print(\"Event from topic dev--customer-invoice\")\n    elif cr.topic == \"dev--customer-profile\":\n        print(\"Event from topic dev--customer-profile\")\n    else:\n        raise ValueError(f\"Invalid topic {cr.topic}\")\n\n\nasync def start():\n    await stream_engine.start()\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(\n        start(),\n        stop_on_unhandled_errors=True,\n        shutdown_callback=shutdown\n    )\n
"},{"location":"stream/#dependency-injection","title":"Dependency Injection","text":"

The old way to iterate over a stream is with the async for _ in stream loop. The iterable approach works, but in most cases end users are interested only in the ConsumerRecord; for this reason it is possible to remove the async for loop by using proper type hints. The available type hints are:

  • ConsumerRecord: The aiokafka ConsumerRecord that will be received every time that a new event is in the Stream
  • Stream: The Stream object that is subscribed to the topic/s. Useful when manual commit is enabled or when other Stream operations are needed
  • Send: Coroutine to produce events. The same as stream_engine.send(...)

If you use type hints, then every time a new event is in the stream, the coroutine function defined by the end user will be awaited with the specified types.

ConsumerRecord / ConsumerRecord and Stream / ConsumerRecord, Stream and Send / Old fashion
@stream_engine.stream(topic)\nasync def my_stream(cr: ConsumerRecord):\n    print(cr.value)\n
@stream_engine.stream(topic, enable_auto_commit=False)\nasync def my_stream(cr: ConsumerRecord, stream: Stream):\n    print(cr.value)\n    await stream.commit()\n
@stream_engine.stream(topic, enable_auto_commit=False)\nasync def my_stream(cr: ConsumerRecord, stream: Stream, send: Send):\n    print(cr.value)\n    await stream.commit()\n    await send(\"sink-to-elastic-topic\", value=cr.value)\n
@stream_engine.stream(topic)\nasync def consume(stream):  # you can specify the type but it will be the same result\n    async for cr in stream:\n        print(cr.value)\n        # you can do something with the stream as well!!\n

Note

The type arguments can be in any order. This might change in the future.

Warning

It is still possible to use the async for in loop, but it might be removed in the future. Migrate to the typing approach

"},{"location":"stream/#creating-a-stream-instance","title":"Creating a Stream instance","text":"

If for any reason you need to create Stream instances directly, you can do it without using the decorator stream_engine.stream.

Stream instance
import aiorun\nfrom kstreams import create_engine, Stream, ConsumerRecord\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\nclass MyDeserializer:\n\n    async def deserialize(self, consumer_record: ConsumerRecord, **kwargs):\n        return consumer_record.value.decode()\n\n\nasync def stream(cr: ConsumerRecord) -> None:\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n\nstream = Stream(\n    \"local--kstreams\",\n    name=\"my-stream\",\n    func=stream,  # coroutine or async generator\n    deserializer=MyDeserializer(),\n)\n# add the stream to the engine\nstream_engine.add_stream(stream)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\nif __name__ == \"__main__\":\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n
"},{"location":"stream/#removing-a-stream-from-the-engine","title":"Removing a stream from the engine","text":"Removing stream
stream_engine.remove_stream(stream)\n
"},{"location":"stream/#starting-the-stream-with-initial-offsets","title":"Starting the stream with initial offsets","text":"

If you want to start your consumption from certain offsets, you can include that in your stream instantiation.

Use case: This feature is useful if one wants to manage their own offsets, rather than committing consumed offsets to Kafka. When an application manages its own offsets and tries to start a stream, we start the stream using the initial offsets as defined in the database.

If you try to seek on a partition or topic that is not assigned to your stream, the code will ignore the seek and print out a warning. For example, if you have two consumers that are consuming from different partitions, and you try to seek for all of the partitions on each consumer, each consumer will seek for the partitions it has been assigned, and it will print out a warning log for the ones it was not assigned.

If you try to seek on offsets that are not yet present on your partition, the consumer will revert to the auto_offset_reset config. There will not be a warning, so be aware of this.

Also be aware that when your application restarts, it will most likely trigger the initial_offsets again. This means that setting initial_offsets to a hardcoded number might not get the results you expect.

Initial Offsets from Database
from kstreams import Stream, structs\n\n\ntopic_name = \"local--kstreams\"\ndb_table = ExampleDatabase()\ninitial_offset = structs.TopicPartitionOffset(topic=topic_name, partition=0, offset=db_table.offset)\n\n\nasync def my_stream(stream: Stream):\n    ...\n\n\nstream = Stream(\n    topic_name,\n    name=\"my-stream\",\n    func=my_stream,  # coroutine or async generator\n    deserializer=MyDeserializer(),\n    initial_offsets=[initial_offset],\n)\n
"},{"location":"stream/#stream-crashing","title":"Stream crashing","text":"

If your stream crashes for any reason, event consumption is stopped, meaning that no events will be consumed from the topic. However, it is possible to set one of four error policies per stream:

  • StreamErrorPolicy.STOP (default): Stop the Stream when an exception occurs. The exception is raised after the stream is properly stopped.
  • StreamErrorPolicy.RESTART: Stop and restart the Stream when an exception occurs. The event that caused the exception is skipped. The exception is NOT raised because the application should continue working; however, logger.exception() is used to alert the user.
  • StreamErrorPolicy.STOP_ENGINE: Stop the StreamEngine when an exception occurs. The exception is raised after ALL the Streams were properly stopped.
  • StreamErrorPolicy.STOP_APPLICATION: Stop the StreamEngine when an exception occurs and raise signal.SIGTERM. Useful when using kstreams with other libraries such as FastAPI.

In the following example, the StreamErrorPolicy.RESTART error policy is specified. If the Stream crashes with a ValueError exception, it is restarted:

from kstreams import create_engine, ConsumerRecord\nfrom kstreams.stream_utils import StreamErrorPolicy\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(\n    \"local--hello-world\",\n    group_id=\"example-group\",\n    error_policy=StreamErrorPolicy.RESTART\n)\nasync def stream(cr: ConsumerRecord) -> None:\n    if cr.key == b\"error\":\n        # Stream will be restarted after the ValueError is raised\n        raise ValueError(\"error....\")\n\n    print(f\"Event consumed. Payload {cr.value}\")\n

We can see the logs:

ValueError: error....\nINFO:aiokafka.consumer.group_coordinator:LeaveGroup request succeeded\nINFO:aiokafka.consumer.consumer:Unsubscribed all topics or patterns and assigned partitions\nINFO:kstreams.streams:Stream consuming from topics ['local--hello-world'] has stopped!!! \n\n\nINFO:kstreams.middleware.middleware:Restarting stream <kstreams.streams.Stream object at 0x102d44050>\nINFO:aiokafka.consumer.subscription_state:Updating subscribed topics to: frozenset({'local--hello-world'})\n...\nINFO:aiokafka.consumer.group_coordinator:Setting newly assigned partitions {TopicPartition(topic='local--hello-world', partition=0)} for group example-group\n

Note

If you are using aiorun with stop_on_unhandled_errors=True and the error_policy is StreamErrorPolicy.RESTART, then the application will NOT stop, as the exception that caused the Stream to crash is not raised.

"},{"location":"stream/#changing-consumer-behavior","title":"Changing consumer behavior","text":"

Most of the time you will only set the topic and the group_id for the consumer, but sometimes you might want more control over it, for example changing the policy for resetting offsets on OffsetOutOfRange errors or the session timeout. To do this, use the same kwargs as the aiokafka consumer API.

# The consumer session times out if no heartbeat is received within 500 ms\n# On OffsetOutOfRange errors, the offset will move to the oldest available message (\u2018earliest\u2019)\n\n@stream_engine.stream(\"local--kstream\", group_id=\"de-my-partition\", session_timeout_ms=500, auto_offset_reset=\"earliest\")\nasync def stream(cr: ConsumerRecord):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n
"},{"location":"stream/#manual-commit","title":"Manual commit","text":"

When processing sensitive data and you want to be sure that the Kafka offset is committed only once your tasks are done, you can use the consumer's enable_auto_commit=False mode.

Manual commit example
@stream_engine.stream(\"local--kstream\", group_id=\"de-my-partition\", enable_auto_commit=False)\nasync def stream(cr: ConsumerRecord, stream: Stream):\n    print(f\"Event consumed: headers: {cr.headers}, payload: {cr.value}\")\n\n    # We need to make sure that the payload was stored before committing the kafka offset\n    await store_in_database(cr.value)\n    await stream.commit()  # You need to commit!!!\n

Note

This is a tradeoff from at-most-once to at-least-once delivery; to achieve exactly-once you will need to save offsets in the destination database and validate those yourself.
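
A minimal sketch of that idea (get_last_offset and save_with_offset are hypothetical helpers backed by your destination database, not part of kstreams):

@stream_engine.stream(\"local--kstream\", group_id=\"de-my-partition\", enable_auto_commit=False)\nasync def stream(cr: ConsumerRecord, stream: Stream):\n    # skip events whose offset was already processed (deduplication)\n    if cr.offset <= await get_last_offset(cr.topic, cr.partition):\n        return\n\n    # store the payload and its offset in the same database transaction\n    await save_with_offset(cr.value, cr.topic, cr.partition, cr.offset)\n    await stream.commit()\n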

"},{"location":"stream/#yield-from-stream","title":"Yield from stream","text":"

Sometimes it is useful to yield values from a stream, so you can consume events at your own pace or return results to the frontend (SSE example). If you use the yield keyword inside a coroutine, it will be transformed into an asynchronous generator function, meaning that inside there is an async generator that can be consumed.

Consuming an async generator is simple: you just use the async for in clause. Because events are only consumed inside the for loop, you have to make sure that the Stream has been started properly before entering it and stopped properly after leaving the async for in.

To facilitate the process, we have a context manager that takes care of the starting/stopping process.

Yield example
# Create your stream\n@stream_engine.stream(\"local--kstream\")\nasync def stream(cr: ConsumerRecord, stream: Stream):\n    yield cr.value\n\n\n# Consume the stream:\nasync with stream as stream_flow:  # Use the context manager\n    async for value in stream_flow:\n        ...\n        # do something with value (cr.value)\n

Note

If for some reason you interrupt the "async for in" in the async generator, the Stream will stop consuming events, meaning that the lag will increase.

Note

Yield from a stream only works with the typing approach

"},{"location":"stream/#get-many","title":"Get many","text":"

Get a batch of events from the assigned TopicPartition.

Prefetched events are returned in batches by topic-partition. If messages are not available in the prefetched buffer, this method waits timeout_ms milliseconds.

Attributes:

Name Type Description partitions List[TopicPartition] | None

The partitions to fetch messages from. If no partition is specified, then all subscribed partitions will be used

timeout_ms int | None

milliseconds spent waiting if data is not available in the buffer. If 0, returns immediately with any records that are available currently in the buffer, else returns empty. Must not be negative.

max_records int | None

The amount of records to fetch. If timeout_ms is defined and reached, and the fetched records have not reached max_records, it returns immediately with any records that are currently available in the buffer

Returns:

Type Description Dict[TopicPartition, List[ConsumerRecord]]

Topic to list of records

Example

@stream_engine.stream(topic, ...)\nasync def stream(stream: Stream):\n    while True:\n        data = await stream.getmany(max_records=5)\n        print(data)\n
Source code in kstreams/streams.py
async def getmany(\n    self,\n    partitions: typing.Optional[typing.List[TopicPartition]] = None,\n    timeout_ms: int = 0,\n    max_records: typing.Optional[int] = None,\n) -> typing.Dict[TopicPartition, typing.List[ConsumerRecord]]:\n    \"\"\"\n    Get a batch of events from the assigned TopicPartition.\n\n    Prefetched events are returned in batches by topic-partition.\n    If messages is not available in the prefetched buffer this method waits\n    `timeout_ms` milliseconds.\n\n    Attributes:\n        partitions List[TopicPartition] | None: The partitions that need\n            fetching message. If no one partition specified then all\n            subscribed partitions will be used\n        timeout_ms int | None: milliseconds spent waiting if\n            data is not available in the buffer. If 0, returns immediately\n            with any records that are available currently in the buffer,\n            else returns empty. Must not be negative.\n        max_records int | None: The amount of records to fetch.\n            if `timeout_ms` was defined and reached and the fetched records\n            has not reach `max_records` then returns immediately\n            with any records that are available currently in the buffer\n\n    Returns:\n        Topic to list of records\n\n    !!! Example\n        ```python\n        @stream_engine.stream(topic, ...)\n        async def stream(stream: Stream):\n            while True:\n                data = await stream.getmany(max_records=5)\n                print(data)\n        ```\n    \"\"\"\n    partitions = partitions or []\n    return await self.consumer.getmany(  # type: ignore\n        *partitions, timeout_ms=timeout_ms, max_records=max_records\n    )\n

Warning

This approach does not work with Dependency Injection.

"},{"location":"stream/#rebalance-listener","title":"Rebalance Listener","text":"

In some cases you will need a RebalanceListener so that different actions can be performed when partitions are assigned to or revoked from the stream.

"},{"location":"stream/#use-cases","title":"Use cases","text":"
  • Cleanup or custom state save on the start of a rebalance operation
  • Saving offsets in a custom store when a partition is revoked
  • Load a state or cache warmup on completion of a successful partition re-assignment.
"},{"location":"stream/#metrics-rebalance-listener","title":"Metrics Rebalance Listener","text":"

Kstreams uses a default listener for all the streams to clean up the metrics after a rebalance takes place.

"},{"location":"stream/#kstreams.MetricsRebalanceListener","title":"kstreams.MetricsRebalanceListener","text":"Source code in kstreams/rebalance_listener.py
class MetricsRebalanceListener(RebalanceListener):\n    async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n        \"\"\"\n        Coroutine to be called *before* a rebalance operation starts and\n        *after* the consumer stops fetching data.\n\n        This will method will clean up the `Prometheus` metrics\n\n        Attributes:\n            revoked Set[TopicPartitions]: Partitions that were assigned\n                to the consumer on the last rebalance\n        \"\"\"\n        # lock all asyncio Tasks so no new metrics will be added to the Monitor\n        if revoked and self.engine is not None:\n            async with asyncio.Lock():\n                if self.stream is not None and self.stream.consumer is not None:\n                    self.engine.monitor.clean_stream_consumer_metrics(\n                        self.stream.consumer\n                    )\n\n    async def on_partitions_assigned(\n        self, assigned: typing.Set[TopicPartition]\n    ) -> None:\n        \"\"\"\n        Coroutine to be called *after* partition re-assignment completes\n        and *before* the consumer starts fetching data again.\n\n        This method will start the `Prometheus` metrics\n\n        Attributes:\n            assigned Set[TopicPartition]: Partitions assigned to the\n                consumer (may include partitions that were previously assigned)\n        \"\"\"\n        # lock all asyncio Tasks so no new metrics will be added to the Monitor\n        if assigned and self.engine is not None:\n            async with asyncio.Lock():\n                if self.stream is not None:\n                    self.stream.seek_to_initial_offsets()\n
"},{"location":"stream/#kstreams.MetricsRebalanceListener.on_partitions_assigned","title":"on_partitions_assigned(assigned) async","text":"

Coroutine to be called after partition re-assignment completes and before the consumer starts fetching data again.

This method will start the Prometheus metrics

Attributes:

Name Type Description assigned Set[TopicPartition]

Partitions assigned to the consumer (may include partitions that were previously assigned)

Source code in kstreams/rebalance_listener.py
async def on_partitions_assigned(\n    self, assigned: typing.Set[TopicPartition]\n) -> None:\n    \"\"\"\n    Coroutine to be called *after* partition re-assignment completes\n    and *before* the consumer starts fetching data again.\n\n    This method will start the `Prometheus` metrics\n\n    Attributes:\n        assigned Set[TopicPartition]: Partitions assigned to the\n            consumer (may include partitions that were previously assigned)\n    \"\"\"\n    # lock all asyncio Tasks so no new metrics will be added to the Monitor\n    if assigned and self.engine is not None:\n        async with asyncio.Lock():\n            if self.stream is not None:\n                self.stream.seek_to_initial_offsets()\n
"},{"location":"stream/#kstreams.MetricsRebalanceListener.on_partitions_revoked","title":"on_partitions_revoked(revoked) async","text":"

Coroutine to be called before a rebalance operation starts and after the consumer stops fetching data.

This method will clean up the Prometheus metrics

Attributes:

Name Type Description revoked Set[TopicPartitions]

Partitions that were assigned to the consumer on the last rebalance

Source code in kstreams/rebalance_listener.py
async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n    \"\"\"\n    Coroutine to be called *before* a rebalance operation starts and\n    *after* the consumer stops fetching data.\n\n    This will method will clean up the `Prometheus` metrics\n\n    Attributes:\n        revoked Set[TopicPartitions]: Partitions that were assigned\n            to the consumer on the last rebalance\n    \"\"\"\n    # lock all asyncio Tasks so no new metrics will be added to the Monitor\n    if revoked and self.engine is not None:\n        async with asyncio.Lock():\n            if self.stream is not None and self.stream.consumer is not None:\n                self.engine.monitor.clean_stream_consumer_metrics(\n                    self.stream.consumer\n                )\n
"},{"location":"stream/#manual-commit_1","title":"Manual Commit","text":"

If manual commit is enabled, you might want to use the ManualCommitRebalanceListener. This rebalance listener will call commit before the stream partitions are revoked, to avoid the error CommitFailedError and duplicate message delivery after a rebalance. See the code example with manual commit; a minimal usage sketch is also shown below.
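
A minimal sketch of attaching the listener to a stream with manual commit enabled (the topic and group names here are assumptions):

from kstreams import ConsumerRecord, ManualCommitRebalanceListener, Stream\n\n\n@stream_engine.stream(\n    \"local--kstream\",\n    group_id=\"my-group\",\n    enable_auto_commit=False,\n    rebalance_listener=ManualCommitRebalanceListener(),\n)\nasync def stream(cr: ConsumerRecord, stream: Stream):\n    print(cr.value)\n    await stream.commit()\n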

Note

ManualCommitRebalanceListener also includes the MetricsRebalanceListener functionality.

"},{"location":"stream/#kstreams.ManualCommitRebalanceListener","title":"kstreams.ManualCommitRebalanceListener","text":"Source code in kstreams/rebalance_listener.py
class ManualCommitRebalanceListener(MetricsRebalanceListener):\n    async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n        \"\"\"\n        Coroutine to be called *before* a rebalance operation starts and\n        *after* the consumer stops fetching data.\n\n        If manual commit is enabled, `commit` is called before the consumers\n        partitions are revoked to prevent the error `CommitFailedError`\n        and duplicate message delivery after a rebalance.\n\n        Attributes:\n            revoked Set[TopicPartitions]: Partitions that were assigned\n                to the consumer on the last rebalance\n        \"\"\"\n        if (\n            revoked\n            and self.stream is not None\n            and self.stream.consumer is not None\n            and not self.stream.consumer._enable_auto_commit\n        ):\n            logger.info(\n                f\"Manual commit enabled for stream {self.stream}. \"\n                \"Performing `commit` before revoking partitions\"\n            )\n            async with asyncio.Lock():\n                await self.stream.commit()\n\n            await super().on_partitions_revoked(revoked=revoked)\n
"},{"location":"stream/#kstreams.ManualCommitRebalanceListener.on_partitions_revoked","title":"on_partitions_revoked(revoked) async","text":"

Coroutine to be called before a rebalance operation starts and after the consumer stops fetching data.

If manual commit is enabled, commit is called before the consumers partitions are revoked to prevent the error CommitFailedError and duplicate message delivery after a rebalance.

Attributes:

Name Type Description revoked Set[TopicPartitions]

Partitions that were assigned to the consumer on the last rebalance

Source code in kstreams/rebalance_listener.py
async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n    \"\"\"\n    Coroutine to be called *before* a rebalance operation starts and\n    *after* the consumer stops fetching data.\n\n    If manual commit is enabled, `commit` is called before the consumers\n    partitions are revoked to prevent the error `CommitFailedError`\n    and duplicate message delivery after a rebalance.\n\n    Attributes:\n        revoked Set[TopicPartitions]: Partitions that were assigned\n            to the consumer on the last rebalance\n    \"\"\"\n    if (\n        revoked\n        and self.stream is not None\n        and self.stream.consumer is not None\n        and not self.stream.consumer._enable_auto_commit\n    ):\n        logger.info(\n            f\"Manual commit enabled for stream {self.stream}. \"\n            \"Performing `commit` before revoking partitions\"\n        )\n        async with asyncio.Lock():\n            await self.stream.commit()\n\n        await super().on_partitions_revoked(revoked=revoked)\n
"},{"location":"stream/#custom-rebalance-listener","title":"Custom Rebalance Listener","text":"

If you want to define a custom RebalanceListener, it has to inherit from kstreams.RebalanceListener.

Note

It is also possible to inherit from ManualCommitRebalanceListener and MetricsRebalanceListener

"},{"location":"stream/#kstreams.RebalanceListener","title":"kstreams.RebalanceListener","text":"

A callback interface that the user can implement to trigger custom actions when the set of partitions is assigned to or revoked from the Stream.

Example

from kstreams import RebalanceListener, TopicPartition\nfrom .resource import stream_engine\n\n\nclass MyRebalanceListener(RebalanceListener):\n\n    async def on_partitions_revoked(\n        self, revoked: Set[TopicPartition]\n    ) -> None:\n        # Do something with the revoked partitions\n        # or with the Stream\n        print(self.stream)\n\n    async def on_partitions_assigned(\n        self, assigned: Set[TopicPartition]\n    ) -> None:\n        # Do something with the assigned partitions\n        # or with the Stream\n        print(self.stream)\n\n\n@stream_engine.stream(topic, rebalance_listener=MyRebalanceListener())\nasync def my_stream(stream: Stream):\n    async for event in stream:\n        ...\n
Source code in kstreams/rebalance_listener.py
class RebalanceListener(ConsumerRebalanceListener):\n    \"\"\"\n    A callback interface that the user can implement to trigger custom actions\n    when the set of partitions are assigned or revoked to the `Stream`.\n\n    !!! Example\n        ```python\n        from kstreams import RebalanceListener, TopicPartition\n        from .resource import stream_engine\n\n\n        class MyRebalanceListener(RebalanceListener):\n\n            async def on_partitions_revoked(\n                self, revoked: Set[TopicPartition]\n            ) -> None:\n                # Do something with the revoked partitions\n                # or with the Stream\n                print(self.stream)\n\n            async def on_partitions_assigned(\n                self, assigned: Set[TopicPartition]\n            ) -> None:\n                # Do something with the assigned partitions\n                # or with the Stream\n                print(self.stream)\n\n\n        @stream_engine.stream(topic, rebalance_listener=MyRebalanceListener())\n        async def my_stream(stream: Stream):\n            async for event in stream:\n                ...\n        ```\n    \"\"\"\n\n    def __init__(self) -> None:\n        self.stream: typing.Optional[\"Stream\"] = None\n        # engine added so it can react on rebalance events\n        self.engine: typing.Optional[\"StreamEngine\"] = None\n\n    async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n        \"\"\"\n        Coroutine to be called *before* a rebalance operation starts and\n        *after* the consumer stops fetching data.\n\n        If you are using manual commit you have to commit all consumed offsets\n        here, to avoid duplicate message delivery after rebalance is finished.\n\n        Use cases:\n            - cleanup or custom state save on the start of a rebalance operation\n            - saving offsets in a custom store\n\n        Attributes:\n            revoked Set[TopicPartitions]: Partitions that were assigned\n                to the consumer on the last rebalance\n\n        !!! note\n            The `Stream` is available using `self.stream`\n        \"\"\"\n        ...  # pragma: no cover\n\n    async def on_partitions_assigned(\n        self, assigned: typing.Set[TopicPartition]\n    ) -> None:\n        \"\"\"\n        Coroutine to be called *after* partition re-assignment completes\n        and *before* the consumer starts fetching data again.\n\n        It is guaranteed that all the processes in a consumer group will\n        execute their `on_partitions_revoked` callback before any instance\n        executes its `on_partitions_assigned` callback.\n\n        Use cases:\n            - Load a state or cache warmup on completion of a successful\n            partition re-assignment.\n\n        Attributes:\n            assigned Set[TopicPartition]: Partitions assigned to the\n                consumer (may include partitions that were previously assigned)\n\n        !!! note\n            The `Stream` is available using `self.stream`\n        \"\"\"\n        ...  # pragma: no cover\n
"},{"location":"stream/#kstreams.RebalanceListener.on_partitions_assigned","title":"on_partitions_assigned(assigned) async","text":"

Coroutine to be called after partition re-assignment completes and before the consumer starts fetching data again.

It is guaranteed that all the processes in a consumer group will execute their on_partitions_revoked callback before any instance executes its on_partitions_assigned callback.

Use cases
  • Load a state or cache warmup on completion of a successful partition re-assignment.

Attributes:

Name Type Description assigned Set[TopicPartition]

Partitions assigned to the consumer (may include partitions that were previously assigned)

Note

The Stream is available using self.stream

Source code in kstreams/rebalance_listener.py
async def on_partitions_assigned(\n    self, assigned: typing.Set[TopicPartition]\n) -> None:\n    \"\"\"\n    Coroutine to be called *after* partition re-assignment completes\n    and *before* the consumer starts fetching data again.\n\n    It is guaranteed that all the processes in a consumer group will\n    execute their `on_partitions_revoked` callback before any instance\n    executes its `on_partitions_assigned` callback.\n\n    Use cases:\n        - Load a state or cache warmup on completion of a successful\n        partition re-assignment.\n\n    Attributes:\n        assigned Set[TopicPartition]: Partitions assigned to the\n            consumer (may include partitions that were previously assigned)\n\n    !!! note\n        The `Stream` is available using `self.stream`\n    \"\"\"\n    ...  # pragma: no cover\n
"},{"location":"stream/#kstreams.RebalanceListener.on_partitions_revoked","title":"on_partitions_revoked(revoked) async","text":"

Coroutine to be called before a rebalance operation starts and after the consumer stops fetching data.

If you are using manual commit you have to commit all consumed offsets here, to avoid duplicate message delivery after rebalance is finished.

Use cases
  • cleanup or custom state save on the start of a rebalance operation
  • saving offsets in a custom store

Attributes:

Name Type Description revoked Set[TopicPartitions]

Partitions that were assigned to the consumer on the last rebalance

Note

The Stream is available using self.stream

Source code in kstreams/rebalance_listener.py
async def on_partitions_revoked(self, revoked: typing.Set[TopicPartition]) -> None:\n    \"\"\"\n    Coroutine to be called *before* a rebalance operation starts and\n    *after* the consumer stops fetching data.\n\n    If you are using manual commit you have to commit all consumed offsets\n    here, to avoid duplicate message delivery after rebalance is finished.\n\n    Use cases:\n        - cleanup or custom state save on the start of a rebalance operation\n        - saving offsets in a custom store\n\n    Attributes:\n        revoked Set[TopicPartitions]: Partitions that were assigned\n            to the consumer on the last rebalance\n\n    !!! note\n        The `Stream` is available using `self.stream`\n    \"\"\"\n    ...  # pragma: no cover\n
"},{"location":"test_client/","title":"Testing","text":"

To test streams and producers or perform e2e tests you can make use of the test_utils.TestStreamClient.

The TestStreamClient aims to emulate the Kafka behaviour as much as possible using asyncio.Queue. This is excellent because you can test your code quite easily without spinning up Kafka, but it comes with some limitations. It is not possible to know beforehand how many topics exist, how many partitions per topic exist, the replication factor, current offsets, etc. So, the test client will create topics, partitions, assignments, etc. at runtime. Each Stream in your application will be assigned 3 partitions per topic by default (0, 1 and 2) in the test environment.

With the test client you can:

  • Send events so you won't need to mock the producer
  • Call the consumer code, then the client will make sure that all the events are consumed before leaving the async context
"},{"location":"test_client/#using-teststreamclient","title":"Using TestStreamClient","text":"

Import TestStreamClient.

Create a TestStreamClient by passing the engine instance to it.

Create functions with a name that starts with test_ (this is standard pytest convention).

Use the TestStreamClient object the same way as you do with the engine.

Write simple assert statements with the standard Python expressions that you need to check (again, standard pytest).

"},{"location":"test_client/#example","title":"Example","text":"

Let's assume that you have the following code example. The goal is to store all the consumed events in an EventStore for future analysis.

# example.py\nimport aiorun\nimport asyncio\nimport typing\nfrom dataclasses import dataclass, field\n\nfrom kstreams import ConsumerRecord, create_engine\nfrom kstreams.streams import Stream\n\ntopic = \"local--kstreams\"\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@dataclass\nclass EventStore:\n    \"\"\"\n    Store events in memory\n    \"\"\"\n    events: typing.List[ConsumerRecord] = field(default_factory=list)\n\n    def add(self, event: ConsumerRecord) -> None:\n        self.events.append(event)\n\n    @property\n    def total(self):\n        return len(self.events)\n\n\nevent_store = EventStore()\n\n\n@stream_engine.stream(topic, group_id=\"example-group\")\nasync def consume(cr: ConsumerRecord):\n    event_store.add(cr)\n\n\nasync def produce():\n    payload = b'{\"message\": \"Hello world!\"}'\n\n    for _ in range(5):\n        await stream_engine.send(topic, value=payload, key=\"1\")\n        await asyncio.sleep(2)\n\n\nasync def start():\n    await stream_engine.start()\n    await produce()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\ndef main():\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n

Then you could have a test_stream.py file to test the code. You need to instantiate the TestStreamClient with the engine:

# test_stream.py\nimport pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom example import stream_engine, event_store\n\nclient = TestStreamClient(stream_engine)\n\n\n@pytest.mark.asyncio\nasync def test_add_event_on_consume():\n    \"\"\"\n    Produce some events and check that the EventStore is updated.\n    \"\"\"\n    topic = \"local--kstreams\"  # Use the same topic as the stream\n    event = b'{\"message\": \"Hello world!\"}'\n\n    async with client:\n        metadata = await client.send(topic, value=event, key=\"1\")  # send the event with the test client\n        current_offset = metadata.offset\n        assert metadata.topic == topic\n\n        # send another event and check that the offset was incremented\n        metadata = await client.send(topic, value=b'{\"message\": \"Hello world!\"}', key=\"1\")\n        assert metadata.offset == current_offset + 1\n\n    # check that the event_store has 2 events stored\n    assert event_store.total == 2\n

Note

Notice that the produce coroutine is not used to send events in the test case. The TestStreamClient.send coroutine is used instead. This allows you to test streams without having producer code in your application.

"},{"location":"test_client/#testing-the-commit","title":"Testing the Commit","text":"

In some cases your stream will commit; in this situation, checking the committed offsets can be useful.

import pytest\nfrom kstreams.test_utils import TestStreamClient\nfrom kstreams import ConsumerRecord, Stream, TopicPartition\n\nfrom .example import produce, stream_engine\n\ntopic_name = \"local--kstreams-marcos\"\nvalue = b'{\"message\": \"Hello world!\"}'\nname = \"my-stream\"\nkey = \"1\"\npartition = 2\ntp = TopicPartition(\n    topic=topic_name,\n    partition=partition,\n)\ntotal_events = 10\n\n@stream_engine.stream(topic_name, name=name)\nasync def my_stream(cr: ConsumerRecord, stream: Stream):\n    # commit every time that an event arrives\n    await stream.commit({tp: cr.offset})\n\n\n# test the code\nclient = TestStreamClient(stream_engine)\n\n@pytest.mark.asyncio\nasync def test_consumer_commit():\n    async with client:\n        for _ in range(0, total_events):\n            await client.send(topic_name, partition=partition, value=value, key=key)\n\n        # check that everything was committed\n        stream = stream_engine.get_stream(name)\n        assert (await stream.committed(tp)) == total_events\n
"},{"location":"test_client/#e2e-test","title":"E2E test","text":"

In the previous code example the application produces to and consumes from the same topic, so TestStreamClient.send is not needed because engine.send is producing. For those situations you can just use your producer code and check that certain code was called.

# test_example.py\nimport pytest\nfrom unittest.mock import patch\n\nfrom kstreams.test_utils import TestStreamClient\n\nfrom .example import produce, stream_engine\n\nclient = TestStreamClient(stream_engine)\n\n\n@pytest.mark.asyncio\nasync def test_e2e_example():\n    \"\"\"\n    Test that events are produced by the engine and consumed by the streams\n    \"\"\"\n    with patch(\"example.on_consume\") as on_consume, patch(\"example.on_produce\") as on_produce:\n        async with client:\n            await produce()\n\n    assert on_produce.call_count == 5\n    assert on_consume.call_count == 5\n
"},{"location":"test_client/#producer-only","title":"Producer only","text":"

In some scenarios, your application will only produce events and other applications will consume them, but you want to make sure that the event was produced in a proper way and that the topic contains that event.

# producer_example.py\nfrom kstreams import create_engine\nimport aiorun\nimport asyncio\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\nasync def produce(topic: str, value: bytes, key: str):\n    # This could be a complicated function or something like a FastAPI view\n    await stream_engine.send(topic, value=value, key=key)\n\n\nasync def start():\n    await stream_engine.start()\n\n\nasync def shutdown(loop):\n    await stream_engine.stop()\n\n\ndef main():\n    aiorun.run(start(), stop_on_unhandled_errors=True, shutdown_callback=shutdown)\n

Then you could have a test_producer_example.py file to test the code:

# test_producer_example.py\nimport pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom producer_example import stream_engine, produce\n\nclient = TestStreamClient(stream_engine)\n\n\n@pytest.mark.asyncio\nasync def test_event_produced():\n    topic_name = \"local--kstreams\"\n    value = b'{\"message\": \"Hello world!\"}'\n    key = \"1\"\n\n    async with client:\n        await produce(topic=topic_name, value=value, key=key)  # use the produce code to send events\n\n        # check that the event was placed in a topic in a proper way\n        consumer_record = await client.get_event(topic_name=topic_name)\n\n        assert consumer_record.value == value\n        assert consumer_record.key == key\n

Note

Even though the previous example is using a simple produce function, it shows what to do when the producer code is encapsulated in other functions, for example a FastAPI view. In that case you don't want to use client.send directly; just call the function that contains stream_engine.send(...).

"},{"location":"test_client/#defining-extra-topics","title":"Defining extra topics","text":"

For some use cases it is required to produce an event to a topic (the target topic) after an event was consumed from another topic (the source topic). We are in control of the source topic because it has a stream associated with it and we want to consume events from it; however, we might not be in control of the target topic.

How can we consume an event from the target topic, which has no stream associated with it and will be created only when a send is reached? The answer is to predefine the extra topics before the test cycle has started. Let's take a look at an example:

Let's imagine that we have the following code:

from kstreams import ConsumerRecord\n\nfrom .engine import stream_engine\n\n\n@stream_engine.stream(\"source-topic\")\nasync def consume(cr: ConsumerRecord) -> None:\n    # do something, for example save to db\n    await save_to_db(cr)\n\n    # then produce the event to the `target topic`\n    await stream_engine.send(\"target-topic\", value=cr.value, key=cr.key, headers=cr.headers)\n

Here we can test two things:

  1. Sending an event to the source-topic and checking that the event has been consumed and saved to the DB
  2. Checking that the event was sent to the target-topic

Testing point 1 is straightforward:

import pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom .engine import stream_engine\n\n\nclient = TestStreamClient(stream_engine)\nvalue = b'{\"message\": \"Hello world!\"}'\nkey = \"my-key\"\n\nasync with client:\n    # produce to the topic that has a stream\n    await client.send(\"source-topic\", value=value, key=key)\n\n    # check that the event was saved to the DB\n    assert await db.get(...)\n

However, testing point 2 takes more effort, as the TestStreamClient is not aware of the target topic until it reaches the send inside the consume coroutine. If we try to get the target-topic event inside the async with context, we will get an error:

async with client:\n    # produce to the topic that has a stream\n    await client.send(\"source-topic\", value=value, key=key)\n\n    ...\n    # Let's check if it was received by the target topic\n    event = await client.get_event(topic_name=\"target-topic\")\n\n\nValueError: You might be trying to get the topic target-topic outside the `client async context` or trying to get an event from an empty topic target-topic. Make sure that the code is inside the async contextand the topic has events.\n

We could solve this with a delay (await asyncio.sleep(...)) inside the async with context to give the TestStreamClient time to create the topic; however, if the business logic inside consume is slow we need to add more delay, and it becomes a race condition.

To solve it properly, we can tell the TestStreamClient which extra topics we need during the test cycle.

import pytest\nfrom kstreams.test_utils import TestStreamClient\n\nfrom .engine import stream_engine\n\n\n# tell the client to create the extra topics\nclient = TestStreamClient(stream_engine, topics=[\"target-topic\"])\nvalue = b'{\"message\": \"Hello world!\"}'\nkey = \"my-key\"\n\nasync with client:\n    # produce to the topic that has a stream\n    await client.send(\"source-topic\", value=value, key=key)\n\n    # check that the event was saved to the DB\n    assert await db.get(...)\n\n    # Let's check if it was received by the target topic\n    event = await client.get_event(topic_name=\"target-topic\")\n    assert event.value == value\n    assert event.key == key\n
"},{"location":"test_client/#topics-subscribed-by-pattern","title":"Topics subscribed by pattern","text":"

When a Stream is using pattern subscription it is not possible to know beforehand how many topics the Stream will consume from. To solve this problem, the topics must be predefined using the extra topics feature of the TestStreamClient:

In the following example we have a Stream that will consume from topics that match the regular expression ^dev--customer-.*$, for example dev--customer-invoice and dev--customer-profile.

# app.py\nfrom kstreams import ConsumerRecord, create_engine\n\nstream_engine = create_engine(title=\"my-stream-engine\")\n\n\n@stream_engine.stream(topics=\"^dev--customer-.*$\", subscribe_by_pattern=True)\nasync def stream(cr: ConsumerRecord):\n    if cr.topic == customer_invoice_topic:\n        assert cr.value == invoice_event\n    elif cr.topic == customer_profile_topic:\n        assert cr.value == profile_event\n    else:\n        raise ValueError(f\"Invalid topic {cr.topic}\")\n

Then, to test our Stream, we need to predefine the topics:

# test_stream.py\nimport asyncio\n\nimport pytest\nfrom kstreams.test_utils import TestStreamClient, TopicManager  # assuming TopicManager is exported here\n\nfrom app import stream_engine\n\n\n@pytest.mark.asyncio\nasync def test_consume_events_topics_by_pattern():\n    \"\"\"\n    This test shows the possibility to subscribe to multiple topics using a pattern\n    \"\"\"\n    customer_invoice_topic = \"dev--customer-invoice\"\n    customer_profile_topic = \"dev--customer-profile\"\n\n    client = TestStreamClient(\n        stream_engine, topics=[customer_invoice_topic, customer_profile_topic]\n    )\n\n    async with client:\n        await client.send(customer_invoice_topic, value=b\"invoice-1\", key=\"1\")\n        await client.send(customer_profile_topic, value=b\"profile-1\", key=\"1\")\n\n        # give some time to consume all the events\n        await asyncio.sleep(0.1)\n        assert TopicManager.all_messages_consumed()\n
"},{"location":"test_client/#disabling-monitoring-during-testing","title":"Disabling monitoring during testing","text":"

Monitoring streams and producers is vital for streaming applications, but it requires extra effort. Sometimes during testing, monitoring is not required, as we only want to focus on testing the business logic. In order to disable monitoring during testing, use:

client = TestStreamClient(stream_engine, monitoring_enabled=False)\n
"},{"location":"utils/","title":"Utils","text":"

Utility functions

"},{"location":"utils/#kstreams.utils","title":"kstreams.utils","text":""},{"location":"utils/#kstreams.utils.create_ssl_context","title":"create_ssl_context(*, cafile=None, capath=None, cadata=None, certfile=None, keyfile=None, password=None, crlfile=None)","text":"

Wrapper of aiokafka.helpers.create_ssl_context with typehints.

Parameters:

Name Type Description Default cafile Optional[str]

Certificate Authority file path containing certificates used to sign broker certificates

None capath Optional[str]

Same as cafile, but points to a directory containing several CA certificates

None cadata Union[str, bytes, None]

Same as cafile, but instead contains already read data in either ASCII or bytes format

None certfile Optional[str]

optional filename of file in PEM format containing the client certificate, as well as any CA certificates needed to establish the certificate's authenticity

None keyfile Optional[str]

optional filename containing the client private key.

None password Optional[str]

optional password to be used when loading the certificate chain

None Source code in kstreams/utils.py
def create_ssl_context(\n    *,\n    cafile: Optional[str] = None,\n    capath: Optional[str] = None,\n    cadata: Union[str, bytes, None] = None,\n    certfile: Optional[str] = None,\n    keyfile: Optional[str] = None,\n    password: Optional[str] = None,\n    crlfile: Any = None,\n):\n    \"\"\"Wrapper of [aiokafka.helpers.create_ssl_context](\n        https://aiokafka.readthedocs.io/en/stable/api.html#helpers\n    )\n    with typehints.\n\n    Arguments:\n        cafile: Certificate Authority file path containing certificates\n            used to sign broker certificates\n        capath: Same as `cafile`, but points to a directory containing\n            several CA certificates\n        cadata: Same as `cafile`, but instead contains already\n            read data in either ASCII or bytes format\n        certfile: optional filename of file in PEM format containing\n            the client certificate, as well as any CA certificates needed to\n            establish the certificate's authenticity\n        keyfile: optional filename containing the client private key.\n        password: optional password to be used when loading the\n            certificate chain\n\n    \"\"\"\n    return aiokafka_create_ssl_context(\n        cafile=cafile,\n        capath=capath,\n        cadata=cadata,\n        certfile=certfile,\n        keyfile=keyfile,\n        password=password,\n        crlfile=crlfile,\n    )\n
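
A usage sketch (the certificate file paths are assumptions):

from kstreams.utils import create_ssl_context\n\nssl_context = create_ssl_context(\n    cafile=\"./certs/ca.pem\",\n    certfile=\"./certs/client.crt\",\n    keyfile=\"./certs/client.key\",\n)\n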
"},{"location":"utils/#kstreams.utils.create_ssl_context_from_mem","title":"create_ssl_context_from_mem(*, certdata, keydata, password=None, cadata=None)","text":"

Create an SSL context from data in memory.

This makes it easy to read the certificates from environment variables. Usually the data is loaded from env variables.

Parameters:

Name Type Description Default cadata Optional[str]

certificates used to sign broker certificates provided as unicode str

None certdata str

the client certificate, as well as any CA certificates needed to establish the certificate's authenticity provided as unicode str

required keydata str

the client private key provided as unicode str

required password Optional[str]

optional password to be used when loading the certificate chain

None Source code in kstreams/utils.py
def create_ssl_context_from_mem(\n    *,\n    certdata: str,\n    keydata: str,\n    password: Optional[str] = None,\n    cadata: Optional[str] = None,\n) -> Optional[ssl.SSLContext]:\n    \"\"\"Create a SSL context from data on memory.\n\n    This makes it easy to read the certificates from environmental variables\n    Usually the data is loaded from env variables.\n\n    Arguments:\n        cadata: certificates used to sign broker certificates provided as unicode str\n        certdata: the client certificate, as well as any CA certificates needed to\n            establish the certificate's authenticity provided as unicode str\n        keydata: the client private key provided as unicode str\n        password: optional password to be used when loading the\n            certificate chain\n    \"\"\"\n    with contextlib.ExitStack() as stack:\n        cert_file = stack.enter_context(NamedTemporaryFile(suffix=\".crt\"))\n        key_file = stack.enter_context(NamedTemporaryFile(suffix=\".key\"))\n\n        # expecting unicode data, writing it as bytes to files as utf-8\n        cert_file.write(certdata.encode(\"utf-8\"))\n        cert_file.flush()\n\n        key_file.write(keydata.encode(\"utf-8\"))\n        key_file.flush()\n\n        ssl_context = ssl.create_default_context(cadata=cadata)\n        ssl_context.load_cert_chain(\n            cert_file.name, keyfile=key_file.name, password=password\n        )\n        return ssl_context\n    return None\n
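
A usage sketch that reads the certificate data from environment variables (the variable names are assumptions):

import os\n\nfrom kstreams.utils import create_ssl_context_from_mem\n\nssl_context = create_ssl_context_from_mem(\n    certdata=os.environ[\"KAFKA_CLIENT_CERT\"],\n    keydata=os.environ[\"KAFKA_CLIENT_KEY\"],\n)\n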
"}]} \ No newline at end of file diff --git a/pr-preview/pr-248/stream/index.html b/pr-preview/pr-248/stream/index.html index a37fcabd..e2a4b0d2 100644 --- a/pr-preview/pr-248/stream/index.html +++ b/pr-preview/pr-248/stream/index.html @@ -1711,10 +1711,7 @@

Get many

Source code in kstreams/streams.py -
294
-295
-296
-297
+              
297
 298
 299
 300
@@ -1752,7 +1749,10 @@ 

Get many

332 333 334 -335
async def getmany(
+335
+336
+337
+338
async def getmany(
     self,
     partitions: typing.Optional[typing.List[TopicPartition]] = None,
     timeout_ms: int = 0,