From fc4fc68f71daf322911fc3ff5f75e4286bb0b42e Mon Sep 17 00:00:00 2001 From: Wenli Wan Date: Thu, 5 Dec 2024 17:32:50 +0800 Subject: [PATCH] feat: Prevent retry buffer in AsyncProducer to go OOM This commit adds an optional configuration to Sarama's retry mechanism to limit the size of the retry buffer. The change addresses issues #1358 and #1372 by preventing unbounded memory growth when retries are backlogged or brokers are unresponsive. Key updates: - Added `Producer.Retry.MaxBufferLength` configuration to control the maximum number of messages stored in the retry buffer. - Implemented logic to handle overflow scenarios, ensuring non-flagged messages are either retried or sent to the errors channel, while flagged messages are re-queued. This enhancement provides a safeguard against OOM errors in high-throughput or unstable environments while maintaining backward compatibility (unlimited buffer by default). Signed-off-by: Wenli Wan --- async_producer.go | 26 ++++++++++++++ async_producer_test.go | 82 ++++++++++++++++++++++++++++++++++++++++++ config.go | 7 ++++ 3 files changed, 115 insertions(+) diff --git a/async_producer.go b/async_producer.go index a6fa3d4a2..c4b29ad38 100644 --- a/async_producer.go +++ b/async_producer.go @@ -13,6 +13,14 @@ import ( "github.com/rcrowley/go-metrics" ) +// ErrProducerRetryBufferOverflow is returned when the bridging retry buffer is full and OOM prevention needs to be applied. +var ErrProducerRetryBufferOverflow = errors.New("retry buffer is full: message discarded to " + + "prevent buffer overflow. Check other errors and logs to determine the cause of excessive retries") + +// minFunctionalRetryBufferLength is the lower limit of Producer.Retry.MaxBufferLength for it to function. +// Any non-zero maxBufferLength but less than this lower limit is pushed to the lower limit. +const minFunctionalRetryBufferLength = 5000 + // AsyncProducer publishes Kafka messages using a non-blocking API. It routes messages // to the correct broker for the provided topic-partition, refreshing metadata as appropriate, // and parses responses for errors. You must read from the Errors() channel or the @@ -1207,6 +1215,11 @@ func (bp *brokerProducer) handleError(sent *produceSet, err error) { // effectively a "bridge" between the flushers and the dispatcher in order to avoid deadlock // based on https://godoc.org/github.com/eapache/channels#InfiniteChannel func (p *asyncProducer) retryHandler() { + maxBufferSize := p.conf.Producer.Retry.MaxBufferLength + if 0 < maxBufferSize && maxBufferSize < minFunctionalRetryBufferLength { + maxBufferSize = minFunctionalRetryBufferLength + } + var msg *ProducerMessage buf := queue.New() @@ -1227,6 +1240,19 @@ func (p *asyncProducer) retryHandler() { } buf.Add(msg) + + if maxBufferSize > 0 && buf.Length() >= maxBufferSize { + msgToHandle := buf.Peek().(*ProducerMessage) + if msgToHandle.flags == 0 { + select { + case p.input <- msgToHandle: + buf.Remove() + default: + buf.Remove() + p.returnError(msgToHandle, ErrProducerRetryBufferOverflow) + } + } + } } } diff --git a/async_producer_test.go b/async_producer_test.go index c192235cc..3cd5d97d6 100644 --- a/async_producer_test.go +++ b/async_producer_test.go @@ -2,6 +2,7 @@ package sarama import ( "errors" + "fmt" "log" "math" "os" @@ -2244,6 +2245,87 @@ func TestTxnCanAbort(t *testing.T) { require.NoError(t, err) } +func TestPreventRetryBufferOverflow(t *testing.T) { + leader := NewMockBroker(t, 1) + defer leader.Close() + topic := "test-topic" + + metadataRequestHandlerFunc := func(req *request) (res encoderWithHeader) { + r := new(MetadataResponse) + r.AddBroker(leader.Addr(), leader.BrokerID()) + r.AddTopicPartition(topic, 0, leader.BrokerID(), nil, nil, nil, ErrNoError) + return r + } + + produceRequestHandlerFunc := func(req *request) (res encoderWithHeader) { + r := new(ProduceResponse) + r.AddTopicPartition(topic, 0, ErrNotLeaderForPartition) + return r + } + + leader.SetHandlerFuncByMap(map[string]requestHandlerFunc{ + "ProduceRequest": produceRequestHandlerFunc, + "MetadataRequest": metadataRequestHandlerFunc, + }) + + config := NewTestConfig() + config.Producer.Flush.MaxMessages = 1 + config.Producer.Retry.MaxBufferLength = minFunctionalRetryBufferLength + config.Producer.Return.Successes = true + + producer, err := NewAsyncProducer([]string{leader.Addr()}, config) + if err != nil { + t.Fatal(err) + } + + var ( + wg sync.WaitGroup + successes, producerErrors int + errorFound bool + ) + + wg.Add(1) + go func() { + defer wg.Done() + for range producer.Successes() { + successes++ + } + }() + + wg.Add(1) + go func() { + defer wg.Done() + for errMsg := range producer.Errors() { + if errors.Is(errMsg.Err, ErrProducerRetryBufferOverflow) { + errorFound = true + } + producerErrors++ + } + }() + + numMessages := 100000 + for i := 0; i < numMessages; i++ { + kv := StringEncoder(fmt.Sprintf("%d", i)) + producer.Input() <- &ProducerMessage{ + Topic: topic, + Key: kv, + Value: kv, + Metadata: i, + } + } + + producer.AsyncClose() + wg.Wait() + + if successes+producerErrors < numMessages { + t.Error("Expected all messages to be processed") + } + + if !errorFound { + t.Error("Expected at least one error matching ErrProducerRetryBufferOverflow") + } +} + // This example shows how to use the producer while simultaneously // reading the Errors channel to know about any failures. func ExampleAsyncProducer_select() { diff --git a/config.go b/config.go index f2f197887..78d4758af 100644 --- a/config.go +++ b/config.go @@ -269,6 +269,13 @@ type Config struct { // more sophisticated backoff strategies. This takes precedence over // `Backoff` if set. BackoffFunc func(retries, maxRetries int) time.Duration + // The maximum length of the bridging buffer between `input` and `retries` channels + // in AsyncProducer#retryHandler. + // The limit is to prevent this buffer from overflowing or causing OOM. + // Defaults to 0 for unlimited. + // Any value between 0 and 5000 is pushed to 5000. + // A zero or negative value indicates unlimited. + MaxBufferLength int } // Interceptors to be called when the producer dispatcher reads the