Merge #32606
32606: kv: set sane default for kv.transaction.write_pipelining_max_batch_size r=nvanbenschoten a=nvanbenschoten

Informs #32522.

There is a tradeoff here between the overhead of synchronously waiting for consensus on a batch if we don't pipeline, and the cost of later proving that all of the writes in the batch succeeded if we do. We set this default to a value that experimentally strikes a balance between the two costs.
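To make the tradeoff concrete, here is a minimal sketch of how a batch-size cap like this gates the pipelining decision. This is not the actual `txnPipeliner` logic; the standalone shape and names are illustrative assumptions, but the zero-means-unlimited behavior matches the setting's description:

```go
package main

import "fmt"

// maxPipelinedBatchSize stands in for the current value of the
// kv.transaction.write_pipelining_max_batch_size cluster setting.
// Zero disables the cap, matching the setting's "if non-zero" description.
const maxPipelinedBatchSize = 128

// shouldPipeline reports whether a batch with numWrites writes should be
// pipelined (return before consensus and prove the writes later) or should
// instead wait synchronously for consensus.
func shouldPipeline(numWrites int) bool {
	if maxPipelinedBatchSize == 0 {
		return true
	}
	return numWrites <= maxPipelinedBatchSize
}

func main() {
	for _, n := range []int{8, 128, 256} {
		fmt.Printf("batch of %3d writes: pipeline=%v\n", n, shouldPipeline(n))
	}
}
```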

To determine the best value for this setting, I ran a three-node single-AZ AWS cluster with 4 vCPU nodes (`m5d.xlarge`). I modified KV to perform writes in an explicit txn and to run multiple statements. I then ran `kv0` with 8 DML statements per txn (a reasonable estimate for the average number of statements that an **explicit** txn runs) and adjusted the batch size of these statements from 1 to 256. This resulted in the following graph:

![image](https://user-images.githubusercontent.com/5438456/49038443-fc91e200-f18a-11e8-810d-1172821e63ea.png)

We can see that the cross-over point where txn pipelining stops being beneficial falls somewhere between batch sizes of 128 and 256 rows. Given this information, I set the default for `kv.transaction.write_pipelining_max_batch_size` to 128.
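For reference, here is a hedged sketch of the transaction shape the experiment used: an explicit transaction issuing 8 multi-row DML statements, with the per-statement row count swept upward. The driver, connection string, and `kv(k INT, v INT)` schema are illustrative assumptions, not the actual modified workload code:

```go
package main

import (
	"database/sql"
	"fmt"
	"log"
	"math/rand"
	"strings"

	_ "github.com/lib/pq" // assumed Postgres-wire driver for CockroachDB
)

const stmtsPerTxn = 8 // DML statements per explicit txn, as in the experiment

// runTxn issues one explicit transaction of stmtsPerTxn multi-row UPSERTs,
// each writing batchSize rows. An explicit txn sidesteps the 1PC fast-path,
// so each statement's writes are candidates for pipelining.
func runTxn(db *sql.DB, batchSize int) error {
	tx, err := db.Begin()
	if err != nil {
		return err
	}
	for s := 0; s < stmtsPerTxn; s++ {
		placeholders := make([]string, batchSize)
		args := make([]interface{}, 0, 2*batchSize)
		for i := 0; i < batchSize; i++ {
			placeholders[i] = fmt.Sprintf("($%d, $%d)", 2*i+1, 2*i+2)
			args = append(args, rand.Int63(), rand.Int63())
		}
		q := "UPSERT INTO kv (k, v) VALUES " + strings.Join(placeholders, ", ")
		if _, err := tx.Exec(q, args...); err != nil {
			_ = tx.Rollback()
			return err
		}
	}
	return tx.Commit()
}

func main() {
	// Assumed local connection string; adjust for the cluster under test.
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/kv?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	// Sweep batch sizes from 1 to 256, as in the experiment.
	for batch := 1; batch <= 256; batch *= 2 {
		if err := runTxn(db, batch); err != nil {
			log.Fatal(err)
		}
	}
}
```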

Of course, there are a lot of variables at play here: storage throughput, replication latency, node size, etc. I think the setup I used strikes a reasonable middle ground across them.

Release note: None

Co-authored-by: Nathan VanBenschoten <[email protected]>
craig[bot] and nvanbenschoten committed Nov 26, 2018
2 parents a528536 + 52242a7 commit 1066a87
Showing 3 changed files with 13 additions and 2 deletions.
2 changes: 1 addition & 1 deletion docs/generated/settings/settings.html
@@ -40,7 +40,7 @@
 <tr><td><code>kv.transaction.max_intents_bytes</code></td><td>integer</td><td><code>256000</code></td><td>maximum number of bytes used to track write intents in transactions</td></tr>
 <tr><td><code>kv.transaction.max_refresh_spans_bytes</code></td><td>integer</td><td><code>256000</code></td><td>maximum number of bytes used to track refresh spans in serializable transactions</td></tr>
 <tr><td><code>kv.transaction.write_pipelining_enabled</code></td><td>boolean</td><td><code>true</code></td><td>if enabled, transactional writes are pipelined through Raft consensus</td></tr>
-<tr><td><code>kv.transaction.write_pipelining_max_batch_size</code></td><td>integer</td><td><code>0</code></td><td>if non-zero, defines that maximum size batch that will be pipelined through Raft consensus</td></tr>
+<tr><td><code>kv.transaction.write_pipelining_max_batch_size</code></td><td>integer</td><td><code>128</code></td><td>if non-zero, defines that maximum size batch that will be pipelined through Raft consensus</td></tr>
 <tr><td><code>rocksdb.min_wal_sync_interval</code></td><td>duration</td><td><code>0s</code></td><td>minimum duration between syncs of the RocksDB WAL</td></tr>
 <tr><td><code>schemachanger.lease.duration</code></td><td>duration</td><td><code>5m0s</code></td><td>the duration of a schema change lease</td></tr>
 <tr><td><code>schemachanger.lease.renew_fraction</code></td><td>float</td><td><code>0.4</code></td><td>the fraction of schemachanger.lease_duration remaining to trigger a renew of the lease</td></tr>
10 changes: 9 additions & 1 deletion pkg/kv/txn_interceptor_pipeliner.go
@@ -36,7 +36,15 @@ var pipelinedWritesEnabled = settings.RegisterBoolSetting(
 var pipelinedWritesMaxBatchSize = settings.RegisterNonNegativeIntSetting(
 	"kv.transaction.write_pipelining_max_batch_size",
 	"if non-zero, defines that maximum size batch that will be pipelined through Raft consensus",
-	0,
+	// NB: there is a tradeoff between the overhead of synchronously waiting for
+	// consensus for a batch if we don't pipeline and proving that all of the
+	// writes in the batch succeed if we do pipeline. We set this default to a
+	// value which experimentally strikes a balance between the two costs.
+	//
+	// Notably, this is well below sql.max{Insert/Update/Upsert/Delete}BatchSize,
+	// so implicit SQL txns should never pipeline their writes - they should either
+	// hit the 1PC fast-path or should have batches which exceed this limit.
+	128,
 )
 
 // txnPipeliner is a txnInterceptor that pipelines transactional writes by using
3 changes: 3 additions & 0 deletions pkg/kv/txn_interceptor_pipeliner_test.go
@@ -495,6 +495,9 @@ func TestTxnPipelinerManyWrites(t *testing.T) {
 	ctx := context.Background()
 	tp, mockSender := makeMockTxnPipeliner()
 
+	// Disable maxBatchSize limit.
+	pipelinedWritesMaxBatchSize.Override(&tp.st.SV, 0)
+
 	const writes = 2048
 	keyBuf := roachpb.Key(strings.Repeat("a", writes+1))
 	makeKey := func(i int) roachpb.Key { return keyBuf[:i+1] }
