From 51efbcd7cb559425c2aa9610a82a7898ddeb571d Mon Sep 17 00:00:00 2001 From: Abhishek Jha Date: Mon, 6 May 2024 14:50:45 -0700 Subject: [PATCH 01/15] Added tests for starting, stopping and purging daemon for replicationQueue --- common/domain/replication_queue_test.go | 116 ++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/common/domain/replication_queue_test.go b/common/domain/replication_queue_test.go index e1e1458dc7b..c5e9cce54e7 100644 --- a/common/domain/replication_queue_test.go +++ b/common/domain/replication_queue_test.go @@ -25,7 +25,10 @@ import ( "context" "encoding/binary" "errors" + "github.com/uber/cadence/common" + "sync/atomic" "testing" + "time" "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" @@ -39,6 +42,94 @@ const ( preambleVersion0 byte = 0x59 ) +func TestReplicationQueueImpl_Start(t *testing.T) { + tests := []struct { + name string + initialStatus int32 + expectedStatus int32 + shouldStart bool + }{ + { + name: "Should start when initialized", + initialStatus: common.DaemonStatusInitialized, + expectedStatus: common.DaemonStatusStarted, + shouldStart: true, + }, + { + name: "Should not start when already started", + initialStatus: common.DaemonStatusStarted, + expectedStatus: common.DaemonStatusStarted, + shouldStart: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockQueue := persistence.NewMockQueueManager(ctrl) + rq := NewReplicationQueue(mockQueue, "testCluster", nil, nil).(*replicationQueueImpl) + atomic.StoreInt32(&rq.status, tt.initialStatus) + + rq.Start() + defer rq.Stop() + assert.Equal(t, tt.expectedStatus, atomic.LoadInt32(&rq.status)) + + if tt.shouldStart { + time.Sleep(1 * time.Nanosecond) + select { + case <-rq.done: + t.Error("purgeProcessor should not have stopped") + default: + // expected no action + } + } + }) + } +} + +func TestReplicationQueueImpl_Stop(t *testing.T) { + tests := []struct { + name string + initialStatus int32 + expectedStatus int32 + shouldStop bool + }{ + { + name: "Should stop when started", + initialStatus: common.DaemonStatusStarted, + expectedStatus: common.DaemonStatusStopped, + shouldStop: true, + }, + { + name: "Should not stop when not started", + initialStatus: common.DaemonStatusInitialized, + expectedStatus: common.DaemonStatusInitialized, + shouldStop: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockQueue := persistence.NewMockQueueManager(ctrl) + rq := NewReplicationQueue(mockQueue, "testCluster", nil, nil).(*replicationQueueImpl) + atomic.StoreInt32(&rq.status, tt.initialStatus) + + rq.Stop() + assert.Equal(t, tt.expectedStatus, atomic.LoadInt32(&rq.status)) + + if tt.shouldStop { + select { + case <-rq.done: + // expected channel closed + default: + t.Error("done channel should be closed") + } + } + }) + } +} + func TestReplicationQueueImpl_Publish(t *testing.T) { tests := []struct { name string @@ -619,3 +710,28 @@ func TestPurgeAckedMessages(t *testing.T) { }) } } + +func TestReplicationQueueImpl_purgeProcessor(t *testing.T) { + ctrl := gomock.NewController(t) + mockQueue := persistence.NewMockQueueManager(ctrl) + rq := NewReplicationQueue(mockQueue, "testCluster", nil, nil).(*replicationQueueImpl) + atomic.StoreInt32(&rq.status, common.DaemonStatusStarted) + + done := make(chan bool) + mockQueue.EXPECT().GetAckLevels(gomock.Any()).Return(map[string]int64{}, nil).AnyTimes() + 
mockQueue.EXPECT().DeleteMessagesBefore(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() + + go func() { + rq.purgeProcessor() + close(done) + }() + + time.Sleep(1 * time.Nanosecond) + rq.Stop() + select { + case <-done: + // Pass if the goroutine exits + case <-time.After(1 * time.Millisecond): + t.Error("purgeProcessor did not stop within expected time") + } +} From 35173118478a0d4e27c8e71b0db808972367ee0d Mon Sep 17 00:00:00 2001 From: Abhishek Jha Date: Mon, 6 May 2024 14:53:10 -0700 Subject: [PATCH 02/15] formatting --- common/domain/replication_queue_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/domain/replication_queue_test.go b/common/domain/replication_queue_test.go index c5e9cce54e7..475fb0a1dcb 100644 --- a/common/domain/replication_queue_test.go +++ b/common/domain/replication_queue_test.go @@ -25,7 +25,6 @@ import ( "context" "encoding/binary" "errors" - "github.com/uber/cadence/common" "sync/atomic" "testing" "time" @@ -34,6 +33,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/uber/cadence/common" "github.com/uber/cadence/common/persistence" "github.com/uber/cadence/common/types" ) From d0e586c5dd69b139a59a9cd638ecd408527414c6 Mon Sep 17 00:00:00 2001 From: taylanisikdemir Date: Tue, 7 May 2024 11:26:57 -0700 Subject: [PATCH 03/15] Replace wurstmeister kafka/zookeeper images with bitnami kafka image (#5975) --- docker/buildkite/docker-compose-es7.yml | 32 +++++++++-------- .../docker-compose-local-async-wf.yml | 35 +++++++++--------- docker/buildkite/docker-compose-local-es7.yml | 32 ++++++++--------- docker/buildkite/docker-compose-local.yml | 36 +++++++++---------- .../buildkite/docker-compose-opensearch2.yml | 32 +++++++++-------- docker/buildkite/docker-compose.yml | 35 +++++++++--------- docker/dev/cassandra-esv7-kafka.yml | 25 +++++++------ docker/dev/cassandra-opensearch-kafka.yml | 25 +++++++------ docker/dev/cassandra-pinot-kafka.yml | 28 ++++++++------- docker/dev/mongo-esv7-kafka.yml | 25 +++++++------ docker/dev/mysql-esv7-kafka.yml | 25 +++++++------ docker/docker-compose-async-wf-kafka.yml | 28 ++++++++------- docker/docker-compose-es-v7.yml | 25 +++++++------ docker/docker-compose-es.yml | 25 +++++++------ ...r-compose-multiclusters-cass-mysql-es.yaml | 25 +++++++------ docker/docker-compose-multiclusters-es.yml | 25 +++++++------ docker/docker-compose-pinot.yml | 26 ++++++++------ 17 files changed, 270 insertions(+), 214 deletions(-) diff --git a/docker/buildkite/docker-compose-es7.yml b/docker/buildkite/docker-compose-es7.yml index bbf3703d02a..272b05bcaf4 100644 --- a/docker/buildkite/docker-compose-es7.yml +++ b/docker/buildkite/docker-compose-es7.yml @@ -16,25 +16,29 @@ services: timeout: 30s retries: 10 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - networks: - services-network: - aliases: - - zookeeper - kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka + ports: + - "9092:9092" + environment: + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - 
"KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" networks: services-network: aliases: - kafka - environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.9.3 diff --git a/docker/buildkite/docker-compose-local-async-wf.yml b/docker/buildkite/docker-compose-local-async-wf.yml index ce2fa7d3339..88f497d18e4 100644 --- a/docker/buildkite/docker-compose-local-async-wf.yml +++ b/docker/buildkite/docker-compose-local-async-wf.yml @@ -18,32 +18,29 @@ services: timeout: 30s retries: 10 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" - networks: - services-network: - aliases: - - zookeeper - kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" + environment: + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" networks: services-network: aliases: - kafka - environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - # create a topic with 10 partitions and 1 replica - # topic name must match ASYNC_WF_KAFKA_QUEUE_TOPIC specified in cadence container above - KAFKA_CREATE_TOPICS: "async-wf-topic1:10:1" integration-test-async-wf: build: diff --git a/docker/buildkite/docker-compose-local-es7.yml b/docker/buildkite/docker-compose-local-es7.yml index ca6eba498b4..ef3331e1a7e 100644 --- a/docker/buildkite/docker-compose-local-es7.yml +++ b/docker/buildkite/docker-compose-local-es7.yml @@ -18,29 +18,29 @@ services: timeout: 30s retries: 10 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" - networks: - services-network: - aliases: - - zookeeper - kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" + environment: + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" networks: services-network: aliases: - kafka - environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.9.3 diff --git a/docker/buildkite/docker-compose-local.yml b/docker/buildkite/docker-compose-local.yml 
index 665bac1dea4..bd15ad4309d 100644 --- a/docker/buildkite/docker-compose-local.yml +++ b/docker/buildkite/docker-compose-local.yml @@ -44,29 +44,29 @@ services: aliases: - postgres - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - expose: - - "2181" - networks: - services-network: - aliases: - - zookeeper - kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper - expose: - - "9092" + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka + ports: + - "9092:9092" + environment: + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" networks: services-network: aliases: - kafka - environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.8.22 diff --git a/docker/buildkite/docker-compose-opensearch2.yml b/docker/buildkite/docker-compose-opensearch2.yml index 49547065bbc..727e32d659c 100644 --- a/docker/buildkite/docker-compose-opensearch2.yml +++ b/docker/buildkite/docker-compose-opensearch2.yml @@ -16,25 +16,29 @@ services: timeout: 30s retries: 10 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - networks: - services-network: - aliases: - - zookeeper - kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka + ports: + - "9092:9092" + environment: + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" networks: services-network: aliases: - kafka - environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 elasticsearch: image: opensearchproject/opensearch:2.5.0 diff --git a/docker/buildkite/docker-compose.yml b/docker/buildkite/docker-compose.yml index 343421fab1c..0eca8b67c56 100644 --- a/docker/buildkite/docker-compose.yml +++ b/docker/buildkite/docker-compose.yml @@ -36,28 +36,29 @@ services: aliases: - postgres - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - networks: - services-network: - aliases: - - zookeeper - kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka + ports: + - "9092:9092" + environment: + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + 
- "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" networks: services-network: aliases: - kafka - environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - # for async wf tests, create a topic with 10 partitions and 1 replica - # topic name must match ASYNC_WF_KAFKA_QUEUE_TOPIC - KAFKA_CREATE_TOPICS: "async-wf-topic1:10:1" elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.8.22 diff --git a/docker/dev/cassandra-esv7-kafka.yml b/docker/dev/cassandra-esv7-kafka.yml index ad9301276cb..115af4961f6 100644 --- a/docker/dev/cassandra-esv7-kafka.yml +++ b/docker/dev/cassandra-esv7-kafka.yml @@ -14,16 +14,21 @@ services: environment: - discovery.type=single-node kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" diff --git a/docker/dev/cassandra-opensearch-kafka.yml b/docker/dev/cassandra-opensearch-kafka.yml index 84f2ddd01f2..52542cdfb24 100644 --- a/docker/dev/cassandra-opensearch-kafka.yml +++ b/docker/dev/cassandra-opensearch-kafka.yml @@ -15,16 +15,21 @@ services: environment: - discovery.type=single-node kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" diff --git a/docker/dev/cassandra-pinot-kafka.yml b/docker/dev/cassandra-pinot-kafka.yml index f1cfa2bccd3..30f4ecdeaa7 100644 --- a/docker/dev/cassandra-pinot-kafka.yml +++ b/docker/dev/cassandra-pinot-kafka.yml @@ -48,21 +48,25 @@ services: JAVA_OPTS: "-Dplugins.dir=/opt/pinot/plugins -Xms4G 
-Xmx16G -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xloggc:gc-pinot-server.log" depends_on: - pinot-broker - kafka: - image: wurstmeister/kafka:2.13-2.8.1 - depends_on: - - zookeeper + kafka: + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" - expose: - - "9093" environment: - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181/kafka - KAFKA_BROKER_ID: 0 - KAFKA_ADVERTISED_HOST_NAME: kafka - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9093,OUTSIDE://localhost:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9093,OUTSIDE://0.0.0.0:9092 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.9.3 ports: diff --git a/docker/dev/mongo-esv7-kafka.yml b/docker/dev/mongo-esv7-kafka.yml index 0eb6be347d8..35af14d4376 100644 --- a/docker/dev/mongo-esv7-kafka.yml +++ b/docker/dev/mongo-esv7-kafka.yml @@ -25,16 +25,21 @@ services: environment: - discovery.type=single-node kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" \ No newline at end of file + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" diff --git a/docker/dev/mysql-esv7-kafka.yml b/docker/dev/mysql-esv7-kafka.yml index adb03e1ce5c..72100f50427 100644 --- a/docker/dev/mysql-esv7-kafka.yml +++ b/docker/dev/mysql-esv7-kafka.yml @@ -13,16 +13,21 @@ services: environment: - discovery.type=single-node kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" \ No newline at end of file + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - 
"KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" diff --git a/docker/docker-compose-async-wf-kafka.yml b/docker/docker-compose-async-wf-kafka.yml index c88edd1d649..278e330b1bd 100644 --- a/docker/docker-compose-async-wf-kafka.yml +++ b/docker/docker-compose-async-wf-kafka.yml @@ -74,20 +74,22 @@ services: - prometheus ports: - '3000:3000' - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - # create a topic with 10 partitions and 1 replica - # topic name must match ASYNC_WF_KAFKA_QUEUE_TOPIC specified in cadence container above - KAFKA_CREATE_TOPICS: "async-wf-topic1:10:1" + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" diff --git a/docker/docker-compose-es-v7.yml b/docker/docker-compose-es-v7.yml index fe2761e74f5..f11200bdc0e 100644 --- a/docker/docker-compose-es-v7.yml +++ b/docker/docker-compose-es-v7.yml @@ -20,20 +20,25 @@ services: - '--config.file=/etc/prometheus/prometheus.yml' ports: - '9090:9090' - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:7.9.3 ports: diff --git a/docker/docker-compose-es.yml b/docker/docker-compose-es.yml index 0f147ff1987..ad5fc7696ca 100644 --- a/docker/docker-compose-es.yml +++ b/docker/docker-compose-es.yml @@ -20,20 +20,25 @@ services: - '--config.file=/etc/prometheus/prometheus.yml' ports: - '9090:9090' - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - 
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.8.22 ports: diff --git a/docker/docker-compose-multiclusters-cass-mysql-es.yaml b/docker/docker-compose-multiclusters-cass-mysql-es.yaml index 9adec5cf889..5f967b8b7fc 100644 --- a/docker/docker-compose-multiclusters-cass-mysql-es.yaml +++ b/docker/docker-compose-multiclusters-cass-mysql-es.yaml @@ -26,20 +26,25 @@ services: - '--config.file=/etc/prometheus/prometheus.yml' ports: - '9090:9090' - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.8.22 ports: diff --git a/docker/docker-compose-multiclusters-es.yml b/docker/docker-compose-multiclusters-es.yml index 70986eebe25..6ccd8f749b7 100644 --- a/docker/docker-compose-multiclusters-es.yml +++ b/docker/docker-compose-multiclusters-es.yml @@ -20,20 +20,25 @@ services: - '--config.file=/etc/prometheus/prometheus.yml' ports: - '9090:9090' - zookeeper: - image: wurstmeister/zookeeper:3.4.6 - ports: - - "2181:2181" kafka: - image: wurstmeister/kafka:2.12-2.1.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" environment: - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.8.22 ports: 
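Note (illustrative, not taken from this patch): the removed wurstmeister configuration pre-created the async workflow topic via KAFKA_CREATE_TOPICS: "async-wf-topic1:10:1" (10 partitions, 1 replica), whereas the Bitnami KRaft setup only enables KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE, which auto-creates topics with the broker's default partition count. If the 10-partition layout is still needed, a one-shot helper service along the following lines could restore it. This is a minimal sketch: the service name kafka-topic-setup is invented here, and it assumes the Bitnami image ships the standard Kafka CLI under /opt/bitnami/kafka/bin.

  kafka-topic-setup:
    image: docker.io/bitnami/kafka:3.7
    depends_on:
      - kafka
    # One-shot helper (added under services:): create the topic the async WF
    # tests expect, mirroring the removed KAFKA_CREATE_TOPICS: "async-wf-topic1:10:1".
    # depends_on does not wait for the broker to finish starting, so in practice
    # this may need a retry or a healthcheck-based condition.
    entrypoint:
      - /opt/bitnami/kafka/bin/kafka-topics.sh
      - --bootstrap-server
      - kafka:9092
      - --create
      - --if-not-exists
      - --topic
      - async-wf-topic1
      - --partitions
      - "10"
      - --replication-factor
      - "1"
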
diff --git a/docker/docker-compose-pinot.yml b/docker/docker-compose-pinot.yml index 1cbeada2681..6bd7be3dcf2 100644 --- a/docker/docker-compose-pinot.yml +++ b/docker/docker-compose-pinot.yml @@ -13,20 +13,24 @@ services: ports: - '9090:9090' kafka: - image: wurstmeister/kafka:2.13-2.8.1 - depends_on: - - zookeeper + image: docker.io/bitnami/kafka:3.7 + hostname: kafka + container_name: kafka ports: - "9092:9092" - expose: - - "9093" environment: - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181/kafka - KAFKA_BROKER_ID: 0 - KAFKA_ADVERTISED_HOST_NAME: kafka - KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9093,OUTSIDE://localhost:9092 - KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9093,OUTSIDE://0.0.0.0:9092 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,OUTSIDE:PLAINTEXT + # KRaft settings + - "KAFKA_CFG_NODE_ID=0" + - "KAFKA_CFG_PROCESS_ROLES=controller,broker" + - "KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=0@kafka:9093" + # Listeners + - "KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093" + - "KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092" + - "KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" + - "KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER" + - "KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT" + # Topic settings + - "KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=true" zookeeper: image: zookeeper:3.5.8 container_name: zookeeper From 0efab43c01133aa635f01f4f51ed47a8da133da8 Mon Sep 17 00:00:00 2001 From: taylanisikdemir Date: Tue, 7 May 2024 11:53:51 -0700 Subject: [PATCH 04/15] Split historyEngine.go into small files (#5972) --- codecov.yml | 2 +- .../engineimpl/cross_cluster_operations.go | 50 + .../engineimpl/describe_mutable_state.go | 87 + .../engine/engineimpl/describe_queues.go | 76 + .../engineimpl/describe_workflow_execution.go | 213 + .../engine/engineimpl/dlq_operations.go | 87 + .../engineimpl/get_replication_messages.go | 94 + .../engine/engineimpl/historyEngine.go | 3823 ----------------- .../engine/engineimpl/history_engine.go | 515 +++ ...ngine2_test.go => history_engine2_test.go} | 0 ...st.go => history_engine3_eventsv2_test.go} | 0 ...yEngine_test.go => history_engine_test.go} | 0 .../history/engine/engineimpl/notify_tasks.go | 144 + .../engine/engineimpl/poll_mutable_state.go | 248 ++ .../engine/engineimpl/query_workflow.go | 330 ++ .../engine/engineimpl/reapply_events.go | 180 + .../record_activity_task_started.go | 169 + .../record_child_execution_completed.go | 118 + .../record_decision_task_started.go | 33 + .../engineimpl/refresh_workflow_tasks.go | 71 + .../register_domain_failover_callback.go | 160 + .../engineimpl/remove_signal_mutable_state.go | 59 + .../request_cancel_workflow_execution.go | 112 + .../history/engine/engineimpl/reset_queues.go | 52 + .../engineimpl/reset_sticky_tasklist.go | 64 + .../engineimpl/reset_workflow_execution.go | 184 + .../respond_activity_task_canceled.go | 121 + .../respond_activity_task_completed.go | 125 + .../respond_activity_task_failed.go | 141 + .../respond_activity_task_heartbeat.go | 124 + .../respond_decision_task_completed.go | 33 + .../respond_decision_task_failed.go | 33 + .../engineimpl/signal_workflow_execution.go | 130 + .../engineimpl/start_workflow_execution.go | 847 ++++ .../terminate_workflow_execution.go | 102 + 35 files changed, 4703 insertions(+), 3824 deletions(-) create mode 100644 service/history/engine/engineimpl/cross_cluster_operations.go create mode 100644 service/history/engine/engineimpl/describe_mutable_state.go create mode 100644 
service/history/engine/engineimpl/describe_queues.go create mode 100644 service/history/engine/engineimpl/describe_workflow_execution.go create mode 100644 service/history/engine/engineimpl/dlq_operations.go create mode 100644 service/history/engine/engineimpl/get_replication_messages.go delete mode 100644 service/history/engine/engineimpl/historyEngine.go create mode 100644 service/history/engine/engineimpl/history_engine.go rename service/history/engine/engineimpl/{historyEngine2_test.go => history_engine2_test.go} (100%) rename service/history/engine/engineimpl/{historyEngine3_eventsv2_test.go => history_engine3_eventsv2_test.go} (100%) rename service/history/engine/engineimpl/{historyEngine_test.go => history_engine_test.go} (100%) create mode 100644 service/history/engine/engineimpl/notify_tasks.go create mode 100644 service/history/engine/engineimpl/poll_mutable_state.go create mode 100644 service/history/engine/engineimpl/query_workflow.go create mode 100644 service/history/engine/engineimpl/reapply_events.go create mode 100644 service/history/engine/engineimpl/record_activity_task_started.go create mode 100644 service/history/engine/engineimpl/record_child_execution_completed.go create mode 100644 service/history/engine/engineimpl/record_decision_task_started.go create mode 100644 service/history/engine/engineimpl/refresh_workflow_tasks.go create mode 100644 service/history/engine/engineimpl/register_domain_failover_callback.go create mode 100644 service/history/engine/engineimpl/remove_signal_mutable_state.go create mode 100644 service/history/engine/engineimpl/request_cancel_workflow_execution.go create mode 100644 service/history/engine/engineimpl/reset_queues.go create mode 100644 service/history/engine/engineimpl/reset_sticky_tasklist.go create mode 100644 service/history/engine/engineimpl/reset_workflow_execution.go create mode 100644 service/history/engine/engineimpl/respond_activity_task_canceled.go create mode 100644 service/history/engine/engineimpl/respond_activity_task_completed.go create mode 100644 service/history/engine/engineimpl/respond_activity_task_failed.go create mode 100644 service/history/engine/engineimpl/respond_activity_task_heartbeat.go create mode 100644 service/history/engine/engineimpl/respond_decision_task_completed.go create mode 100644 service/history/engine/engineimpl/respond_decision_task_failed.go create mode 100644 service/history/engine/engineimpl/signal_workflow_execution.go create mode 100644 service/history/engine/engineimpl/start_workflow_execution.go create mode 100644 service/history/engine/engineimpl/terminate_workflow_execution.go diff --git a/codecov.yml b/codecov.yml index ec0a45dae55..18bc0b883c1 100644 --- a/codecov.yml +++ b/codecov.yml @@ -19,7 +19,7 @@ coverage: if_ci_failed: ignore # require the CI to pass before setting the status patch: default: - target: 85% # specify the target coverage for each commit status + target: 0% # specify the target coverage for each commit status # option: "auto" (compare against parent commit or pull request base) # option: "X%" a static target percentage to hit threshold: 0% # allow the coverage drop by x% before marking as failure diff --git a/service/history/engine/engineimpl/cross_cluster_operations.go b/service/history/engine/engineimpl/cross_cluster_operations.go new file mode 100644 index 00000000000..ea6bca8a55c --- /dev/null +++ b/service/history/engine/engineimpl/cross_cluster_operations.go @@ -0,0 +1,50 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. 
+// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/queue" +) + +func (e *historyEngineImpl) GetCrossClusterTasks( + ctx context.Context, + targetCluster string, +) ([]*types.CrossClusterTaskRequest, error) { + actionResult, err := e.crossClusterProcessor.HandleAction(ctx, targetCluster, queue.NewGetTasksAction()) + if err != nil { + return nil, err + } + + return actionResult.GetTasksResult.TaskRequests, nil +} + +func (e *historyEngineImpl) RespondCrossClusterTasksCompleted( + ctx context.Context, + targetCluster string, + responses []*types.CrossClusterTaskResponse, +) error { + _, err := e.crossClusterProcessor.HandleAction(ctx, targetCluster, queue.NewUpdateTasksAction(responses)) + return err +} diff --git a/service/history/engine/engineimpl/describe_mutable_state.go b/service/history/engine/engineimpl/describe_mutable_state.go new file mode 100644 index 00000000000..618314ca66e --- /dev/null +++ b/service/history/engine/engineimpl/describe_mutable_state.go @@ -0,0 +1,87 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + "encoding/json" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" +) + +func (e *historyEngineImpl) DescribeMutableState( + ctx context.Context, + request *types.DescribeMutableStateRequest, +) (response *types.DescribeMutableStateResponse, retError error) { + + if err := common.ValidateDomainUUID(request.DomainUUID); err != nil { + return nil, err + } + + domainID := request.DomainUUID + execution := types.WorkflowExecution{ + WorkflowID: request.Execution.WorkflowID, + RunID: request.Execution.RunID, + } + + cacheCtx, dbCtx, release, cacheHit, err := e.executionCache.GetAndCreateWorkflowExecution( + ctx, domainID, execution, + ) + if err != nil { + return nil, err + } + defer func() { release(retError) }() + + response = &types.DescribeMutableStateResponse{} + + if cacheHit { + if msb := cacheCtx.GetWorkflowExecution(); msb != nil { + response.MutableStateInCache, err = e.toMutableStateJSON(msb) + if err != nil { + return nil, err + } + } + } + + msb, err := dbCtx.LoadWorkflowExecution(ctx) + if err != nil { + return nil, err + } + response.MutableStateInDatabase, err = e.toMutableStateJSON(msb) + if err != nil { + return nil, err + } + + return response, nil +} + +func (e *historyEngineImpl) toMutableStateJSON(msb execution.MutableState) (string, error) { + ms := msb.CopyToPersistence() + + jsonBytes, err := json.Marshal(ms) + if err != nil { + return "", err + } + return string(jsonBytes), nil +} diff --git a/service/history/engine/engineimpl/describe_queues.go b/service/history/engine/engineimpl/describe_queues.go new file mode 100644 index 00000000000..90424814c59 --- /dev/null +++ b/service/history/engine/engineimpl/describe_queues.go @@ -0,0 +1,76 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + "fmt" + + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/queue" +) + +func (e *historyEngineImpl) DescribeTransferQueue( + ctx context.Context, + clusterName string, +) (*types.DescribeQueueResponse, error) { + return e.describeQueue(ctx, e.txProcessor, clusterName) +} + +func (e *historyEngineImpl) DescribeTimerQueue( + ctx context.Context, + clusterName string, +) (*types.DescribeQueueResponse, error) { + return e.describeQueue(ctx, e.timerProcessor, clusterName) +} + +func (e *historyEngineImpl) DescribeCrossClusterQueue( + ctx context.Context, + clusterName string, +) (*types.DescribeQueueResponse, error) { + return e.describeQueue(ctx, e.crossClusterProcessor, clusterName) +} + +func (e *historyEngineImpl) describeQueue( + ctx context.Context, + queueProcessor queue.Processor, + clusterName string, +) (*types.DescribeQueueResponse, error) { + resp, err := queueProcessor.HandleAction(ctx, clusterName, queue.NewGetStateAction()) + if err != nil { + return nil, err + } + + serializedStates := make([]string, 0, len(resp.GetStateActionResult.States)) + for _, state := range resp.GetStateActionResult.States { + serializedStates = append(serializedStates, e.serializeQueueState(state)) + } + return &types.DescribeQueueResponse{ + ProcessingQueueStates: serializedStates, + }, nil +} + +func (e *historyEngineImpl) serializeQueueState( + state queue.ProcessingQueueState, +) string { + return fmt.Sprintf("%v", state) +} diff --git a/service/history/engine/engineimpl/describe_workflow_execution.go b/service/history/engine/engineimpl/describe_workflow_execution.go new file mode 100644 index 00000000000..ef02cc48ace --- /dev/null +++ b/service/history/engine/engineimpl/describe_workflow_execution.go @@ -0,0 +1,213 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "time" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" +) + +// DescribeWorkflowExecution returns information about the specified workflow execution. 
+func (e *historyEngineImpl) DescribeWorkflowExecution( + ctx context.Context, + request *types.HistoryDescribeWorkflowExecutionRequest, +) (retResp *types.DescribeWorkflowExecutionResponse, retError error) { + + if err := common.ValidateDomainUUID(request.DomainUUID); err != nil { + return nil, err + } + + domainID := request.DomainUUID + wfExecution := *request.Request.Execution + + wfContext, release, err0 := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, wfExecution) + if err0 != nil { + return nil, err0 + } + defer func() { release(retError) }() + + mutableState, err1 := wfContext.LoadWorkflowExecution(ctx) + if err1 != nil { + return nil, err1 + } + // If history is corrupted, return an error to the end user + if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { + return nil, err + } else if corrupted { + return nil, &types.EntityNotExistsError{Message: "Workflow execution corrupted."} + } + + executionInfo := mutableState.GetExecutionInfo() + + result := &types.DescribeWorkflowExecutionResponse{ + ExecutionConfiguration: &types.WorkflowExecutionConfiguration{ + TaskList: &types.TaskList{Name: executionInfo.TaskList}, + ExecutionStartToCloseTimeoutSeconds: common.Int32Ptr(executionInfo.WorkflowTimeout), + TaskStartToCloseTimeoutSeconds: common.Int32Ptr(executionInfo.DecisionStartToCloseTimeout), + }, + WorkflowExecutionInfo: &types.WorkflowExecutionInfo{ + Execution: &types.WorkflowExecution{ + WorkflowID: executionInfo.WorkflowID, + RunID: executionInfo.RunID, + }, + Type: &types.WorkflowType{Name: executionInfo.WorkflowTypeName}, + StartTime: common.Int64Ptr(executionInfo.StartTimestamp.UnixNano()), + HistoryLength: mutableState.GetNextEventID() - common.FirstEventID, + AutoResetPoints: executionInfo.AutoResetPoints, + Memo: &types.Memo{Fields: executionInfo.Memo}, + IsCron: len(executionInfo.CronSchedule) > 0, + UpdateTime: common.Int64Ptr(executionInfo.LastUpdatedTimestamp.UnixNano()), + SearchAttributes: &types.SearchAttributes{IndexedFields: executionInfo.SearchAttributes}, + PartitionConfig: executionInfo.PartitionConfig, + }, + } + + // TODO: we need to consider adding execution time to mutable state + // For now execution time will be calculated based on start time and cron schedule/retry policy + // each time DescribeWorkflowExecution is called. 
+ startEvent, err := mutableState.GetStartEvent(ctx) + if err != nil { + return nil, err + } + backoffDuration := time.Duration(startEvent.GetWorkflowExecutionStartedEventAttributes().GetFirstDecisionTaskBackoffSeconds()) * time.Second + result.WorkflowExecutionInfo.ExecutionTime = common.Int64Ptr(result.WorkflowExecutionInfo.GetStartTime() + backoffDuration.Nanoseconds()) + + if executionInfo.ParentRunID != "" { + result.WorkflowExecutionInfo.ParentExecution = &types.WorkflowExecution{ + WorkflowID: executionInfo.ParentWorkflowID, + RunID: executionInfo.ParentRunID, + } + result.WorkflowExecutionInfo.ParentDomainID = common.StringPtr(executionInfo.ParentDomainID) + result.WorkflowExecutionInfo.ParentInitiatedID = common.Int64Ptr(executionInfo.InitiatedID) + parentDomain, err := e.shard.GetDomainCache().GetDomainName(executionInfo.ParentDomainID) + if err != nil { + return nil, err + } + result.WorkflowExecutionInfo.ParentDomain = common.StringPtr(parentDomain) + } + if executionInfo.State == persistence.WorkflowStateCompleted { + // for closed workflow + result.WorkflowExecutionInfo.CloseStatus = persistence.ToInternalWorkflowExecutionCloseStatus(executionInfo.CloseStatus) + completionEvent, err := mutableState.GetCompletionEvent(ctx) + if err != nil { + return nil, err + } + result.WorkflowExecutionInfo.CloseTime = common.Int64Ptr(completionEvent.GetTimestamp()) + } + + if len(mutableState.GetPendingActivityInfos()) > 0 { + for _, ai := range mutableState.GetPendingActivityInfos() { + p := &types.PendingActivityInfo{ + ActivityID: ai.ActivityID, + } + state := types.PendingActivityStateScheduled + if ai.CancelRequested { + state = types.PendingActivityStateCancelRequested + } else if ai.StartedID != common.EmptyEventID { + state = types.PendingActivityStateStarted + } + p.State = &state + lastHeartbeatUnixNano := ai.LastHeartBeatUpdatedTime.UnixNano() + if lastHeartbeatUnixNano > 0 { + p.LastHeartbeatTimestamp = common.Int64Ptr(lastHeartbeatUnixNano) + p.HeartbeatDetails = ai.Details + } + // TODO: move to mutable state instead of loading it from event + scheduledEvent, err := mutableState.GetActivityScheduledEvent(ctx, ai.ScheduleID) + if err != nil { + return nil, err + } + p.ActivityType = scheduledEvent.ActivityTaskScheduledEventAttributes.ActivityType + if state == types.PendingActivityStateScheduled { + p.ScheduledTimestamp = common.Int64Ptr(ai.ScheduledTime.UnixNano()) + } else { + p.LastStartedTimestamp = common.Int64Ptr(ai.StartedTime.UnixNano()) + } + if ai.HasRetryPolicy { + p.Attempt = ai.Attempt + p.ExpirationTimestamp = common.Int64Ptr(ai.ExpirationTime.UnixNano()) + if ai.MaximumAttempts != 0 { + p.MaximumAttempts = ai.MaximumAttempts + } + if ai.LastFailureReason != "" { + p.LastFailureReason = common.StringPtr(ai.LastFailureReason) + p.LastFailureDetails = ai.LastFailureDetails + } + if ai.LastWorkerIdentity != "" { + p.LastWorkerIdentity = ai.LastWorkerIdentity + } + if ai.StartedIdentity != "" { + p.StartedWorkerIdentity = ai.StartedIdentity + } + } + result.PendingActivities = append(result.PendingActivities, p) + } + } + + if len(mutableState.GetPendingChildExecutionInfos()) > 0 { + for _, ch := range mutableState.GetPendingChildExecutionInfos() { + childDomainName, err := execution.GetChildExecutionDomainName( + ch, + e.shard.GetDomainCache(), + mutableState.GetDomainEntry(), + ) + if err != nil { + if !common.IsEntityNotExistsError(err) { + return nil, err + } + // child domain already deleted, instead of failing the request, + // return domainID instead since this 
field is only for information purpose + childDomainName = ch.DomainID + } + p := &types.PendingChildExecutionInfo{ + Domain: childDomainName, + WorkflowID: ch.StartedWorkflowID, + RunID: ch.StartedRunID, + WorkflowTypeName: ch.WorkflowTypeName, + InitiatedID: ch.InitiatedID, + ParentClosePolicy: &ch.ParentClosePolicy, + } + result.PendingChildren = append(result.PendingChildren, p) + } + } + + if di, ok := mutableState.GetPendingDecision(); ok { + pendingDecision := &types.PendingDecisionInfo{ + State: types.PendingDecisionStateScheduled.Ptr(), + ScheduledTimestamp: common.Int64Ptr(di.ScheduledTimestamp), + Attempt: di.Attempt, + OriginalScheduledTimestamp: common.Int64Ptr(di.OriginalScheduledTimestamp), + } + if di.StartedID != common.EmptyEventID { + pendingDecision.State = types.PendingDecisionStateStarted.Ptr() + pendingDecision.StartedTimestamp = common.Int64Ptr(di.StartedTimestamp) + } + result.PendingDecision = pendingDecision + } + + return result, nil +} diff --git a/service/history/engine/engineimpl/dlq_operations.go b/service/history/engine/engineimpl/dlq_operations.go new file mode 100644 index 00000000000..cc9040167e3 --- /dev/null +++ b/service/history/engine/engineimpl/dlq_operations.go @@ -0,0 +1,87 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/types" +) + +func (e *historyEngineImpl) CountDLQMessages(ctx context.Context, forceFetch bool) (map[string]int64, error) { + return e.replicationDLQHandler.GetMessageCount(ctx, forceFetch) +} + +func (e *historyEngineImpl) ReadDLQMessages( + ctx context.Context, + request *types.ReadDLQMessagesRequest, +) (*types.ReadDLQMessagesResponse, error) { + + tasks, taskInfo, token, err := e.replicationDLQHandler.ReadMessages( + ctx, + request.GetSourceCluster(), + request.GetInclusiveEndMessageID(), + int(request.GetMaximumPageSize()), + request.GetNextPageToken(), + ) + if err != nil { + return nil, err + } + return &types.ReadDLQMessagesResponse{ + Type: request.GetType().Ptr(), + ReplicationTasks: tasks, + ReplicationTasksInfo: taskInfo, + NextPageToken: token, + }, nil +} + +func (e *historyEngineImpl) PurgeDLQMessages( + ctx context.Context, + request *types.PurgeDLQMessagesRequest, +) error { + + return e.replicationDLQHandler.PurgeMessages( + ctx, + request.GetSourceCluster(), + request.GetInclusiveEndMessageID(), + ) +} + +func (e *historyEngineImpl) MergeDLQMessages( + ctx context.Context, + request *types.MergeDLQMessagesRequest, +) (*types.MergeDLQMessagesResponse, error) { + + token, err := e.replicationDLQHandler.MergeMessages( + ctx, + request.GetSourceCluster(), + request.GetInclusiveEndMessageID(), + int(request.GetMaximumPageSize()), + request.GetNextPageToken(), + ) + if err != nil { + return nil, err + } + return &types.MergeDLQMessagesResponse{ + NextPageToken: token, + }, nil +} diff --git a/service/history/engine/engineimpl/get_replication_messages.go b/service/history/engine/engineimpl/get_replication_messages.go new file mode 100644 index 00000000000..9f5ae68ad87 --- /dev/null +++ b/service/history/engine/engineimpl/get_replication_messages.go @@ -0,0 +1,94 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" +) + +func (e *historyEngineImpl) GetReplicationMessages( + ctx context.Context, + pollingCluster string, + lastReadMessageID int64, +) (*types.ReplicationMessages, error) { + + scope := metrics.HistoryGetReplicationMessagesScope + sw := e.metricsClient.StartTimer(scope, metrics.GetReplicationMessagesForShardLatency) + defer sw.Stop() + + replicationMessages, err := e.replicationAckManager.GetTasks( + ctx, + pollingCluster, + lastReadMessageID, + ) + if err != nil { + e.logger.Error("Failed to retrieve replication messages.", tag.Error(err)) + return nil, err + } + + // Set cluster status for sync shard info + replicationMessages.SyncShardStatus = &types.SyncShardStatus{ + Timestamp: common.Int64Ptr(e.timeSource.Now().UnixNano()), + } + e.logger.Debug("Successfully fetched replication messages.", tag.Counter(len(replicationMessages.ReplicationTasks))) + return replicationMessages, nil +} + +func (e *historyEngineImpl) GetDLQReplicationMessages( + ctx context.Context, + taskInfos []*types.ReplicationTaskInfo, +) ([]*types.ReplicationTask, error) { + + scope := metrics.HistoryGetDLQReplicationMessagesScope + sw := e.metricsClient.StartTimer(scope, metrics.GetDLQReplicationMessagesLatency) + defer sw.Stop() + + tasks := make([]*types.ReplicationTask, 0, len(taskInfos)) + for _, taskInfo := range taskInfos { + task, err := e.replicationHydrator.Hydrate(ctx, persistence.ReplicationTaskInfo{ + DomainID: taskInfo.DomainID, + WorkflowID: taskInfo.WorkflowID, + RunID: taskInfo.RunID, + TaskID: taskInfo.TaskID, + TaskType: int(taskInfo.TaskType), + FirstEventID: taskInfo.FirstEventID, + NextEventID: taskInfo.NextEventID, + Version: taskInfo.Version, + ScheduledID: taskInfo.ScheduledID, + }) + if err != nil { + e.logger.Error("Failed to fetch DLQ replication messages.", tag.Error(err)) + return nil, err + } + if task != nil { + tasks = append(tasks, task) + } + } + + return tasks, nil +} diff --git a/service/history/engine/engineimpl/historyEngine.go b/service/history/engine/engineimpl/historyEngine.go deleted file mode 100644 index 9467de2b33c..00000000000 --- a/service/history/engine/engineimpl/historyEngine.go +++ /dev/null @@ -1,3823 +0,0 @@ -// Copyright (c) 2017-2021 Uber Technologies, Inc. -// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -package engineimpl - -import ( - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "time" - - "github.com/pborman/uuid" - "go.uber.org/cadence/.gen/go/cadence/workflowserviceclient" - "go.uber.org/yarpc/yarpcerrors" - - "github.com/uber/cadence/client/matching" - "github.com/uber/cadence/client/wrappers/retryable" - "github.com/uber/cadence/common" - "github.com/uber/cadence/common/cache" - "github.com/uber/cadence/common/client" - "github.com/uber/cadence/common/clock" - "github.com/uber/cadence/common/cluster" - "github.com/uber/cadence/common/definition" - "github.com/uber/cadence/common/dynamicconfig" - ce "github.com/uber/cadence/common/errors" - "github.com/uber/cadence/common/log" - "github.com/uber/cadence/common/log/tag" - "github.com/uber/cadence/common/metrics" - cndc "github.com/uber/cadence/common/ndc" - "github.com/uber/cadence/common/persistence" - "github.com/uber/cadence/common/quotas" - "github.com/uber/cadence/common/reconciliation/invariant" - "github.com/uber/cadence/common/service" - "github.com/uber/cadence/common/types" - hcommon "github.com/uber/cadence/service/history/common" - "github.com/uber/cadence/service/history/config" - "github.com/uber/cadence/service/history/decision" - "github.com/uber/cadence/service/history/engine" - "github.com/uber/cadence/service/history/events" - "github.com/uber/cadence/service/history/execution" - "github.com/uber/cadence/service/history/failover" - "github.com/uber/cadence/service/history/ndc" - "github.com/uber/cadence/service/history/query" - "github.com/uber/cadence/service/history/queue" - "github.com/uber/cadence/service/history/replication" - "github.com/uber/cadence/service/history/reset" - "github.com/uber/cadence/service/history/shard" - "github.com/uber/cadence/service/history/task" - "github.com/uber/cadence/service/history/workflow" - "github.com/uber/cadence/service/history/workflowcache" - warchiver "github.com/uber/cadence/service/worker/archiver" -) - -const ( - defaultQueryFirstDecisionTaskWaitTime = time.Second - queryFirstDecisionTaskCheckInterval = 200 * time.Millisecond - contextLockTimeout = 500 * time.Millisecond - longPollCompletionBuffer = 50 * time.Millisecond - - // TerminateIfRunningReason reason for terminateIfRunning - TerminateIfRunningReason = "TerminateIfRunning Policy" - // TerminateIfRunningDetailsTemplate details template for terminateIfRunning - TerminateIfRunningDetailsTemplate = "New runID: %s" -) - -var ( - errDomainDeprecated = &types.BadRequestError{Message: "Domain is deprecated."} -) - -type ( - historyEngineImpl struct { - currentClusterName string - shard shard.Context - timeSource clock.TimeSource - decisionHandler decision.Handler - clusterMetadata cluster.Metadata - historyV2Mgr persistence.HistoryManager - executionManager persistence.ExecutionManager - visibilityMgr persistence.VisibilityManager - txProcessor queue.Processor - timerProcessor queue.Processor - crossClusterProcessor queue.Processor - nDCReplicator ndc.HistoryReplicator - nDCActivityReplicator ndc.ActivityReplicator - historyEventNotifier events.Notifier - tokenSerializer common.TaskTokenSerializer - executionCache *execution.Cache - metricsClient metrics.Client - logger log.Logger - throttledLogger log.Logger - config 
*config.Config - archivalClient warchiver.Client - workflowResetter reset.WorkflowResetter - queueTaskProcessor task.Processor - crossClusterTaskProcessors common.Daemon - replicationTaskProcessors []replication.TaskProcessor - replicationAckManager replication.TaskAckManager - replicationTaskStore *replication.TaskStore - replicationHydrator replication.TaskHydrator - replicationMetricsEmitter *replication.MetricsEmitterImpl - publicClient workflowserviceclient.Interface - eventsReapplier ndc.EventsReapplier - matchingClient matching.Client - rawMatchingClient matching.Client - clientChecker client.VersionChecker - replicationDLQHandler replication.DLQHandler - failoverMarkerNotifier failover.MarkerNotifier - wfIDCache workflowcache.WFCache - ratelimitInternalPerWorkflowID dynamicconfig.BoolPropertyFnWithDomainFilter - - updateWithActionFn func(context.Context, *execution.Cache, string, types.WorkflowExecution, bool, time.Time, func(wfContext execution.Context, mutableState execution.MutableState) error) error - } -) - -var _ engine.Engine = (*historyEngineImpl)(nil) - -var ( - // FailedWorkflowCloseState is a set of failed workflow close states, used for start workflow policy - // for start workflow execution API - FailedWorkflowCloseState = map[int]bool{ - persistence.WorkflowCloseStatusFailed: true, - persistence.WorkflowCloseStatusCanceled: true, - persistence.WorkflowCloseStatusTerminated: true, - persistence.WorkflowCloseStatusTimedOut: true, - } -) - -// NewEngineWithShardContext creates an instance of history engine -func NewEngineWithShardContext( - shard shard.Context, - visibilityMgr persistence.VisibilityManager, - matching matching.Client, - publicClient workflowserviceclient.Interface, - historyEventNotifier events.Notifier, - config *config.Config, - crossClusterTaskFetchers task.Fetchers, - replicationTaskFetchers replication.TaskFetchers, - rawMatchingClient matching.Client, - queueTaskProcessor task.Processor, - failoverCoordinator failover.Coordinator, - wfIDCache workflowcache.WFCache, - ratelimitInternalPerWorkflowID dynamicconfig.BoolPropertyFnWithDomainFilter, -) engine.Engine { - currentClusterName := shard.GetService().GetClusterMetadata().GetCurrentClusterName() - - logger := shard.GetLogger() - executionManager := shard.GetExecutionManager() - historyV2Manager := shard.GetHistoryManager() - executionCache := execution.NewCache(shard) - failoverMarkerNotifier := failover.NewMarkerNotifier(shard, config, failoverCoordinator) - replicationHydrator := replication.NewDeferredTaskHydrator(shard.GetShardID(), historyV2Manager, executionCache, shard.GetDomainCache()) - replicationTaskStore := replication.NewTaskStore( - shard.GetConfig(), - shard.GetClusterMetadata(), - shard.GetDomainCache(), - shard.GetMetricsClient(), - shard.GetLogger(), - replicationHydrator, - ) - replicationReader := replication.NewDynamicTaskReader(shard.GetShardID(), executionManager, shard.GetTimeSource(), config) - - historyEngImpl := &historyEngineImpl{ - currentClusterName: currentClusterName, - shard: shard, - clusterMetadata: shard.GetClusterMetadata(), - timeSource: shard.GetTimeSource(), - historyV2Mgr: historyV2Manager, - executionManager: executionManager, - visibilityMgr: visibilityMgr, - tokenSerializer: common.NewJSONTaskTokenSerializer(), - executionCache: executionCache, - logger: logger.WithTags(tag.ComponentHistoryEngine), - throttledLogger: shard.GetThrottledLogger().WithTags(tag.ComponentHistoryEngine), - metricsClient: shard.GetMetricsClient(), - historyEventNotifier: 
historyEventNotifier, - config: config, - archivalClient: warchiver.NewClient( - shard.GetMetricsClient(), - logger, - publicClient, - shard.GetConfig().NumArchiveSystemWorkflows, - quotas.NewDynamicRateLimiter(config.ArchiveRequestRPS.AsFloat64()), - quotas.NewDynamicRateLimiter(func() float64 { - return quotas.PerMember( - service.History, - float64(config.ArchiveInlineHistoryGlobalRPS()), - float64(config.ArchiveInlineHistoryRPS()), - shard.GetService().GetMembershipResolver(), - ) - }), - quotas.NewDynamicRateLimiter(func() float64 { - return quotas.PerMember( - service.History, - float64(config.ArchiveInlineVisibilityGlobalRPS()), - float64(config.ArchiveInlineVisibilityRPS()), - shard.GetService().GetMembershipResolver(), - ) - }), - shard.GetService().GetArchiverProvider(), - config.AllowArchivingIncompleteHistory, - ), - workflowResetter: reset.NewWorkflowResetter( - shard, - executionCache, - logger, - ), - publicClient: publicClient, - matchingClient: matching, - rawMatchingClient: rawMatchingClient, - queueTaskProcessor: queueTaskProcessor, - clientChecker: client.NewVersionChecker(), - failoverMarkerNotifier: failoverMarkerNotifier, - replicationHydrator: replicationHydrator, - replicationAckManager: replication.NewTaskAckManager( - shard.GetShardID(), - shard, - shard.GetMetricsClient(), - shard.GetLogger(), - replicationReader, - replicationTaskStore, - ), - replicationTaskStore: replicationTaskStore, - replicationMetricsEmitter: replication.NewMetricsEmitter( - shard.GetShardID(), shard, replicationReader, shard.GetMetricsClient()), - wfIDCache: wfIDCache, - ratelimitInternalPerWorkflowID: ratelimitInternalPerWorkflowID, - updateWithActionFn: workflow.UpdateWithAction, - } - historyEngImpl.decisionHandler = decision.NewHandler( - shard, - historyEngImpl.executionCache, - historyEngImpl.tokenSerializer, - ) - pRetry := persistence.NewPersistenceRetryer( - shard.GetExecutionManager(), - shard.GetHistoryManager(), - common.CreatePersistenceRetryPolicy(), - ) - openExecutionCheck := invariant.NewConcreteExecutionExists(pRetry, shard.GetDomainCache()) - - historyEngImpl.txProcessor = queue.NewTransferQueueProcessor( - shard, - historyEngImpl, - queueTaskProcessor, - executionCache, - historyEngImpl.workflowResetter, - historyEngImpl.archivalClient, - openExecutionCheck, - historyEngImpl.wfIDCache, - historyEngImpl.ratelimitInternalPerWorkflowID, - ) - - historyEngImpl.timerProcessor = queue.NewTimerQueueProcessor( - shard, - historyEngImpl, - queueTaskProcessor, - executionCache, - historyEngImpl.archivalClient, - openExecutionCheck, - ) - - historyEngImpl.crossClusterProcessor = queue.NewCrossClusterQueueProcessor( - shard, - historyEngImpl, - executionCache, - queueTaskProcessor, - ) - - historyEngImpl.eventsReapplier = ndc.NewEventsReapplier(shard.GetMetricsClient(), logger) - - historyEngImpl.nDCReplicator = ndc.NewHistoryReplicator( - shard, - executionCache, - historyEngImpl.eventsReapplier, - logger, - ) - historyEngImpl.nDCActivityReplicator = ndc.NewActivityReplicator( - shard, - executionCache, - logger, - ) - - historyEngImpl.crossClusterTaskProcessors = task.NewCrossClusterTaskProcessors( - shard, - queueTaskProcessor, - crossClusterTaskFetchers, - &task.CrossClusterTaskProcessorOptions{ - Enabled: config.EnableCrossClusterEngine, - MaxPendingTasks: config.CrossClusterTargetProcessorMaxPendingTasks, - TaskMaxRetryCount: config.CrossClusterTargetProcessorMaxRetryCount, - TaskRedispatchInterval: config.ActiveTaskRedispatchInterval, - TaskWaitInterval: 
config.CrossClusterTargetProcessorTaskWaitInterval, - ServiceBusyBackoffInterval: config.CrossClusterTargetProcessorServiceBusyBackoffInterval, - TimerJitterCoefficient: config.CrossClusterTargetProcessorJitterCoefficient, - }, - ) - - var replicationTaskProcessors []replication.TaskProcessor - replicationTaskExecutors := make(map[string]replication.TaskExecutor) - // Intentionally use the raw client to create its own retry policy - historyRawClient := shard.GetService().GetClientBean().GetHistoryClient() - historyRetryableClient := retryable.NewHistoryClient( - historyRawClient, - common.CreateReplicationServiceBusyRetryPolicy(), - common.IsServiceBusyError, - ) - resendFunc := func(ctx context.Context, request *types.ReplicateEventsV2Request) error { - return historyRetryableClient.ReplicateEventsV2(ctx, request) - } - for _, replicationTaskFetcher := range replicationTaskFetchers.GetFetchers() { - sourceCluster := replicationTaskFetcher.GetSourceCluster() - // Intentionally use the raw client to create its own retry policy - adminClient := shard.GetService().GetClientBean().GetRemoteAdminClient(sourceCluster) - adminRetryableClient := retryable.NewAdminClient( - adminClient, - common.CreateReplicationServiceBusyRetryPolicy(), - common.IsServiceBusyError, - ) - historyResender := cndc.NewHistoryResender( - shard.GetDomainCache(), - adminRetryableClient, - resendFunc, - nil, - openExecutionCheck, - shard.GetLogger(), - ) - replicationTaskExecutor := replication.NewTaskExecutor( - shard, - shard.GetDomainCache(), - historyResender, - historyEngImpl, - shard.GetMetricsClient(), - shard.GetLogger(), - ) - replicationTaskExecutors[sourceCluster] = replicationTaskExecutor - - replicationTaskProcessor := replication.NewTaskProcessor( - shard, - historyEngImpl, - config, - shard.GetMetricsClient(), - replicationTaskFetcher, - replicationTaskExecutor, - ) - replicationTaskProcessors = append(replicationTaskProcessors, replicationTaskProcessor) - } - historyEngImpl.replicationTaskProcessors = replicationTaskProcessors - replicationMessageHandler := replication.NewDLQHandler(shard, replicationTaskExecutors) - historyEngImpl.replicationDLQHandler = replicationMessageHandler - - shard.SetEngine(historyEngImpl) - return historyEngImpl -} - -// Start will spin up all the components needed to start serving this shard. -// Make sure all the components are loaded lazily so start can return immediately. This is important because -// ShardController calls start sequentially for all the shards for a given host during startup. -func (e *historyEngineImpl) Start() { - e.logger.Info("History engine state changed", tag.LifeCycleStarting) - defer e.logger.Info("History engine state changed", tag.LifeCycleStarted) - - e.txProcessor.Start() - e.timerProcessor.Start() - e.crossClusterProcessor.Start() - e.replicationDLQHandler.Start() - e.replicationMetricsEmitter.Start() - - // failover callback will try to create a failover queue processor to scan all inflight tasks - // if domain needs to be failovered. However, in the multicursor queue logic, the scan range - // can't be retrieved before the processor is started. If failover callback is registered - // before queue processor is started, it may result in a deadline as to create the failover queue, - // queue processor need to be started. 
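	// In other words, the registration below is deliberately sequenced after the queue
	// processors have been started above: creating a failover queue needs the processors'
	// scan range, and that range only becomes available once they are running.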
- e.registerDomainFailoverCallback() - - e.crossClusterTaskProcessors.Start() - - for _, replicationTaskProcessor := range e.replicationTaskProcessors { - replicationTaskProcessor.Start() - } - if e.config.EnableGracefulFailover() { - e.failoverMarkerNotifier.Start() - } - -} - -// Stop the service. -func (e *historyEngineImpl) Stop() { - e.logger.Info("History engine state changed", tag.LifeCycleStopping) - defer e.logger.Info("History engine state changed", tag.LifeCycleStopped) - - e.txProcessor.Stop() - e.timerProcessor.Stop() - e.crossClusterProcessor.Stop() - e.replicationDLQHandler.Stop() - e.replicationMetricsEmitter.Stop() - - e.crossClusterTaskProcessors.Stop() - - for _, replicationTaskProcessor := range e.replicationTaskProcessors { - replicationTaskProcessor.Stop() - } - - if e.queueTaskProcessor != nil { - e.queueTaskProcessor.StopShardProcessor(e.shard) - } - - e.failoverMarkerNotifier.Stop() - - // unset the failover callback - e.shard.GetDomainCache().UnregisterDomainChangeCallback(e.shard.GetShardID()) -} - -func (e *historyEngineImpl) registerDomainFailoverCallback() { - - // NOTE: READ BEFORE MODIFICATION - // - // Tasks, e.g. transfer tasks and timer tasks, are created when holding the shard lock - // meaning tasks -> release of shard lock - // - // Domain change notification follows the following steps, order matters - // 1. lock all task processing. - // 2. domain changes visible to everyone (Note: lock of task processing prevents task processing logic seeing the domain changes). - // 3. failover min and max task levels are calculated, then update to shard. - // 4. failover start & task processing unlock & shard domain version notification update. (order does not matter for this discussion) - // - // The above guarantees that task created during the failover will be processed. - // If the task is created after domain change: - // then active processor will handle it. (simple case) - // If the task is created before domain change: - // task -> release of shard lock - // failover min / max task levels calculated & updated to shard (using shard lock) -> failover start - // above 2 guarantees that failover start is after persistence of the task. 
- - failoverPredicate := func(shardNotificationVersion int64, nextDomain *cache.DomainCacheEntry, action func()) { - domainFailoverNotificationVersion := nextDomain.GetFailoverNotificationVersion() - domainActiveCluster := nextDomain.GetReplicationConfig().ActiveClusterName - - if nextDomain.IsGlobalDomain() && - domainFailoverNotificationVersion >= shardNotificationVersion && - domainActiveCluster == e.currentClusterName { - action() - } - } - - // first set the failover callback - e.shard.GetDomainCache().RegisterDomainChangeCallback( - e.shard.GetShardID(), - e.shard.GetDomainNotificationVersion(), - func() { - e.txProcessor.LockTaskProcessing() - e.timerProcessor.LockTaskProcessing() - // there no lock/unlock for crossClusterProcessor - }, - func(nextDomains []*cache.DomainCacheEntry) { - defer func() { - e.txProcessor.UnlockTaskProcessing() - e.timerProcessor.UnlockTaskProcessing() - // there no lock/unlock for crossClusterProcessor - }() - - if len(nextDomains) == 0 { - return - } - - shardNotificationVersion := e.shard.GetDomainNotificationVersion() - failoverDomainIDs := map[string]struct{}{} - - for _, nextDomain := range nextDomains { - failoverPredicate(shardNotificationVersion, nextDomain, func() { - failoverDomainIDs[nextDomain.GetInfo().ID] = struct{}{} - }) - } - - if len(failoverDomainIDs) > 0 { - e.logger.Info("Domain Failover Start.", tag.WorkflowDomainIDs(failoverDomainIDs)) - - e.txProcessor.FailoverDomain(failoverDomainIDs) - e.timerProcessor.FailoverDomain(failoverDomainIDs) - e.crossClusterProcessor.FailoverDomain(failoverDomainIDs) - - now := e.shard.GetTimeSource().Now() - // the fake tasks will not be actually used, we just need to make sure - // its length > 0 and has correct timestamp, to trigger a db scan - fakeDecisionTask := []persistence.Task{&persistence.DecisionTask{}} - fakeDecisionTimeoutTask := []persistence.Task{&persistence.DecisionTimeoutTask{TaskData: persistence.TaskData{VisibilityTimestamp: now}}} - e.txProcessor.NotifyNewTask(e.currentClusterName, &hcommon.NotifyTaskInfo{Tasks: fakeDecisionTask}) - e.timerProcessor.NotifyNewTask(e.currentClusterName, &hcommon.NotifyTaskInfo{Tasks: fakeDecisionTimeoutTask}) - } - - // handle graceful failover on active to passive - // make sure task processor failover the domain before inserting the failover marker - failoverMarkerTasks := []*persistence.FailoverMarkerTask{} - for _, nextDomain := range nextDomains { - domainFailoverNotificationVersion := nextDomain.GetFailoverNotificationVersion() - domainActiveCluster := nextDomain.GetReplicationConfig().ActiveClusterName - previousFailoverVersion := nextDomain.GetPreviousFailoverVersion() - previousClusterName, err := e.clusterMetadata.ClusterNameForFailoverVersion(previousFailoverVersion) - if err != nil { - e.logger.Error("Failed to handle graceful failover", tag.WorkflowDomainID(nextDomain.GetInfo().ID), tag.Error(err)) - continue - } - - if nextDomain.IsGlobalDomain() && - domainFailoverNotificationVersion >= shardNotificationVersion && - domainActiveCluster != e.currentClusterName && - previousFailoverVersion != common.InitialPreviousFailoverVersion && - previousClusterName == e.currentClusterName { - // the visibility timestamp will be set in shard context - failoverMarkerTasks = append(failoverMarkerTasks, &persistence.FailoverMarkerTask{ - TaskData: persistence.TaskData{ - Version: nextDomain.GetFailoverVersion(), - }, - DomainID: nextDomain.GetInfo().ID, - }) - // This is a debug metric - e.metricsClient.IncCounter(metrics.FailoverMarkerScope, 
metrics.FailoverMarkerCallbackCount) - } - } - - // This is a debug metric - e.metricsClient.IncCounter(metrics.FailoverMarkerScope, metrics.HistoryFailoverCallbackCount) - if len(failoverMarkerTasks) > 0 { - if err := e.shard.ReplicateFailoverMarkers( - context.Background(), - failoverMarkerTasks, - ); err != nil { - e.logger.Error("Failed to insert failover marker to replication queue.", tag.Error(err)) - e.metricsClient.IncCounter(metrics.FailoverMarkerScope, metrics.FailoverMarkerInsertFailure) - // fail this failover callback and it retries on next domain cache refresh - return - } - } - - //nolint:errcheck - e.shard.UpdateDomainNotificationVersion(nextDomains[len(nextDomains)-1].GetNotificationVersion() + 1) - }, - ) -} - -func (e *historyEngineImpl) createMutableState( - domainEntry *cache.DomainCacheEntry, - runID string, -) (execution.MutableState, error) { - - newMutableState := execution.NewMutableStateBuilderWithVersionHistories( - e.shard, - e.logger, - domainEntry, - ) - - if err := newMutableState.SetHistoryTree(runID); err != nil { - return nil, err - } - - return newMutableState, nil -} - -func (e *historyEngineImpl) generateFirstDecisionTask( - mutableState execution.MutableState, - parentInfo *types.ParentExecutionInfo, - startEvent *types.HistoryEvent, -) error { - - if parentInfo == nil { - // DecisionTask is only created when it is not a Child Workflow and no backoff is needed - if err := mutableState.AddFirstDecisionTaskScheduled( - startEvent, - ); err != nil { - return err - } - } - return nil -} - -// StartWorkflowExecution starts a workflow execution -func (e *historyEngineImpl) StartWorkflowExecution( - ctx context.Context, - startRequest *types.HistoryStartWorkflowExecutionRequest, -) (resp *types.StartWorkflowExecutionResponse, retError error) { - - domainEntry, err := e.getActiveDomainByID(startRequest.DomainUUID) - if err != nil { - return nil, err - } - - return e.startWorkflowHelper( - ctx, - startRequest, - domainEntry, - metrics.HistoryStartWorkflowExecutionScope, - nil) -} - -// for startWorkflowHelper be reused by signalWithStart -type signalWithStartArg struct { - signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest - prevMutableState execution.MutableState -} - -func (e *historyEngineImpl) newDomainNotActiveError( - domainName string, - failoverVersion int64, -) error { - clusterMetadata := e.shard.GetService().GetClusterMetadata() - clusterName, err := clusterMetadata.ClusterNameForFailoverVersion(failoverVersion) - if err != nil { - clusterName = "_unknown_" - } - return ce.NewDomainNotActiveError( - domainName, - clusterMetadata.GetCurrentClusterName(), - clusterName, - ) -} - -func (e *historyEngineImpl) startWorkflowHelper( - ctx context.Context, - startRequest *types.HistoryStartWorkflowExecutionRequest, - domainEntry *cache.DomainCacheEntry, - metricsScope int, - signalWithStartArg *signalWithStartArg, -) (resp *types.StartWorkflowExecutionResponse, retError error) { - - if domainEntry.GetInfo().Status != persistence.DomainStatusRegistered { - return nil, errDomainDeprecated - } - - request := startRequest.StartRequest - err := e.validateStartWorkflowExecutionRequest(request, metricsScope) - if err != nil { - return nil, err - } - e.overrideStartWorkflowExecutionRequest(domainEntry, request, metricsScope) - - workflowID := request.GetWorkflowID() - domainID := domainEntry.GetInfo().ID - domain := domainEntry.GetInfo().Name - - // grab the current context as a lock, nothing more - // use a smaller context timeout to get 
the lock - childCtx, childCancel := e.newChildContext(ctx) - defer childCancel() - - _, currentRelease, err := e.executionCache.GetOrCreateCurrentWorkflowExecution( - childCtx, - domainID, - workflowID, - ) - if err != nil { - if err == context.DeadlineExceeded { - return nil, workflow.ErrConcurrentStartRequest - } - return nil, err - } - defer func() { currentRelease(retError) }() - - workflowExecution := types.WorkflowExecution{ - WorkflowID: workflowID, - RunID: uuid.New(), - } - curMutableState, err := e.createMutableState(domainEntry, workflowExecution.GetRunID()) - if err != nil { - return nil, err - } - - // preprocess for signalWithStart - var prevMutableState execution.MutableState - var signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest - isSignalWithStart := signalWithStartArg != nil - if isSignalWithStart { - prevMutableState = signalWithStartArg.prevMutableState - signalWithStartRequest = signalWithStartArg.signalWithStartRequest - } - if prevMutableState != nil { - prevLastWriteVersion, err := prevMutableState.GetLastWriteVersion() - if err != nil { - return nil, err - } - if prevLastWriteVersion > curMutableState.GetCurrentVersion() { - return nil, e.newDomainNotActiveError( - domainEntry.GetInfo().Name, - prevLastWriteVersion, - ) - } - err = e.applyWorkflowIDReusePolicyForSigWithStart( - prevMutableState.GetExecutionInfo(), - workflowExecution, - request.GetWorkflowIDReusePolicy(), - ) - if err != nil { - return nil, err - } - } else if e.shard.GetConfig().EnableRecordWorkflowExecutionUninitialized(domainEntry.GetInfo().Name) && e.visibilityMgr != nil { - uninitializedRequest := &persistence.RecordWorkflowExecutionUninitializedRequest{ - DomainUUID: domainID, - Domain: domain, - Execution: types.WorkflowExecution{ - WorkflowID: workflowID, - RunID: workflowExecution.RunID, - }, - WorkflowTypeName: request.WorkflowType.Name, - UpdateTimestamp: e.shard.GetTimeSource().Now().UnixNano(), - ShardID: int64(e.shard.GetShardID()), - } - - if err := e.visibilityMgr.RecordWorkflowExecutionUninitialized(ctx, uninitializedRequest); err != nil { - e.logger.Error("Failed to record uninitialized workflow execution", tag.Error(err)) - } - } - - err = e.addStartEventsAndTasks( - curMutableState, - workflowExecution, - startRequest, - signalWithStartRequest, - ) - if err != nil { - if e.shard.GetConfig().EnableRecordWorkflowExecutionUninitialized(domainEntry.GetInfo().Name) && e.visibilityMgr != nil { - // delete the uninitialized workflow execution record since it failed to start the workflow - // uninitialized record is used to find wfs that didn't make a progress or stuck during the start process - if errVisibility := e.visibilityMgr.DeleteWorkflowExecution(ctx, &persistence.VisibilityDeleteWorkflowExecutionRequest{ - DomainID: domainID, - Domain: domain, - RunID: workflowExecution.RunID, - WorkflowID: workflowID, - }); errVisibility != nil { - e.logger.Error("Failed to delete uninitialized workflow execution record", tag.Error(errVisibility)) - } - } - - return nil, err - } - wfContext := execution.NewContext(domainID, workflowExecution, e.shard, e.executionManager, e.logger) - - newWorkflow, newWorkflowEventsSeq, err := curMutableState.CloseTransactionAsSnapshot( - e.timeSource.Now(), - execution.TransactionPolicyActive, - ) - if err != nil { - return nil, err - } - historyBlob, err := wfContext.PersistStartWorkflowBatchEvents(ctx, newWorkflowEventsSeq[0]) - if err != nil { - return nil, err - } - - // create as brand new - createMode := 
persistence.CreateWorkflowModeBrandNew - prevRunID := "" - prevLastWriteVersion := int64(0) - // overwrite in case of signalWithStart - if prevMutableState != nil { - createMode = persistence.CreateWorkflowModeWorkflowIDReuse - info := prevMutableState.GetExecutionInfo() - // For corrupted workflows use ContinueAsNew mode. - // WorkflowIDReuse mode require workflows to be in completed state, which is not necessarily true for corrupted workflows. - if info.State == persistence.WorkflowStateCorrupted { - createMode = persistence.CreateWorkflowModeContinueAsNew - } - prevRunID = info.RunID - prevLastWriteVersion, err = prevMutableState.GetLastWriteVersion() - if err != nil { - return nil, err - } - } - err = wfContext.CreateWorkflowExecution( - ctx, - newWorkflow, - historyBlob, - createMode, - prevRunID, - prevLastWriteVersion, - persistence.CreateWorkflowRequestModeNew, - ) - if t, ok := persistence.AsDuplicateRequestError(err); ok { - if t.RequestType == persistence.WorkflowRequestTypeStart || (isSignalWithStart && t.RequestType == persistence.WorkflowRequestTypeSignal) { - return &types.StartWorkflowExecutionResponse{ - RunID: t.RunID, - }, nil - } - e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType)) - return nil, t - } - // handle already started error - if t, ok := err.(*persistence.WorkflowExecutionAlreadyStartedError); ok { - - if t.StartRequestID == request.GetRequestID() { - return &types.StartWorkflowExecutionResponse{ - RunID: t.RunID, - }, nil - } - - if isSignalWithStart { - return nil, err - } - - if curMutableState.GetCurrentVersion() < t.LastWriteVersion { - return nil, e.newDomainNotActiveError( - domainEntry.GetInfo().Name, - t.LastWriteVersion, - ) - } - - prevRunID = t.RunID - if shouldTerminateAndStart(startRequest, t.State) { - runningWFCtx, err := workflow.LoadOnce(ctx, e.executionCache, domainID, workflowID, prevRunID) - if err != nil { - return nil, err - } - defer func() { runningWFCtx.GetReleaseFn()(retError) }() - - resp, err = e.terminateAndStartWorkflow( - ctx, - runningWFCtx, - workflowExecution, - domainEntry, - domainID, - startRequest, - nil, - ) - switch err.(type) { - // By the time we try to terminate the workflow, it was already terminated - // So continue as if we didn't need to terminate it in the first place - case *types.WorkflowExecutionAlreadyCompletedError: - e.shard.GetLogger().Warn("Workflow completed while trying to terminate, will continue starting workflow", tag.Error(err)) - default: - return resp, err - } - } - if err = e.applyWorkflowIDReusePolicyHelper( - t.StartRequestID, - prevRunID, - t.State, - t.CloseStatus, - workflowExecution, - startRequest.StartRequest.GetWorkflowIDReusePolicy(), - ); err != nil { - return nil, err - } - // create as ID reuse - createMode = persistence.CreateWorkflowModeWorkflowIDReuse - err = wfContext.CreateWorkflowExecution( - ctx, - newWorkflow, - historyBlob, - createMode, - prevRunID, - t.LastWriteVersion, - persistence.CreateWorkflowRequestModeNew, - ) - if t, ok := persistence.AsDuplicateRequestError(err); ok { - if t.RequestType == persistence.WorkflowRequestTypeStart || (isSignalWithStart && t.RequestType == persistence.WorkflowRequestTypeSignal) { - return &types.StartWorkflowExecutionResponse{ - RunID: t.RunID, - }, nil - } - e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType)) - return nil, t - } - } - if err != nil { - return nil, err - } - - return &types.StartWorkflowExecutionResponse{ - 
RunID: workflowExecution.RunID, - }, nil -} - -func shouldTerminateAndStart( - startRequest *types.HistoryStartWorkflowExecutionRequest, - state int, -) bool { - return startRequest.StartRequest.GetWorkflowIDReusePolicy() == types.WorkflowIDReusePolicyTerminateIfRunning && - (state == persistence.WorkflowStateRunning || state == persistence.WorkflowStateCreated) -} - -// terminate running workflow then start a new run in one transaction -func (e *historyEngineImpl) terminateAndStartWorkflow( - ctx context.Context, - runningWFCtx workflow.Context, - workflowExecution types.WorkflowExecution, - domainEntry *cache.DomainCacheEntry, - domainID string, - startRequest *types.HistoryStartWorkflowExecutionRequest, - signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest, -) (*types.StartWorkflowExecutionResponse, error) { - runningMutableState := runningWFCtx.GetMutableState() -UpdateWorkflowLoop: - for attempt := 0; attempt < workflow.ConditionalRetryCount; attempt++ { - if !runningMutableState.IsWorkflowExecutionRunning() { - return nil, workflow.ErrAlreadyCompleted - } - - if err := execution.TerminateWorkflow( - runningMutableState, - runningMutableState.GetNextEventID(), - TerminateIfRunningReason, - getTerminateIfRunningDetails(workflowExecution.GetRunID()), - execution.IdentityHistoryService, - ); err != nil { - if err == workflow.ErrStaleState { - // Handler detected that cached workflow mutable could potentially be stale - // Reload workflow execution history - runningWFCtx.GetContext().Clear() - if attempt != workflow.ConditionalRetryCount-1 { - _, err = runningWFCtx.ReloadMutableState(ctx) - if err != nil { - return nil, err - } - } - continue UpdateWorkflowLoop - } - return nil, err - } - - // new mutable state - newMutableState, err := e.createMutableState(domainEntry, workflowExecution.GetRunID()) - if err != nil { - return nil, err - } - - if signalWithStartRequest != nil { - startRequest, err = getStartRequest(domainID, signalWithStartRequest.SignalWithStartRequest, signalWithStartRequest.PartitionConfig) - if err != nil { - return nil, err - } - } - - err = e.addStartEventsAndTasks( - newMutableState, - workflowExecution, - startRequest, - signalWithStartRequest, - ) - if err != nil { - return nil, err - } - - updateErr := runningWFCtx.GetContext().UpdateWorkflowExecutionWithNewAsActive( - ctx, - e.timeSource.Now(), - execution.NewContext( - domainID, - workflowExecution, - e.shard, - e.shard.GetExecutionManager(), - e.logger, - ), - newMutableState, - ) - if updateErr != nil { - if execution.IsConflictError(updateErr) { - e.metricsClient.IncCounter(metrics.HistoryStartWorkflowExecutionScope, metrics.ConcurrencyUpdateFailureCounter) - continue UpdateWorkflowLoop - } - return nil, updateErr - } - break UpdateWorkflowLoop - } - return &types.StartWorkflowExecutionResponse{ - RunID: workflowExecution.RunID, - }, nil -} - -func (e *historyEngineImpl) addStartEventsAndTasks( - mutableState execution.MutableState, - workflowExecution types.WorkflowExecution, - startRequest *types.HistoryStartWorkflowExecutionRequest, - signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest, -) error { - // Add WF start event - startEvent, err := mutableState.AddWorkflowExecutionStartedEvent( - workflowExecution, - startRequest, - ) - if err != nil { - return &types.InternalServiceError{ - Message: "Failed to add workflow execution started event.", - } - } - - if signalWithStartRequest != nil { - // Add signal event - sRequest := 
signalWithStartRequest.SignalWithStartRequest - if sRequest.GetRequestID() != "" { - mutableState.AddSignalRequested(sRequest.GetRequestID()) - } - _, err := mutableState.AddWorkflowExecutionSignaled( - sRequest.GetSignalName(), - sRequest.GetSignalInput(), - sRequest.GetIdentity(), - sRequest.GetRequestID(), - ) - if err != nil { - return &types.InternalServiceError{Message: "Failed to add workflow execution signaled event."} - } - } - - // Generate first decision task event if not child WF and no first decision task backoff - return e.generateFirstDecisionTask( - mutableState, - startRequest.ParentExecutionInfo, - startEvent, - ) -} - -func getTerminateIfRunningDetails(newRunID string) []byte { - return []byte(fmt.Sprintf(TerminateIfRunningDetailsTemplate, newRunID)) -} - -// GetMutableState retrieves the mutable state of the workflow execution -func (e *historyEngineImpl) GetMutableState( - ctx context.Context, - request *types.GetMutableStateRequest, -) (*types.GetMutableStateResponse, error) { - - return e.getMutableStateOrPolling(ctx, request) -} - -// PollMutableState retrieves the mutable state of the workflow execution with long polling -func (e *historyEngineImpl) PollMutableState( - ctx context.Context, - request *types.PollMutableStateRequest, -) (*types.PollMutableStateResponse, error) { - - response, err := e.getMutableStateOrPolling(ctx, &types.GetMutableStateRequest{ - DomainUUID: request.DomainUUID, - Execution: request.Execution, - ExpectedNextEventID: request.ExpectedNextEventID, - CurrentBranchToken: request.CurrentBranchToken}) - - if err != nil { - return nil, e.updateEntityNotExistsErrorOnPassiveCluster(err, request.GetDomainUUID()) - } - - return &types.PollMutableStateResponse{ - Execution: response.Execution, - WorkflowType: response.WorkflowType, - NextEventID: response.NextEventID, - PreviousStartedEventID: response.PreviousStartedEventID, - LastFirstEventID: response.LastFirstEventID, - TaskList: response.TaskList, - StickyTaskList: response.StickyTaskList, - ClientLibraryVersion: response.ClientLibraryVersion, - ClientFeatureVersion: response.ClientFeatureVersion, - ClientImpl: response.ClientImpl, - StickyTaskListScheduleToStartTimeout: response.StickyTaskListScheduleToStartTimeout, - CurrentBranchToken: response.CurrentBranchToken, - VersionHistories: response.VersionHistories, - WorkflowState: response.WorkflowState, - WorkflowCloseState: response.WorkflowCloseState, - }, nil -} - -func (e *historyEngineImpl) updateEntityNotExistsErrorOnPassiveCluster(err error, domainID string) error { - switch err.(type) { - case *types.EntityNotExistsError: - domainEntry, domainCacheErr := e.shard.GetDomainCache().GetDomainByID(domainID) - if domainCacheErr != nil { - return err // if could not access domain cache simply return original error - } - - if _, domainNotActiveErr := domainEntry.IsActiveIn(e.clusterMetadata.GetCurrentClusterName()); domainNotActiveErr != nil { - domainNotActiveErrCasted := domainNotActiveErr.(*types.DomainNotActiveError) - return &types.EntityNotExistsError{ - Message: "Workflow execution not found in non-active cluster", - ActiveCluster: domainNotActiveErrCasted.GetActiveCluster(), - CurrentCluster: domainNotActiveErrCasted.GetCurrentCluster(), - } - } - } - return err -} - -func (e *historyEngineImpl) getMutableStateOrPolling( - ctx context.Context, - request *types.GetMutableStateRequest, -) (*types.GetMutableStateResponse, error) { - - if err := common.ValidateDomainUUID(request.DomainUUID); err != nil { - return nil, err - } - domainID := 
request.DomainUUID - execution := types.WorkflowExecution{ - WorkflowID: request.Execution.WorkflowID, - RunID: request.Execution.RunID, - } - response, err := e.getMutableState(ctx, domainID, execution) - if err != nil { - return nil, err - } - if request.CurrentBranchToken == nil { - request.CurrentBranchToken = response.CurrentBranchToken - } - if !bytes.Equal(request.CurrentBranchToken, response.CurrentBranchToken) { - return nil, &types.CurrentBranchChangedError{ - Message: "current branch token and request branch token doesn't match", - CurrentBranchToken: response.CurrentBranchToken} - } - // set the run id in case query the current running workflow - execution.RunID = response.Execution.RunID - - // expectedNextEventID is 0 when caller want to get the current next event ID without blocking - expectedNextEventID := common.FirstEventID - if request.ExpectedNextEventID != 0 { - expectedNextEventID = request.GetExpectedNextEventID() - } - - // if caller decide to long poll on workflow execution - // and the event ID we are looking for is smaller than current next event ID - if expectedNextEventID >= response.GetNextEventID() && response.GetIsWorkflowRunning() { - subscriberID, channel, err := e.historyEventNotifier.WatchHistoryEvent(definition.NewWorkflowIdentifier(domainID, execution.GetWorkflowID(), execution.GetRunID())) - if err != nil { - return nil, err - } - defer e.historyEventNotifier.UnwatchHistoryEvent(definition.NewWorkflowIdentifier(domainID, execution.GetWorkflowID(), execution.GetRunID()), subscriberID) //nolint:errcheck - // check again in case the next event ID is updated - response, err = e.getMutableState(ctx, domainID, execution) - if err != nil { - return nil, err - } - // check again if the current branch token changed - if !bytes.Equal(request.CurrentBranchToken, response.CurrentBranchToken) { - return nil, &types.CurrentBranchChangedError{ - Message: "current branch token and request branch token doesn't match", - CurrentBranchToken: response.CurrentBranchToken} - } - if expectedNextEventID < response.GetNextEventID() || !response.GetIsWorkflowRunning() { - return response, nil - } - - domainName, err := e.shard.GetDomainCache().GetDomainName(domainID) - if err != nil { - return nil, err - } - - expirationInterval := e.shard.GetConfig().LongPollExpirationInterval(domainName) - if deadline, ok := ctx.Deadline(); ok { - remainingTime := deadline.Sub(e.shard.GetTimeSource().Now()) - // Here we return a safeguard error, to ensure that older clients are not stuck in long poll loop until context fully expires. - // Otherwise it results in multiple additional requests being made that returns empty responses. - // Newer clients will not make request with too small timeout remaining. - if remainingTime < longPollCompletionBuffer { - return nil, context.DeadlineExceeded - } - // longPollCompletionBuffer is here to leave some room to finish current request without its timeout. 
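			// Worked example with illustrative values: with a LongPollExpirationInterval of 20s,
			// a caller deadline 10s away, and the 50ms longPollCompletionBuffer, the wait below is
			// capped at min(20s, 10s - 50ms) = 9.95s; had fewer than 50ms remained, the check above
			// would already have returned context.DeadlineExceeded.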
- expirationInterval = common.MinDuration( - expirationInterval, - remainingTime-longPollCompletionBuffer, - ) - } - if expirationInterval <= 0 { - return response, nil - } - timer := time.NewTimer(expirationInterval) - defer timer.Stop() - for { - select { - case event := <-channel: - response.LastFirstEventID = event.LastFirstEventID - response.NextEventID = event.NextEventID - response.IsWorkflowRunning = event.WorkflowCloseState == persistence.WorkflowCloseStatusNone - response.PreviousStartedEventID = common.Int64Ptr(event.PreviousStartedEventID) - response.WorkflowState = common.Int32Ptr(int32(event.WorkflowState)) - response.WorkflowCloseState = common.Int32Ptr(int32(event.WorkflowCloseState)) - if !bytes.Equal(request.CurrentBranchToken, event.CurrentBranchToken) { - return nil, &types.CurrentBranchChangedError{ - Message: "Current branch token and request branch token doesn't match", - CurrentBranchToken: event.CurrentBranchToken} - } - if expectedNextEventID < response.GetNextEventID() || !response.GetIsWorkflowRunning() { - return response, nil - } - case <-timer.C: - return response, nil - } - } - } - - return response, nil -} - -func (e *historyEngineImpl) QueryWorkflow( - ctx context.Context, - request *types.HistoryQueryWorkflowRequest, -) (retResp *types.HistoryQueryWorkflowResponse, retErr error) { - - scope := e.metricsClient.Scope(metrics.HistoryQueryWorkflowScope).Tagged(metrics.DomainTag(request.GetRequest().GetDomain())) - shardMetricScope := e.metricsClient.Scope(metrics.HistoryQueryWorkflowScope, metrics.ShardIDTag(e.shard.GetShardID())) - - consistentQueryEnabled := e.config.EnableConsistentQuery() && e.config.EnableConsistentQueryByDomain(request.GetRequest().GetDomain()) - if request.GetRequest().GetQueryConsistencyLevel() == types.QueryConsistencyLevelStrong { - if !consistentQueryEnabled { - return nil, workflow.ErrConsistentQueryNotEnabled - } - shardMetricScope.IncCounter(metrics.ConsistentQueryPerShard) - e.logger.SampleInfo("History QueryWorkflow called with QueryConsistencyLevelStrong", e.config.SampleLoggingRate(), tag.ShardID(e.shard.GetShardID()), tag.WorkflowID(request.GetRequest().Execution.WorkflowID), tag.WorkflowDomainName(request.GetRequest().Domain)) - } - - execution := *request.GetRequest().GetExecution() - - mutableStateResp, err := e.getMutableState(ctx, request.GetDomainUUID(), execution) - if err != nil { - return nil, err - } - req := request.GetRequest() - if !mutableStateResp.GetIsWorkflowRunning() && req.QueryRejectCondition != nil { - notOpenReject := req.GetQueryRejectCondition() == types.QueryRejectConditionNotOpen - closeStatus := mutableStateResp.GetWorkflowCloseState() - notCompletedCleanlyReject := req.GetQueryRejectCondition() == types.QueryRejectConditionNotCompletedCleanly && closeStatus != persistence.WorkflowCloseStatusCompleted - if notOpenReject || notCompletedCleanlyReject { - return &types.HistoryQueryWorkflowResponse{ - Response: &types.QueryWorkflowResponse{ - QueryRejected: &types.QueryRejected{ - CloseStatus: persistence.ToInternalWorkflowExecutionCloseStatus(int(closeStatus)), - }, - }, - }, nil - } - } - - // query cannot be processed unless at least one decision task has finished - // if first decision task has not finished wait for up to a second for it to complete - queryFirstDecisionTaskWaitTime := defaultQueryFirstDecisionTaskWaitTime - ctxDeadline, ok := ctx.Deadline() - if ok { - ctxWaitTime := time.Until(ctxDeadline) - time.Second - if ctxWaitTime > queryFirstDecisionTaskWaitTime { - 
queryFirstDecisionTaskWaitTime = ctxWaitTime - } - } - deadline := time.Now().Add(queryFirstDecisionTaskWaitTime) - for mutableStateResp.GetPreviousStartedEventID() <= 0 && time.Now().Before(deadline) { - <-time.After(queryFirstDecisionTaskCheckInterval) - mutableStateResp, err = e.getMutableState(ctx, request.GetDomainUUID(), execution) - if err != nil { - return nil, err - } - } - - if mutableStateResp.GetPreviousStartedEventID() <= 0 { - scope.IncCounter(metrics.QueryBeforeFirstDecisionCount) - return nil, workflow.ErrQueryWorkflowBeforeFirstDecision - } - - de, err := e.shard.GetDomainCache().GetDomainByID(request.GetDomainUUID()) - if err != nil { - return nil, err - } - - wfContext, release, err := e.executionCache.GetOrCreateWorkflowExecution(ctx, request.GetDomainUUID(), execution) - if err != nil { - return nil, err - } - defer func() { release(retErr) }() - mutableState, err := wfContext.LoadWorkflowExecution(ctx) - if err != nil { - return nil, err - } - // If history is corrupted, query will be rejected - if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { - return nil, err - } else if corrupted { - return nil, &types.EntityNotExistsError{Message: "Workflow execution corrupted."} - } - - // There are two ways in which queries get dispatched to decider. First, queries can be dispatched on decision tasks. - // These decision tasks potentially contain new events and queries. The events are treated as coming before the query in time. - // The second way in which queries are dispatched to decider is directly through matching; in this approach queries can be - // dispatched to decider immediately even if there are outstanding events that came before the query. The following logic - // is used to determine if a query can be safely dispatched directly through matching or if given the desired consistency - // level must be dispatched on a decision task. There are four cases in which a query can be dispatched directly through - // matching safely, without violating the desired consistency level: - // 1. the domain is not active, in this case history is immutable so a query dispatched at any time is consistent - // 2. the workflow is not running, whenever a workflow is not running dispatching query directly is consistent - // 3. the client requested eventual consistency, in this case there are no consistency requirements so dispatching directly through matching is safe - // 4. if there is no pending or started decision it means no events came before query arrived, so its safe to dispatch directly - isActive, _ := de.IsActiveIn(e.clusterMetadata.GetCurrentClusterName()) - safeToDispatchDirectly := !isActive || - !mutableState.IsWorkflowExecutionRunning() || - req.GetQueryConsistencyLevel() == types.QueryConsistencyLevelEventual || - (!mutableState.HasPendingDecision() && !mutableState.HasInFlightDecision()) - if safeToDispatchDirectly { - release(nil) - msResp, err := e.getMutableState(ctx, request.GetDomainUUID(), execution) - if err != nil { - return nil, err - } - req.Execution.RunID = msResp.Execution.RunID - return e.queryDirectlyThroughMatching(ctx, msResp, request.GetDomainUUID(), req, scope) - } - - // If we get here it means query could not be dispatched through matching directly, so it must block - // until either an result has been obtained on a decision task response or until it is safe to dispatch directly through matching. 
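	// The buffered query below resolves in one of three ways: TerminationTypeCompleted carries
	// the decider's answer (or surfaces a QueryFailedError), TerminationTypeUnblocked means it is
	// now safe to retry the query directly through matching, and TerminationTypeFailed propagates
	// the recorded failure; a context timeout while waiting is counted as a consistent-query timeout.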
- sw := scope.StartTimer(metrics.DecisionTaskQueryLatency) - defer sw.Stop() - queryReg := mutableState.GetQueryRegistry() - if len(queryReg.GetBufferedIDs()) >= e.config.MaxBufferedQueryCount() { - scope.IncCounter(metrics.QueryBufferExceededCount) - return nil, workflow.ErrConsistentQueryBufferExceeded - } - queryID, termCh := queryReg.BufferQuery(req.GetQuery()) - defer queryReg.RemoveQuery(queryID) - release(nil) - select { - case <-termCh: - state, err := queryReg.GetTerminationState(queryID) - if err != nil { - scope.IncCounter(metrics.QueryRegistryInvalidStateCount) - return nil, err - } - switch state.TerminationType { - case query.TerminationTypeCompleted: - result := state.QueryResult - switch result.GetResultType() { - case types.QueryResultTypeAnswered: - return &types.HistoryQueryWorkflowResponse{ - Response: &types.QueryWorkflowResponse{ - QueryResult: result.GetAnswer(), - }, - }, nil - case types.QueryResultTypeFailed: - return nil, &types.QueryFailedError{Message: result.GetErrorMessage()} - default: - scope.IncCounter(metrics.QueryRegistryInvalidStateCount) - return nil, workflow.ErrQueryEnteredInvalidState - } - case query.TerminationTypeUnblocked: - msResp, err := e.getMutableState(ctx, request.GetDomainUUID(), execution) - if err != nil { - return nil, err - } - req.Execution.RunID = msResp.Execution.RunID - return e.queryDirectlyThroughMatching(ctx, msResp, request.GetDomainUUID(), req, scope) - case query.TerminationTypeFailed: - return nil, state.Failure - default: - scope.IncCounter(metrics.QueryRegistryInvalidStateCount) - return nil, workflow.ErrQueryEnteredInvalidState - } - case <-ctx.Done(): - scope.IncCounter(metrics.ConsistentQueryTimeoutCount) - return nil, ctx.Err() - } -} - -func (e *historyEngineImpl) queryDirectlyThroughMatching( - ctx context.Context, - msResp *types.GetMutableStateResponse, - domainID string, - queryRequest *types.QueryWorkflowRequest, - scope metrics.Scope, -) (*types.HistoryQueryWorkflowResponse, error) { - - sw := scope.StartTimer(metrics.DirectQueryDispatchLatency) - defer sw.Stop() - - // Sticky task list is not very useful in the standby cluster because the decider cache is - // not updated by dispatching tasks to it (it is only updated in the case of query). - // Additionally on the standby side we are not even able to clear sticky. - // Stickiness might be outdated if the customer did a restart of their nodes causing a query - // dispatched on the standby side on sticky to hang. We decided it made sense to simply not attempt - // query on sticky task list at all on the passive side. 
- de, err := e.shard.GetDomainCache().GetDomainByID(domainID) - if err != nil { - return nil, err - } - supportsStickyQuery := e.clientChecker.SupportsStickyQuery(msResp.GetClientImpl(), msResp.GetClientFeatureVersion()) == nil - domainIsActive, _ := de.IsActiveIn(e.clusterMetadata.GetCurrentClusterName()) - if msResp.GetIsStickyTaskListEnabled() && - len(msResp.GetStickyTaskList().GetName()) != 0 && - supportsStickyQuery && - e.config.EnableStickyQuery(queryRequest.GetDomain()) && - domainIsActive { - - stickyMatchingRequest := &types.MatchingQueryWorkflowRequest{ - DomainUUID: domainID, - QueryRequest: queryRequest, - TaskList: msResp.GetStickyTaskList(), - } - - // using a clean new context in case customer provide a context which has - // a really short deadline, causing we clear the stickiness - stickyContext, cancel := context.WithTimeout(context.Background(), time.Duration(msResp.GetStickyTaskListScheduleToStartTimeout())*time.Second) - stickyStopWatch := scope.StartTimer(metrics.DirectQueryDispatchStickyLatency) - matchingResp, err := e.rawMatchingClient.QueryWorkflow(stickyContext, stickyMatchingRequest) - stickyStopWatch.Stop() - cancel() - if err == nil { - scope.IncCounter(metrics.DirectQueryDispatchStickySuccessCount) - return &types.HistoryQueryWorkflowResponse{Response: matchingResp}, nil - } - switch v := err.(type) { - case *types.StickyWorkerUnavailableError: - case *yarpcerrors.Status: - if v.Code() != yarpcerrors.CodeDeadlineExceeded { - e.logger.Error("query directly though matching on sticky failed, will not attempt query on non-sticky", - tag.WorkflowDomainName(queryRequest.GetDomain()), - tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), - tag.WorkflowRunID(queryRequest.Execution.GetRunID()), - tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), - tag.Error(err)) - return nil, err - } - default: - e.logger.Error("query directly though matching on sticky failed, will not attempt query on non-sticky", - tag.WorkflowDomainName(queryRequest.GetDomain()), - tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), - tag.WorkflowRunID(queryRequest.Execution.GetRunID()), - tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), - tag.Error(err)) - return nil, err - } - if msResp.GetIsWorkflowRunning() { - e.logger.Info("query direct through matching failed on sticky, clearing sticky before attempting on non-sticky", - tag.WorkflowDomainName(queryRequest.GetDomain()), - tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), - tag.WorkflowRunID(queryRequest.Execution.GetRunID()), - tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), - tag.Error(err)) - resetContext, cancel := context.WithTimeout(context.Background(), 5*time.Second) - clearStickinessStopWatch := scope.StartTimer(metrics.DirectQueryDispatchClearStickinessLatency) - _, err := e.ResetStickyTaskList(resetContext, &types.HistoryResetStickyTaskListRequest{ - DomainUUID: domainID, - Execution: queryRequest.GetExecution(), - }) - clearStickinessStopWatch.Stop() - cancel() - if err != nil && err != workflow.ErrAlreadyCompleted && err != workflow.ErrNotExists { - return nil, err - } - scope.IncCounter(metrics.DirectQueryDispatchClearStickinessSuccessCount) - } - } - - if err := common.IsValidContext(ctx); err != nil { - e.logger.Info("query context timed out before query on non-sticky task list could be attempted", - tag.WorkflowDomainName(queryRequest.GetDomain()), - tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), - tag.WorkflowRunID(queryRequest.Execution.GetRunID()), - 
tag.WorkflowQueryType(queryRequest.Query.GetQueryType())) - scope.IncCounter(metrics.DirectQueryDispatchTimeoutBeforeNonStickyCount) - return nil, err - } - - e.logger.Debug("query directly through matching on sticky timed out, attempting to query on non-sticky", - tag.WorkflowDomainName(queryRequest.GetDomain()), - tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), - tag.WorkflowRunID(queryRequest.Execution.GetRunID()), - tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), - tag.WorkflowTaskListName(msResp.GetStickyTaskList().GetName()), - tag.WorkflowNextEventID(msResp.GetNextEventID())) - - nonStickyMatchingRequest := &types.MatchingQueryWorkflowRequest{ - DomainUUID: domainID, - QueryRequest: queryRequest, - TaskList: msResp.TaskList, - } - - nonStickyStopWatch := scope.StartTimer(metrics.DirectQueryDispatchNonStickyLatency) - matchingResp, err := e.matchingClient.QueryWorkflow(ctx, nonStickyMatchingRequest) - nonStickyStopWatch.Stop() - if err != nil { - e.logger.Error("query directly though matching on non-sticky failed", - tag.WorkflowDomainName(queryRequest.GetDomain()), - tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), - tag.WorkflowRunID(queryRequest.Execution.GetRunID()), - tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), - tag.Error(err)) - return nil, err - } - scope.IncCounter(metrics.DirectQueryDispatchNonStickySuccessCount) - return &types.HistoryQueryWorkflowResponse{Response: matchingResp}, err -} - -func (e *historyEngineImpl) getMutableState( - ctx context.Context, - domainID string, - execution types.WorkflowExecution, -) (retResp *types.GetMutableStateResponse, retError error) { - - wfContext, release, retError := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, execution) - if retError != nil { - return - } - defer func() { release(retError) }() - - mutableState, retError := wfContext.LoadWorkflowExecution(ctx) - if retError != nil { - return - } - - currentBranchToken, err := mutableState.GetCurrentBranchToken() - if err != nil { - return nil, err - } - - executionInfo := mutableState.GetExecutionInfo() - execution.RunID = wfContext.GetExecution().RunID - workflowState, workflowCloseState := mutableState.GetWorkflowStateCloseStatus() - retResp = &types.GetMutableStateResponse{ - Execution: &execution, - WorkflowType: &types.WorkflowType{Name: executionInfo.WorkflowTypeName}, - LastFirstEventID: mutableState.GetLastFirstEventID(), - NextEventID: mutableState.GetNextEventID(), - PreviousStartedEventID: common.Int64Ptr(mutableState.GetPreviousStartedEventID()), - TaskList: &types.TaskList{Name: executionInfo.TaskList}, - StickyTaskList: &types.TaskList{Name: executionInfo.StickyTaskList, Kind: types.TaskListKindSticky.Ptr()}, - ClientLibraryVersion: executionInfo.ClientLibraryVersion, - ClientFeatureVersion: executionInfo.ClientFeatureVersion, - ClientImpl: executionInfo.ClientImpl, - IsWorkflowRunning: mutableState.IsWorkflowExecutionRunning(), - StickyTaskListScheduleToStartTimeout: common.Int32Ptr(executionInfo.StickyScheduleToStartTimeout), - CurrentBranchToken: currentBranchToken, - WorkflowState: common.Int32Ptr(int32(workflowState)), - WorkflowCloseState: common.Int32Ptr(int32(workflowCloseState)), - IsStickyTaskListEnabled: mutableState.IsStickyTaskListEnabled(), - HistorySize: mutableState.GetHistorySize(), - } - versionHistories := mutableState.GetVersionHistories() - if versionHistories != nil { - retResp.VersionHistories = versionHistories.ToInternalType() - } - return -} - -func (e *historyEngineImpl) 
DescribeMutableState( - ctx context.Context, - request *types.DescribeMutableStateRequest, -) (response *types.DescribeMutableStateResponse, retError error) { - - if err := common.ValidateDomainUUID(request.DomainUUID); err != nil { - return nil, err - } - - domainID := request.DomainUUID - execution := types.WorkflowExecution{ - WorkflowID: request.Execution.WorkflowID, - RunID: request.Execution.RunID, - } - - cacheCtx, dbCtx, release, cacheHit, err := e.executionCache.GetAndCreateWorkflowExecution( - ctx, domainID, execution, - ) - if err != nil { - return nil, err - } - defer func() { release(retError) }() - - response = &types.DescribeMutableStateResponse{} - - if cacheHit { - if msb := cacheCtx.GetWorkflowExecution(); msb != nil { - response.MutableStateInCache, err = e.toMutableStateJSON(msb) - if err != nil { - return nil, err - } - } - } - - msb, err := dbCtx.LoadWorkflowExecution(ctx) - if err != nil { - return nil, err - } - response.MutableStateInDatabase, err = e.toMutableStateJSON(msb) - if err != nil { - return nil, err - } - - return response, nil -} - -func (e *historyEngineImpl) toMutableStateJSON(msb execution.MutableState) (string, error) { - ms := msb.CopyToPersistence() - - jsonBytes, err := json.Marshal(ms) - if err != nil { - return "", err - } - return string(jsonBytes), nil -} - -// ResetStickyTaskList reset the volatile information in mutable state of a given types. -// Volatile information are the information related to client, such as: -// 1. StickyTaskList -// 2. StickyScheduleToStartTimeout -// 3. ClientLibraryVersion -// 4. ClientFeatureVersion -// 5. ClientImpl -func (e *historyEngineImpl) ResetStickyTaskList( - ctx context.Context, - resetRequest *types.HistoryResetStickyTaskListRequest, -) (*types.HistoryResetStickyTaskListResponse, error) { - - if err := common.ValidateDomainUUID(resetRequest.DomainUUID); err != nil { - return nil, err - } - domainID := resetRequest.DomainUUID - - err := workflow.UpdateWithAction(ctx, e.executionCache, domainID, *resetRequest.Execution, false, e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) error { - if !mutableState.IsWorkflowExecutionRunning() { - return workflow.ErrAlreadyCompleted - } - mutableState.ClearStickyness() - return nil - }, - ) - - if err != nil { - return nil, err - } - return &types.HistoryResetStickyTaskListResponse{}, nil -} - -// DescribeWorkflowExecution returns information about the specified workflow execution. 
-func (e *historyEngineImpl) DescribeWorkflowExecution( - ctx context.Context, - request *types.HistoryDescribeWorkflowExecutionRequest, -) (retResp *types.DescribeWorkflowExecutionResponse, retError error) { - - if err := common.ValidateDomainUUID(request.DomainUUID); err != nil { - return nil, err - } - - domainID := request.DomainUUID - wfExecution := *request.Request.Execution - - wfContext, release, err0 := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, wfExecution) - if err0 != nil { - return nil, err0 - } - defer func() { release(retError) }() - - mutableState, err1 := wfContext.LoadWorkflowExecution(ctx) - if err1 != nil { - return nil, err1 - } - // If history is corrupted, return an error to the end user - if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { - return nil, err - } else if corrupted { - return nil, &types.EntityNotExistsError{Message: "Workflow execution corrupted."} - } - - executionInfo := mutableState.GetExecutionInfo() - - result := &types.DescribeWorkflowExecutionResponse{ - ExecutionConfiguration: &types.WorkflowExecutionConfiguration{ - TaskList: &types.TaskList{Name: executionInfo.TaskList}, - ExecutionStartToCloseTimeoutSeconds: common.Int32Ptr(executionInfo.WorkflowTimeout), - TaskStartToCloseTimeoutSeconds: common.Int32Ptr(executionInfo.DecisionStartToCloseTimeout), - }, - WorkflowExecutionInfo: &types.WorkflowExecutionInfo{ - Execution: &types.WorkflowExecution{ - WorkflowID: executionInfo.WorkflowID, - RunID: executionInfo.RunID, - }, - Type: &types.WorkflowType{Name: executionInfo.WorkflowTypeName}, - StartTime: common.Int64Ptr(executionInfo.StartTimestamp.UnixNano()), - HistoryLength: mutableState.GetNextEventID() - common.FirstEventID, - AutoResetPoints: executionInfo.AutoResetPoints, - Memo: &types.Memo{Fields: executionInfo.Memo}, - IsCron: len(executionInfo.CronSchedule) > 0, - UpdateTime: common.Int64Ptr(executionInfo.LastUpdatedTimestamp.UnixNano()), - SearchAttributes: &types.SearchAttributes{IndexedFields: executionInfo.SearchAttributes}, - PartitionConfig: executionInfo.PartitionConfig, - }, - } - - // TODO: we need to consider adding execution time to mutable state - // For now execution time will be calculated based on start time and cron schedule/retry policy - // each time DescribeWorkflowExecution is called. 
- startEvent, err := mutableState.GetStartEvent(ctx) - if err != nil { - return nil, err - } - backoffDuration := time.Duration(startEvent.GetWorkflowExecutionStartedEventAttributes().GetFirstDecisionTaskBackoffSeconds()) * time.Second - result.WorkflowExecutionInfo.ExecutionTime = common.Int64Ptr(result.WorkflowExecutionInfo.GetStartTime() + backoffDuration.Nanoseconds()) - - if executionInfo.ParentRunID != "" { - result.WorkflowExecutionInfo.ParentExecution = &types.WorkflowExecution{ - WorkflowID: executionInfo.ParentWorkflowID, - RunID: executionInfo.ParentRunID, - } - result.WorkflowExecutionInfo.ParentDomainID = common.StringPtr(executionInfo.ParentDomainID) - result.WorkflowExecutionInfo.ParentInitiatedID = common.Int64Ptr(executionInfo.InitiatedID) - parentDomain, err := e.shard.GetDomainCache().GetDomainName(executionInfo.ParentDomainID) - if err != nil { - return nil, err - } - result.WorkflowExecutionInfo.ParentDomain = common.StringPtr(parentDomain) - } - if executionInfo.State == persistence.WorkflowStateCompleted { - // for closed workflow - result.WorkflowExecutionInfo.CloseStatus = persistence.ToInternalWorkflowExecutionCloseStatus(executionInfo.CloseStatus) - completionEvent, err := mutableState.GetCompletionEvent(ctx) - if err != nil { - return nil, err - } - result.WorkflowExecutionInfo.CloseTime = common.Int64Ptr(completionEvent.GetTimestamp()) - } - - if len(mutableState.GetPendingActivityInfos()) > 0 { - for _, ai := range mutableState.GetPendingActivityInfos() { - p := &types.PendingActivityInfo{ - ActivityID: ai.ActivityID, - } - state := types.PendingActivityStateScheduled - if ai.CancelRequested { - state = types.PendingActivityStateCancelRequested - } else if ai.StartedID != common.EmptyEventID { - state = types.PendingActivityStateStarted - } - p.State = &state - lastHeartbeatUnixNano := ai.LastHeartBeatUpdatedTime.UnixNano() - if lastHeartbeatUnixNano > 0 { - p.LastHeartbeatTimestamp = common.Int64Ptr(lastHeartbeatUnixNano) - p.HeartbeatDetails = ai.Details - } - // TODO: move to mutable state instead of loading it from event - scheduledEvent, err := mutableState.GetActivityScheduledEvent(ctx, ai.ScheduleID) - if err != nil { - return nil, err - } - p.ActivityType = scheduledEvent.ActivityTaskScheduledEventAttributes.ActivityType - if state == types.PendingActivityStateScheduled { - p.ScheduledTimestamp = common.Int64Ptr(ai.ScheduledTime.UnixNano()) - } else { - p.LastStartedTimestamp = common.Int64Ptr(ai.StartedTime.UnixNano()) - } - if ai.HasRetryPolicy { - p.Attempt = ai.Attempt - p.ExpirationTimestamp = common.Int64Ptr(ai.ExpirationTime.UnixNano()) - if ai.MaximumAttempts != 0 { - p.MaximumAttempts = ai.MaximumAttempts - } - if ai.LastFailureReason != "" { - p.LastFailureReason = common.StringPtr(ai.LastFailureReason) - p.LastFailureDetails = ai.LastFailureDetails - } - if ai.LastWorkerIdentity != "" { - p.LastWorkerIdentity = ai.LastWorkerIdentity - } - if ai.StartedIdentity != "" { - p.StartedWorkerIdentity = ai.StartedIdentity - } - } - result.PendingActivities = append(result.PendingActivities, p) - } - } - - if len(mutableState.GetPendingChildExecutionInfos()) > 0 { - for _, ch := range mutableState.GetPendingChildExecutionInfos() { - childDomainName, err := execution.GetChildExecutionDomainName( - ch, - e.shard.GetDomainCache(), - mutableState.GetDomainEntry(), - ) - if err != nil { - if !common.IsEntityNotExistsError(err) { - return nil, err - } - // child domain already deleted, instead of failing the request, - // return domainID instead since this 
field is only for information purpose - childDomainName = ch.DomainID - } - p := &types.PendingChildExecutionInfo{ - Domain: childDomainName, - WorkflowID: ch.StartedWorkflowID, - RunID: ch.StartedRunID, - WorkflowTypeName: ch.WorkflowTypeName, - InitiatedID: ch.InitiatedID, - ParentClosePolicy: &ch.ParentClosePolicy, - } - result.PendingChildren = append(result.PendingChildren, p) - } - } - - if di, ok := mutableState.GetPendingDecision(); ok { - pendingDecision := &types.PendingDecisionInfo{ - State: types.PendingDecisionStateScheduled.Ptr(), - ScheduledTimestamp: common.Int64Ptr(di.ScheduledTimestamp), - Attempt: di.Attempt, - OriginalScheduledTimestamp: common.Int64Ptr(di.OriginalScheduledTimestamp), - } - if di.StartedID != common.EmptyEventID { - pendingDecision.State = types.PendingDecisionStateStarted.Ptr() - pendingDecision.StartedTimestamp = common.Int64Ptr(di.StartedTimestamp) - } - result.PendingDecision = pendingDecision - } - - return result, nil -} - -func (e *historyEngineImpl) RecordActivityTaskStarted( - ctx context.Context, - request *types.RecordActivityTaskStartedRequest, -) (*types.RecordActivityTaskStartedResponse, error) { - - domainEntry, err := e.getActiveDomainByID(request.DomainUUID) - if err != nil { - return nil, err - } - - domainInfo := domainEntry.GetInfo() - - domainID := domainInfo.ID - domainName := domainInfo.Name - - workflowExecution := types.WorkflowExecution{ - WorkflowID: request.WorkflowExecution.WorkflowID, - RunID: request.WorkflowExecution.RunID, - } - - var resurrectError error - response := &types.RecordActivityTaskStartedResponse{} - err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, false, e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) error { - if !mutableState.IsWorkflowExecutionRunning() { - return workflow.ErrNotExists - } - - scheduleID := request.GetScheduleID() - requestID := request.GetRequestID() - ai, isRunning := mutableState.GetActivityInfo(scheduleID) - - // RecordActivityTaskStarted is already past scheduleToClose timeout. - // If at this point pending activity is still in mutable state it may be resurrected. - // Otherwise it would be completed or timed out already. - if isRunning && e.timeSource.Now().After(ai.ScheduledTime.Add(time.Duration(ai.ScheduleToCloseTimeout)*time.Second)) { - resurrectedActivities, err := execution.GetResurrectedActivities(ctx, e.shard, mutableState) - if err != nil { - e.logger.Error("Activity resurrection check failed", tag.Error(err)) - return err - } - - if _, ok := resurrectedActivities[scheduleID]; ok { - // found activity resurrection - domainName := mutableState.GetDomainEntry().GetInfo().Name - e.metricsClient.IncCounter(metrics.HistoryRecordActivityTaskStartedScope, metrics.ActivityResurrectionCounter) - e.logger.Error("Encounter resurrected activity, skip", - tag.WorkflowDomainName(domainName), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowScheduleID(scheduleID), - ) - - // remove resurrected activity from mutable state - if err := mutableState.DeleteActivity(scheduleID); err != nil { - return err - } - - // save resurrection error but return nil here, so that mutable state would get updated in DB - resurrectError = workflow.ErrActivityTaskNotFound - return nil - } - } - - // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in - // some extreme cassandra failure cases. 
-			if !isRunning && scheduleID >= mutableState.GetNextEventID() {
-				e.metricsClient.IncCounter(metrics.HistoryRecordActivityTaskStartedScope, metrics.StaleMutableStateCounter)
-				e.logger.Error("Encounter stale mutable state in RecordActivityTaskStarted",
-					tag.WorkflowDomainName(domainName),
-					tag.WorkflowID(workflowExecution.GetWorkflowID()),
-					tag.WorkflowRunID(workflowExecution.GetRunID()),
-					tag.WorkflowScheduleID(scheduleID),
-					tag.WorkflowNextEventID(mutableState.GetNextEventID()),
-				)
-				return workflow.ErrStaleState
-			}
-
-			// Check execution state to make sure task is in the list of outstanding tasks and it is not yet started. If
-			// task is not outstanding than it is most probably a duplicate and complete the task.
-			if !isRunning {
-				// Looks like ActivityTask already completed as a result of another call.
-				// It is OK to drop the task at this point.
-				e.logger.Debug("Potentially duplicate task.", tag.TaskID(request.GetTaskID()), tag.WorkflowScheduleID(scheduleID), tag.TaskType(persistence.TransferTaskTypeActivityTask))
-				return workflow.ErrActivityTaskNotFound
-			}
-
-			scheduledEvent, err := mutableState.GetActivityScheduledEvent(ctx, scheduleID)
-			if err != nil {
-				return err
-			}
-			response.ScheduledEvent = scheduledEvent
-			response.ScheduledTimestampOfThisAttempt = common.Int64Ptr(ai.ScheduledTime.UnixNano())
-
-			response.Attempt = int64(ai.Attempt)
-			response.HeartbeatDetails = ai.Details
-
-			response.WorkflowType = mutableState.GetWorkflowType()
-			response.WorkflowDomain = domainName
-
-			if ai.StartedID != common.EmptyEventID {
-				// If activity is started as part of the current request scope then return a positive response
-				if ai.RequestID == requestID {
-					response.StartedTimestamp = common.Int64Ptr(ai.StartedTime.UnixNano())
-					return nil
-				}
-
-				// Looks like ActivityTask already started as a result of another call.
-				// It is OK to drop the task at this point.
- e.logger.Debug("Potentially duplicate task.", tag.TaskID(request.GetTaskID()), tag.WorkflowScheduleID(scheduleID), tag.TaskType(persistence.TransferTaskTypeActivityTask)) - return &types.EventAlreadyStartedError{Message: "Activity task already started."} - } - - if _, err := mutableState.AddActivityTaskStartedEvent( - ai, scheduleID, requestID, request.PollRequest.GetIdentity(), - ); err != nil { - return err - } - - response.StartedTimestamp = common.Int64Ptr(ai.StartedTime.UnixNano()) - - return nil - }) - - if err != nil { - return nil, err - } - if resurrectError != nil { - return nil, resurrectError - } - - return response, err -} - -// ScheduleDecisionTask schedules a decision if no outstanding decision found -func (e *historyEngineImpl) ScheduleDecisionTask( - ctx context.Context, - req *types.ScheduleDecisionTaskRequest, -) error { - return e.decisionHandler.HandleDecisionTaskScheduled(ctx, req) -} - -// RecordDecisionTaskStarted starts a decision -func (e *historyEngineImpl) RecordDecisionTaskStarted( - ctx context.Context, - request *types.RecordDecisionTaskStartedRequest, -) (*types.RecordDecisionTaskStartedResponse, error) { - return e.decisionHandler.HandleDecisionTaskStarted(ctx, request) -} - -// RespondDecisionTaskCompleted completes a decision task -func (e *historyEngineImpl) RespondDecisionTaskCompleted( - ctx context.Context, - req *types.HistoryRespondDecisionTaskCompletedRequest, -) (*types.HistoryRespondDecisionTaskCompletedResponse, error) { - return e.decisionHandler.HandleDecisionTaskCompleted(ctx, req) -} - -// RespondDecisionTaskFailed fails a decision -func (e *historyEngineImpl) RespondDecisionTaskFailed( - ctx context.Context, - req *types.HistoryRespondDecisionTaskFailedRequest, -) error { - return e.decisionHandler.HandleDecisionTaskFailed(ctx, req) -} - -// RespondActivityTaskCompleted completes an activity task. -func (e *historyEngineImpl) RespondActivityTaskCompleted( - ctx context.Context, - req *types.HistoryRespondActivityTaskCompletedRequest, -) error { - - domainEntry, err := e.getActiveDomainByID(req.DomainUUID) - if err != nil { - return err - } - domainID := domainEntry.GetInfo().ID - domainName := domainEntry.GetInfo().Name - - request := req.CompleteRequest - token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) - if err0 != nil { - return workflow.ErrDeserializingToken - } - - workflowExecution := types.WorkflowExecution{ - WorkflowID: token.WorkflowID, - RunID: token.RunID, - } - - var activityStartedTime time.Time - var taskList string - err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, true, e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) error { - if !mutableState.IsWorkflowExecutionRunning() { - return workflow.ErrAlreadyCompleted - } - - scheduleID := token.ScheduleID - if scheduleID == common.EmptyEventID { // client call CompleteActivityById, so get scheduleID by activityID - scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) - if err0 != nil { - return err0 - } - } - ai, isRunning := mutableState.GetActivityInfo(scheduleID) - - // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in - // some extreme cassandra failure cases. 
- if !isRunning && scheduleID >= mutableState.GetNextEventID() { - e.metricsClient.IncCounter(metrics.HistoryRespondActivityTaskCompletedScope, metrics.StaleMutableStateCounter) - e.logger.Error("Encounter stale mutable state in RecordActivityTaskCompleted", - tag.WorkflowDomainName(domainName), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowScheduleID(scheduleID), - tag.WorkflowNextEventID(mutableState.GetNextEventID()), - ) - return workflow.ErrStaleState - } - - if !isRunning || ai.StartedID == common.EmptyEventID || - (token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) { - e.logger.Warn(fmt.Sprintf( - "Encounter non existing activity in RecordActivityTaskCompleted: isRunning: %t, ai: %#v, token: %#v.", - isRunning, ai, token), - tag.WorkflowDomainName(domainName), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowScheduleID(scheduleID), - tag.WorkflowNextEventID(mutableState.GetNextEventID()), - ) - return workflow.ErrActivityTaskNotFound - } - - if _, err := mutableState.AddActivityTaskCompletedEvent(scheduleID, ai.StartedID, request); err != nil { - // Unable to add ActivityTaskCompleted event to history - return &types.InternalServiceError{Message: "Unable to add ActivityTaskCompleted event to history."} - } - activityStartedTime = ai.StartedTime - taskList = ai.TaskList - return nil - }) - if err == nil && !activityStartedTime.IsZero() { - scope := e.metricsClient.Scope(metrics.HistoryRespondActivityTaskCompletedScope). - Tagged( - metrics.DomainTag(domainName), - metrics.WorkflowTypeTag(token.WorkflowType), - metrics.ActivityTypeTag(token.ActivityType), - metrics.TaskListTag(taskList), - ) - scope.RecordTimer(metrics.ActivityE2ELatency, time.Since(activityStartedTime)) - } - return err -} - -// RespondActivityTaskFailed completes an activity task failure. -func (e *historyEngineImpl) RespondActivityTaskFailed( - ctx context.Context, - req *types.HistoryRespondActivityTaskFailedRequest, -) error { - - domainEntry, err := e.getActiveDomainByID(req.DomainUUID) - if err != nil { - return err - } - domainID := domainEntry.GetInfo().ID - domainName := domainEntry.GetInfo().Name - - request := req.FailedRequest - token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) - if err0 != nil { - return workflow.ErrDeserializingToken - } - - workflowExecution := types.WorkflowExecution{ - WorkflowID: token.WorkflowID, - RunID: token.RunID, - } - - var activityStartedTime time.Time - var taskList string - err = workflow.UpdateWithActionFunc( - ctx, - e.executionCache, - domainID, - workflowExecution, - e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { - if !mutableState.IsWorkflowExecutionRunning() { - return nil, workflow.ErrAlreadyCompleted - } - - scheduleID := token.ScheduleID - if scheduleID == common.EmptyEventID { // client call CompleteActivityById, so get scheduleID by activityID - scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) - if err0 != nil { - return nil, err0 - } - } - ai, isRunning := mutableState.GetActivityInfo(scheduleID) - - // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in - // some extreme cassandra failure cases. 
-			if !isRunning && scheduleID >= mutableState.GetNextEventID() {
-				e.metricsClient.IncCounter(metrics.HistoryRespondActivityTaskFailedScope, metrics.StaleMutableStateCounter)
-				e.logger.Error("Encounter stale mutable state in RecordActivityTaskFailed",
-					tag.WorkflowDomainName(domainName),
-					tag.WorkflowID(workflowExecution.GetWorkflowID()),
-					tag.WorkflowRunID(workflowExecution.GetRunID()),
-					tag.WorkflowScheduleID(scheduleID),
-					tag.WorkflowNextEventID(mutableState.GetNextEventID()),
-				)
-				return nil, workflow.ErrStaleState
-			}
-
-			if !isRunning || ai.StartedID == common.EmptyEventID ||
-				(token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) {
-				e.logger.Warn(fmt.Sprintf(
-					"Encounter non existing activity in RecordActivityTaskFailed: isRunning: %t, ai: %#v, token: %#v.",
-					isRunning, ai, token),
-					tag.WorkflowDomainName(domainName),
-					tag.WorkflowID(workflowExecution.GetWorkflowID()),
-					tag.WorkflowRunID(workflowExecution.GetRunID()),
-					tag.WorkflowScheduleID(scheduleID),
-					tag.WorkflowNextEventID(mutableState.GetNextEventID()),
-				)
-				return nil, workflow.ErrActivityTaskNotFound
-			}
-
-			postActions := &workflow.UpdateAction{}
-			ok, err := mutableState.RetryActivity(ai, req.FailedRequest.GetReason(), req.FailedRequest.GetDetails())
-			if err != nil {
-				return nil, err
-			}
-			if !ok {
-				// no more retry, and we want to record the failure event
-				if _, err := mutableState.AddActivityTaskFailedEvent(scheduleID, ai.StartedID, request); err != nil {
-					// Unable to add ActivityTaskFailed event to history
-					return nil, &types.InternalServiceError{Message: "Unable to add ActivityTaskFailed event to history."}
-				}
-				postActions.CreateDecision = true
-			}
-
-			activityStartedTime = ai.StartedTime
-			taskList = ai.TaskList
-			return postActions, nil
-		},
-	)
-	if err == nil && !activityStartedTime.IsZero() {
-		scope := e.metricsClient.Scope(metrics.HistoryRespondActivityTaskFailedScope).
-			Tagged(
-				metrics.DomainTag(domainName),
-				metrics.WorkflowTypeTag(token.WorkflowType),
-				metrics.ActivityTypeTag(token.ActivityType),
-				metrics.TaskListTag(taskList),
-			)
-		scope.RecordTimer(metrics.ActivityE2ELatency, time.Since(activityStartedTime))
-	}
-	return err
-}
-
-// RespondActivityTaskCanceled completes an activity task failure.
-func (e *historyEngineImpl) RespondActivityTaskCanceled( - ctx context.Context, - req *types.HistoryRespondActivityTaskCanceledRequest, -) error { - - domainEntry, err := e.getActiveDomainByID(req.DomainUUID) - if err != nil { - return err - } - domainID := domainEntry.GetInfo().ID - domainName := domainEntry.GetInfo().Name - - request := req.CancelRequest - token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) - if err0 != nil { - return workflow.ErrDeserializingToken - } - - workflowExecution := types.WorkflowExecution{ - WorkflowID: token.WorkflowID, - RunID: token.RunID, - } - - var activityStartedTime time.Time - var taskList string - err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, true, e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) error { - if !mutableState.IsWorkflowExecutionRunning() { - return workflow.ErrAlreadyCompleted - } - - scheduleID := token.ScheduleID - if scheduleID == common.EmptyEventID { // client call CompleteActivityById, so get scheduleID by activityID - scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) - if err0 != nil { - return err0 - } - } - ai, isRunning := mutableState.GetActivityInfo(scheduleID) - - // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in - // some extreme cassandra failure cases. - if !isRunning && scheduleID >= mutableState.GetNextEventID() { - e.metricsClient.IncCounter(metrics.HistoryRespondActivityTaskCanceledScope, metrics.StaleMutableStateCounter) - e.logger.Error("Encounter stale mutable state in RecordActivityTaskCanceled", - tag.WorkflowDomainName(domainName), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowScheduleID(scheduleID), - tag.WorkflowNextEventID(mutableState.GetNextEventID()), - ) - return workflow.ErrStaleState - } - - if !isRunning || ai.StartedID == common.EmptyEventID || - (token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) { - return workflow.ErrActivityTaskNotFound - } - - if _, err := mutableState.AddActivityTaskCanceledEvent( - scheduleID, - ai.StartedID, - ai.CancelRequestID, - request.Details, - request.Identity); err != nil { - // Unable to add ActivityTaskCanceled event to history - return &types.InternalServiceError{Message: "Unable to add ActivityTaskCanceled event to history."} - } - - activityStartedTime = ai.StartedTime - taskList = ai.TaskList - return nil - }) - if err == nil && !activityStartedTime.IsZero() { - scope := e.metricsClient.Scope(metrics.HistoryClientRespondActivityTaskCanceledScope). - Tagged( - metrics.DomainTag(domainName), - metrics.WorkflowTypeTag(token.WorkflowType), - metrics.ActivityTypeTag(token.ActivityType), - metrics.TaskListTag(taskList), - ) - scope.RecordTimer(metrics.ActivityE2ELatency, time.Since(activityStartedTime)) - } - return err -} - -// RecordActivityTaskHeartbeat records an heartbeat for a task. -// This method can be used for two purposes. -// - For reporting liveness of the activity. -// - For reporting progress of the activity, this can be done even if the liveness is not configured. 
-func (e *historyEngineImpl) RecordActivityTaskHeartbeat( - ctx context.Context, - req *types.HistoryRecordActivityTaskHeartbeatRequest, -) (*types.RecordActivityTaskHeartbeatResponse, error) { - - domainEntry, err := e.getActiveDomainByID(req.DomainUUID) - if err != nil { - return nil, err - } - domainID := domainEntry.GetInfo().ID - - request := req.HeartbeatRequest - token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) - if err0 != nil { - return nil, workflow.ErrDeserializingToken - } - - workflowExecution := types.WorkflowExecution{ - WorkflowID: token.WorkflowID, - RunID: token.RunID, - } - - var cancelRequested bool - err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, false, e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) error { - if !mutableState.IsWorkflowExecutionRunning() { - e.logger.Debug("Heartbeat failed") - return workflow.ErrAlreadyCompleted - } - - scheduleID := token.ScheduleID - if scheduleID == common.EmptyEventID { // client call RecordActivityHeartbeatByID, so get scheduleID by activityID - scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) - if err0 != nil { - return err0 - } - } - ai, isRunning := mutableState.GetActivityInfo(scheduleID) - - // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in - // some extreme cassandra failure cases. - if !isRunning && scheduleID >= mutableState.GetNextEventID() { - e.metricsClient.IncCounter(metrics.HistoryRecordActivityTaskHeartbeatScope, metrics.StaleMutableStateCounter) - e.logger.Error("Encounter stale mutable state in RecordActivityTaskHeartbeat", - tag.WorkflowDomainName(domainEntry.GetInfo().Name), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowScheduleID(scheduleID), - tag.WorkflowNextEventID(mutableState.GetNextEventID()), - ) - return workflow.ErrStaleState - } - - if !isRunning || ai.StartedID == common.EmptyEventID || - (token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) { - e.logger.Warn(fmt.Sprintf( - "Encounter non existing activity in RecordActivityTaskHeartbeat: isRunning: %t, ai: %#v, token: %#v.", - isRunning, ai, token), - tag.WorkflowDomainName(domainEntry.GetInfo().Name), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowScheduleID(scheduleID), - tag.WorkflowNextEventID(mutableState.GetNextEventID()), - ) - - return workflow.ErrActivityTaskNotFound - } - - cancelRequested = ai.CancelRequested - - e.logger.Debug(fmt.Sprintf("Activity HeartBeat: scheduleEventID: %v, ActivityInfo: %+v, CancelRequested: %v", - scheduleID, ai, cancelRequested)) - - // Save progress and last HB reported time. 
-			mutableState.UpdateActivityProgress(ai, request)
-
-			return nil
-		})
-
-	if err != nil {
-		return &types.RecordActivityTaskHeartbeatResponse{}, err
-	}
-
-	return &types.RecordActivityTaskHeartbeatResponse{CancelRequested: cancelRequested}, nil
-}
-
-// RequestCancelWorkflowExecution records request cancellation event for workflow execution
-func (e *historyEngineImpl) RequestCancelWorkflowExecution(
-	ctx context.Context,
-	req *types.HistoryRequestCancelWorkflowExecutionRequest,
-) error {
-
-	domainEntry, err := e.getActiveDomainByID(req.DomainUUID)
-	if err != nil {
-		return err
-	}
-	domainID := domainEntry.GetInfo().ID
-
-	request := req.CancelRequest
-	parentExecution := req.ExternalWorkflowExecution
-	childWorkflowOnly := req.GetChildWorkflowOnly()
-	workflowExecution := types.WorkflowExecution{
-		WorkflowID: request.WorkflowExecution.WorkflowID,
-	}
-	// If firstExecutionRunID is set on the request always try to cancel currently running execution
-	if request.GetFirstExecutionRunID() == "" {
-		workflowExecution.RunID = request.WorkflowExecution.RunID
-	}
-
-	return workflow.UpdateCurrentWithActionFunc(ctx, e.executionCache, e.executionManager, domainID, e.shard.GetDomainCache(), workflowExecution, e.timeSource.Now(),
-		func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) {
-			isCancelRequested, cancelRequestID := mutableState.IsCancelRequested()
-			if !mutableState.IsWorkflowExecutionRunning() {
-				_, closeStatus := mutableState.GetWorkflowStateCloseStatus()
-				if isCancelRequested && closeStatus == persistence.WorkflowCloseStatusCanceled {
-					cancelRequest := req.CancelRequest
-					if cancelRequest.RequestID != "" && cancelRequest.RequestID == cancelRequestID {
-						return &workflow.UpdateAction{Noop: true}, nil
-					}
-				}
-				return nil, workflow.ErrAlreadyCompleted
-			}
-
-			executionInfo := mutableState.GetExecutionInfo()
-			if request.GetFirstExecutionRunID() != "" {
-				firstRunID := executionInfo.FirstExecutionRunID
-				if firstRunID == "" {
-					// This is needed for backwards compatibility. Workflow execution create with Cadence release v0.25.0 or earlier
-					// does not have FirstExecutionRunID stored as part of mutable state. If this is not set then load it from
-					// workflow execution started event.
- startEvent, err := mutableState.GetStartEvent(ctx) - if err != nil { - return nil, err - } - firstRunID = startEvent.GetWorkflowExecutionStartedEventAttributes().GetFirstExecutionRunID() - } - if request.GetFirstExecutionRunID() != firstRunID { - return nil, &types.EntityNotExistsError{Message: "Workflow execution not found"} - } - } - if childWorkflowOnly { - parentWorkflowID := executionInfo.ParentWorkflowID - parentRunID := executionInfo.ParentRunID - if parentExecution.GetWorkflowID() != parentWorkflowID || - parentExecution.GetRunID() != parentRunID { - return nil, workflow.ErrParentMismatch - } - } - - if isCancelRequested { - cancelRequest := req.CancelRequest - if cancelRequest.RequestID != "" && cancelRequest.RequestID == cancelRequestID { - return workflow.UpdateWithNewDecision, nil - } - // if we consider workflow cancellation idempotent, then this error is redundant - // this error maybe useful if this API is invoked by external, not decision from transfer queue - return nil, workflow.ErrCancellationAlreadyRequested - } - - if _, err := mutableState.AddWorkflowExecutionCancelRequestedEvent(req.CancelRequest.Cause, req); err != nil { - return nil, &types.InternalServiceError{Message: "Unable to cancel workflow execution."} - } - - return workflow.UpdateWithNewDecision, nil - }) -} - -func (e *historyEngineImpl) SignalWorkflowExecution( - ctx context.Context, - signalRequest *types.HistorySignalWorkflowExecutionRequest, -) error { - - domainEntry, err := e.getActiveDomainByID(signalRequest.DomainUUID) - if err != nil { - return err - } - if domainEntry.GetInfo().Status != persistence.DomainStatusRegistered { - return errDomainDeprecated - } - domainID := domainEntry.GetInfo().ID - request := signalRequest.SignalRequest - parentExecution := signalRequest.ExternalWorkflowExecution - childWorkflowOnly := signalRequest.GetChildWorkflowOnly() - workflowExecution := types.WorkflowExecution{ - WorkflowID: request.WorkflowExecution.WorkflowID, - RunID: request.WorkflowExecution.RunID, - } - - return workflow.UpdateCurrentWithActionFunc( - ctx, - e.executionCache, - e.executionManager, - domainID, - e.shard.GetDomainCache(), - workflowExecution, - e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { - // first deduplicate by request id for signal decision - // this is done before workflow running check so that already completed error - // won't be returned for duplicated signals even if the workflow is closed. 
- if requestID := request.GetRequestID(); requestID != "" { - if mutableState.IsSignalRequested(requestID) { - return &workflow.UpdateAction{ - Noop: true, - CreateDecision: false, - }, nil - } - } - - if !mutableState.IsWorkflowExecutionRunning() { - return nil, workflow.ErrAlreadyCompleted - } - - // If history is corrupted, signal will be rejected - if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { - return nil, err - } else if corrupted { - return nil, &types.EntityNotExistsError{Message: "Workflow execution corrupted."} - } - - executionInfo := mutableState.GetExecutionInfo() - createDecisionTask := true - // Do not create decision task when the workflow is cron and the cron has not been started yet - if mutableState.GetExecutionInfo().CronSchedule != "" && !mutableState.HasProcessedOrPendingDecision() { - createDecisionTask = false - } - - maxAllowedSignals := e.config.MaximumSignalsPerExecution(domainEntry.GetInfo().Name) - if maxAllowedSignals > 0 && int(executionInfo.SignalCount) >= maxAllowedSignals { - e.logger.Info("Execution limit reached for maximum signals", tag.WorkflowSignalCount(executionInfo.SignalCount), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowDomainID(domainID)) - return nil, workflow.ErrSignalsLimitExceeded - } - - if childWorkflowOnly { - parentWorkflowID := executionInfo.ParentWorkflowID - parentRunID := executionInfo.ParentRunID - if parentExecution.GetWorkflowID() != parentWorkflowID || - parentExecution.GetRunID() != parentRunID { - return nil, workflow.ErrParentMismatch - } - } - - if requestID := request.GetRequestID(); requestID != "" { - mutableState.AddSignalRequested(requestID) - } - - if _, err := mutableState.AddWorkflowExecutionSignaled( - request.GetSignalName(), - request.GetInput(), - request.GetIdentity(), - request.GetRequestID(), - ); err != nil { - return nil, &types.InternalServiceError{Message: "Unable to signal workflow execution."} - } - - return &workflow.UpdateAction{ - Noop: false, - CreateDecision: createDecisionTask, - }, nil - }) -} - -func (e *historyEngineImpl) SignalWithStartWorkflowExecution( - ctx context.Context, - signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest, -) (retResp *types.StartWorkflowExecutionResponse, retError error) { - - domainEntry, err := e.getActiveDomainByID(signalWithStartRequest.DomainUUID) - if err != nil { - return nil, err - } - if domainEntry.GetInfo().Status != persistence.DomainStatusRegistered { - return nil, errDomainDeprecated - } - domainID := domainEntry.GetInfo().ID - - sRequest := signalWithStartRequest.SignalWithStartRequest - workflowExecution := types.WorkflowExecution{ - WorkflowID: sRequest.WorkflowID, - } - - var prevMutableState execution.MutableState - attempt := 0 - - wfContext, release, err0 := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, workflowExecution) - - if err0 == nil { - defer func() { release(retError) }() - Just_Signal_Loop: - for ; attempt < workflow.ConditionalRetryCount; attempt++ { - // workflow not exist, will create workflow then signal - mutableState, err1 := wfContext.LoadWorkflowExecution(ctx) - if err1 != nil { - if _, ok := err1.(*types.EntityNotExistsError); ok { - break - } - return nil, err1 - } - - if mutableState.IsSignalRequested(sRequest.GetRequestID()) { - return &types.StartWorkflowExecutionResponse{RunID: wfContext.GetExecution().RunID}, nil - } - - // workflow exist but not running, will restart workflow 
then signal - if !mutableState.IsWorkflowExecutionRunning() { - prevMutableState = mutableState - break - } - - // workflow exists but history is corrupted, will restart workflow then signal - if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { - return nil, err - } else if corrupted { - prevMutableState = mutableState - break - } - - // workflow is running, if policy is TerminateIfRunning, terminate current run then signalWithStart - if sRequest.GetWorkflowIDReusePolicy() == types.WorkflowIDReusePolicyTerminateIfRunning { - workflowExecution.RunID = uuid.New() - runningWFCtx := workflow.NewContext(wfContext, release, mutableState) - resp, errTerm := e.terminateAndStartWorkflow( - ctx, - runningWFCtx, - workflowExecution, - domainEntry, - domainID, - nil, - signalWithStartRequest, - ) - // By the time we try to terminate the workflow, it was already terminated - // So continue as if we didn't need to terminate it in the first place - if _, ok := errTerm.(*types.WorkflowExecutionAlreadyCompletedError); !ok { - return resp, errTerm - } - } - - executionInfo := mutableState.GetExecutionInfo() - maxAllowedSignals := e.config.MaximumSignalsPerExecution(domainEntry.GetInfo().Name) - if maxAllowedSignals > 0 && int(executionInfo.SignalCount) >= maxAllowedSignals { - e.logger.Info("Execution limit reached for maximum signals", tag.WorkflowSignalCount(executionInfo.SignalCount), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowDomainID(domainID)) - return nil, workflow.ErrSignalsLimitExceeded - } - - requestID := sRequest.GetRequestID() - if requestID != "" { - mutableState.AddSignalRequested(requestID) - } - - if _, err := mutableState.AddWorkflowExecutionSignaled( - sRequest.GetSignalName(), - sRequest.GetSignalInput(), - sRequest.GetIdentity(), - sRequest.GetRequestID(), - ); err != nil { - return nil, &types.InternalServiceError{Message: "Unable to signal workflow execution."} - } - - // Create a transfer task to schedule a decision task - if !mutableState.HasPendingDecision() { - _, err := mutableState.AddDecisionTaskScheduledEvent(false) - if err != nil { - return nil, &types.InternalServiceError{Message: "Failed to add decision scheduled event."} - } - } - - // We apply the update to execution using optimistic concurrency. If it fails due to a conflict then reload - // the history and try the operation again. 
-			if err := wfContext.UpdateWorkflowExecutionAsActive(ctx, e.shard.GetTimeSource().Now()); err != nil {
-				if t, ok := persistence.AsDuplicateRequestError(err); ok {
-					if t.RequestType == persistence.WorkflowRequestTypeSignal {
-						return &types.StartWorkflowExecutionResponse{RunID: t.RunID}, nil
-					}
-					e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType))
-					return nil, t
-				}
-				if execution.IsConflictError(err) {
-					continue Just_Signal_Loop
-				}
-				return nil, err
-			}
-			return &types.StartWorkflowExecutionResponse{RunID: wfContext.GetExecution().RunID}, nil
-		} // end for Just_Signal_Loop
-		if attempt == workflow.ConditionalRetryCount {
-			return nil, workflow.ErrMaxAttemptsExceeded
-		}
-	} else {
-		if _, ok := err0.(*types.EntityNotExistsError); !ok {
-			return nil, err0
-		}
-		// workflow not exist, will create workflow then signal
-	}
-
-	// Start workflow and signal
-	startRequest, err := getStartRequest(domainID, sRequest, signalWithStartRequest.PartitionConfig)
-	if err != nil {
-		return nil, err
-	}
-
-	sigWithStartArg := &signalWithStartArg{
-		signalWithStartRequest: signalWithStartRequest,
-		prevMutableState:       prevMutableState,
-	}
-	return e.startWorkflowHelper(
-		ctx,
-		startRequest,
-		domainEntry,
-		metrics.HistorySignalWithStartWorkflowExecutionScope,
-		sigWithStartArg,
-	)
-}
-
-func (e *historyEngineImpl) checkForHistoryCorruptions(ctx context.Context, mutableState execution.MutableState) (bool, error) {
-	domainName := mutableState.GetDomainEntry().GetInfo().Name
-	if !e.config.EnableHistoryCorruptionCheck(domainName) {
-		return false, nil
-	}
-
-	// Ensure that we can obtain start event. Failing to do so means corrupted history or resurrected mutable state record.
-	_, err := mutableState.GetStartEvent(ctx)
-	if err != nil {
-		info := mutableState.GetExecutionInfo()
-		// Mark workflow as corrupted. So that new one can be restarted.
- info.State = persistence.WorkflowStateCorrupted - - e.logger.Error("history corruption check failed", - tag.WorkflowDomainName(domainName), - tag.WorkflowID(info.WorkflowID), - tag.WorkflowRunID(info.RunID), - tag.WorkflowType(info.WorkflowTypeName), - tag.Error(err)) - - if errors.Is(err, execution.ErrMissingWorkflowStartEvent) { - return true, nil - } - return false, err - } - - return false, nil -} - -// RemoveSignalMutableState remove the signal request id in signal_requested for deduplicate -func (e *historyEngineImpl) RemoveSignalMutableState( - ctx context.Context, - request *types.RemoveSignalMutableStateRequest, -) error { - - domainEntry, err := e.getActiveDomainByID(request.DomainUUID) - if err != nil { - return err - } - domainID := domainEntry.GetInfo().ID - - workflowExecution := types.WorkflowExecution{ - WorkflowID: request.WorkflowExecution.WorkflowID, - RunID: request.WorkflowExecution.RunID, - } - - return workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, false, e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) error { - if !mutableState.IsWorkflowExecutionRunning() { - return workflow.ErrNotExists - } - - mutableState.DeleteSignalRequested(request.GetRequestID()) - - return nil - }) -} - -func (e *historyEngineImpl) TerminateWorkflowExecution( - ctx context.Context, - terminateRequest *types.HistoryTerminateWorkflowExecutionRequest, -) error { - - domainEntry, err := e.getActiveDomainByID(terminateRequest.DomainUUID) - if err != nil { - return err - } - domainID := domainEntry.GetInfo().ID - - request := terminateRequest.TerminateRequest - parentExecution := terminateRequest.ExternalWorkflowExecution - childWorkflowOnly := terminateRequest.GetChildWorkflowOnly() - workflowExecution := types.WorkflowExecution{ - WorkflowID: request.WorkflowExecution.WorkflowID, - } - // If firstExecutionRunID is set on the request always try to cancel currently running execution - if request.GetFirstExecutionRunID() == "" { - workflowExecution.RunID = request.WorkflowExecution.RunID - } - - return workflow.UpdateCurrentWithActionFunc( - ctx, - e.executionCache, - e.executionManager, - domainID, - e.shard.GetDomainCache(), - workflowExecution, - e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { - if !mutableState.IsWorkflowExecutionRunning() { - return nil, workflow.ErrAlreadyCompleted - } - - executionInfo := mutableState.GetExecutionInfo() - if request.GetFirstExecutionRunID() != "" { - firstRunID := executionInfo.FirstExecutionRunID - if firstRunID == "" { - // This is needed for backwards compatibility. Workflow execution create with Cadence release v0.25.0 or earlier - // does not have FirstExecutionRunID stored as part of mutable state. If this is not set then load it from - // workflow execution started event. 
- startEvent, err := mutableState.GetStartEvent(ctx) - if err != nil { - return nil, err - } - firstRunID = startEvent.GetWorkflowExecutionStartedEventAttributes().GetFirstExecutionRunID() - } - if request.GetFirstExecutionRunID() != firstRunID { - return nil, &types.EntityNotExistsError{Message: "Workflow execution not found"} - } - } - if childWorkflowOnly { - parentWorkflowID := executionInfo.ParentWorkflowID - parentRunID := executionInfo.ParentRunID - if parentExecution.GetWorkflowID() != parentWorkflowID || - parentExecution.GetRunID() != parentRunID { - return nil, workflow.ErrParentMismatch - } - } - - eventBatchFirstEventID := mutableState.GetNextEventID() - return workflow.UpdateWithoutDecision, execution.TerminateWorkflow( - mutableState, - eventBatchFirstEventID, - request.GetReason(), - request.GetDetails(), - request.GetIdentity(), - ) - }) -} - -// RecordChildExecutionCompleted records the completion of child execution into parent execution history -func (e *historyEngineImpl) RecordChildExecutionCompleted( - ctx context.Context, - completionRequest *types.RecordChildExecutionCompletedRequest, -) error { - - domainEntry, err := e.getActiveDomainByID(completionRequest.DomainUUID) - if err != nil { - return err - } - domainID := domainEntry.GetInfo().ID - - workflowExecution := types.WorkflowExecution{ - WorkflowID: completionRequest.WorkflowExecution.GetWorkflowID(), - RunID: completionRequest.WorkflowExecution.GetRunID(), - } - - return e.updateWithActionFn(ctx, e.executionCache, domainID, workflowExecution, true, e.timeSource.Now(), - func(wfContext execution.Context, mutableState execution.MutableState) error { - if !mutableState.IsWorkflowExecutionRunning() { - return workflow.ErrNotExists - } - - initiatedID := completionRequest.InitiatedID - startedID := completionRequest.StartedID - completedExecution := completionRequest.CompletedExecution - completionEvent := completionRequest.CompletionEvent - - // Check mutable state to make sure child execution is in pending child executions - ci, isRunning := mutableState.GetChildExecutionInfo(initiatedID) - if !isRunning { - if initiatedID >= mutableState.GetNextEventID() { - e.metricsClient.IncCounter(metrics.HistoryRecordChildExecutionCompletedScope, metrics.StaleMutableStateCounter) - e.logger.Error("Encounter stale mutable state in RecordChildExecutionCompleted", - tag.WorkflowDomainName(domainEntry.GetInfo().Name), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowInitiatedID(initiatedID), - tag.WorkflowStartedID(startedID), - tag.WorkflowNextEventID(mutableState.GetNextEventID()), - ) - return workflow.ErrStaleState - } - return &types.EntityNotExistsError{Message: "Pending child execution not found."} - } - if ci.StartedID == common.EmptyEventID { - if startedID >= mutableState.GetNextEventID() { - e.metricsClient.IncCounter(metrics.HistoryRecordChildExecutionCompletedScope, metrics.StaleMutableStateCounter) - e.logger.Error("Encounter stale mutable state in RecordChildExecutionCompleted", - tag.WorkflowDomainName(domainEntry.GetInfo().Name), - tag.WorkflowID(workflowExecution.GetWorkflowID()), - tag.WorkflowRunID(workflowExecution.GetRunID()), - tag.WorkflowInitiatedID(initiatedID), - tag.WorkflowStartedID(startedID), - tag.WorkflowNextEventID(mutableState.GetNextEventID()), - ) - return workflow.ErrStaleState - } - return &types.EntityNotExistsError{Message: "Pending child execution not started."} - } - if ci.StartedWorkflowID != 
completedExecution.GetWorkflowID() { - return &types.EntityNotExistsError{Message: "Pending child execution workflowID mismatch."} - } - - switch *completionEvent.EventType { - case types.EventTypeWorkflowExecutionCompleted: - attributes := completionEvent.WorkflowExecutionCompletedEventAttributes - _, err = mutableState.AddChildWorkflowExecutionCompletedEvent(initiatedID, completedExecution, attributes) - case types.EventTypeWorkflowExecutionFailed: - attributes := completionEvent.WorkflowExecutionFailedEventAttributes - _, err = mutableState.AddChildWorkflowExecutionFailedEvent(initiatedID, completedExecution, attributes) - case types.EventTypeWorkflowExecutionCanceled: - attributes := completionEvent.WorkflowExecutionCanceledEventAttributes - _, err = mutableState.AddChildWorkflowExecutionCanceledEvent(initiatedID, completedExecution, attributes) - case types.EventTypeWorkflowExecutionTerminated: - attributes := completionEvent.WorkflowExecutionTerminatedEventAttributes - _, err = mutableState.AddChildWorkflowExecutionTerminatedEvent(initiatedID, completedExecution, attributes) - case types.EventTypeWorkflowExecutionTimedOut: - attributes := completionEvent.WorkflowExecutionTimedOutEventAttributes - _, err = mutableState.AddChildWorkflowExecutionTimedOutEvent(initiatedID, completedExecution, attributes) - } - return err - }) -} - -func (e *historyEngineImpl) ReplicateEventsV2( - ctx context.Context, - replicateRequest *types.ReplicateEventsV2Request, -) error { - - return e.nDCReplicator.ApplyEvents(ctx, replicateRequest) -} - -func (e *historyEngineImpl) SyncShardStatus( - ctx context.Context, - request *types.SyncShardStatusRequest, -) error { - - clusterName := request.GetSourceCluster() - now := time.Unix(0, request.GetTimestamp()) - - // here there are 3 main things - // 1. update the view of remote cluster's shard time - // 2. notify the timer gate in the timer queue standby processor - // 3. notify the transfer (essentially a no op, just put it here so it looks symmetric) - // 4. 
notify the cross cluster (essentially a no op, just put it here so it looks symmetric) - e.shard.SetCurrentTime(clusterName, now) - e.txProcessor.NotifyNewTask(clusterName, &hcommon.NotifyTaskInfo{Tasks: []persistence.Task{}}) - e.timerProcessor.NotifyNewTask(clusterName, &hcommon.NotifyTaskInfo{Tasks: []persistence.Task{}}) - e.crossClusterProcessor.NotifyNewTask(clusterName, &hcommon.NotifyTaskInfo{Tasks: []persistence.Task{}}) - return nil -} - -func (e *historyEngineImpl) SyncActivity( - ctx context.Context, - request *types.SyncActivityRequest, -) (retError error) { - - return e.nDCActivityReplicator.SyncActivity(ctx, request) -} - -func (e *historyEngineImpl) ResetWorkflowExecution( - ctx context.Context, - resetRequest *types.HistoryResetWorkflowExecutionRequest, -) (response *types.ResetWorkflowExecutionResponse, retError error) { - - request := resetRequest.ResetRequest - domainID := resetRequest.GetDomainUUID() - workflowID := request.WorkflowExecution.GetWorkflowID() - baseRunID := request.WorkflowExecution.GetRunID() - - baseContext, baseReleaseFn, err := e.executionCache.GetOrCreateWorkflowExecution( - ctx, - domainID, - types.WorkflowExecution{ - WorkflowID: workflowID, - RunID: baseRunID, - }, - ) - if err != nil { - return nil, err - } - defer func() { baseReleaseFn(retError) }() - - baseMutableState, err := baseContext.LoadWorkflowExecution(ctx) - if err != nil { - return nil, err - } - if ok := baseMutableState.HasProcessedOrPendingDecision(); !ok { - return nil, &types.BadRequestError{ - Message: "Cannot reset workflow without a decision task schedule.", - } - } - if request.GetDecisionFinishEventID() <= common.FirstEventID || - request.GetDecisionFinishEventID() > baseMutableState.GetNextEventID() { - return nil, &types.BadRequestError{ - Message: "Decision finish ID must be > 1 && <= workflow next event ID.", - } - } - domainName, err := e.shard.GetDomainCache().GetDomainName(domainID) - if err != nil { - return nil, err - } - // also load the current run of the workflow, it can be different from the base runID - resp, err := e.executionManager.GetCurrentExecution(ctx, &persistence.GetCurrentExecutionRequest{ - DomainID: domainID, - WorkflowID: request.WorkflowExecution.GetWorkflowID(), - DomainName: domainName, - }) - if err != nil { - return nil, err - } - - currentRunID := resp.RunID - var currentContext execution.Context - var currentMutableState execution.MutableState - var currentReleaseFn execution.ReleaseFunc - if currentRunID == baseRunID { - currentContext = baseContext - currentMutableState = baseMutableState - } else { - currentContext, currentReleaseFn, err = e.executionCache.GetOrCreateWorkflowExecution( - ctx, - domainID, - types.WorkflowExecution{ - WorkflowID: workflowID, - RunID: currentRunID, - }, - ) - if err != nil { - return nil, err - } - defer func() { currentReleaseFn(retError) }() - - currentMutableState, err = currentContext.LoadWorkflowExecution(ctx) - if err != nil { - return nil, err - } - } - - // dedup by requestID - if currentMutableState.GetExecutionInfo().CreateRequestID == request.GetRequestID() { - e.logger.Info("Duplicated reset request", - tag.WorkflowID(workflowID), - tag.WorkflowRunID(currentRunID), - tag.WorkflowDomainID(domainID)) - return &types.ResetWorkflowExecutionResponse{ - RunID: currentRunID, - }, nil - } - - resetRunID := uuid.New() - baseRebuildLastEventID := request.GetDecisionFinishEventID() - 1 - baseVersionHistories := baseMutableState.GetVersionHistories() - baseCurrentBranchToken, err := 
baseMutableState.GetCurrentBranchToken() - if err != nil { - return nil, err - } - baseRebuildLastEventVersion := baseMutableState.GetCurrentVersion() - baseNextEventID := baseMutableState.GetNextEventID() - - if baseVersionHistories != nil { - baseCurrentVersionHistory, err := baseVersionHistories.GetCurrentVersionHistory() - if err != nil { - return nil, err - } - baseRebuildLastEventVersion, err = baseCurrentVersionHistory.GetEventVersion(baseRebuildLastEventID) - if err != nil { - return nil, err - } - baseCurrentBranchToken = baseCurrentVersionHistory.GetBranchToken() - } - - if err := e.workflowResetter.ResetWorkflow( - ctx, - domainID, - workflowID, - baseRunID, - baseCurrentBranchToken, - baseRebuildLastEventID, - baseRebuildLastEventVersion, - baseNextEventID, - resetRunID, - request.GetRequestID(), - execution.NewWorkflow( - ctx, - e.shard.GetClusterMetadata(), - currentContext, - currentMutableState, - currentReleaseFn, - ), - request.GetReason(), - nil, - request.GetSkipSignalReapply(), - ); err != nil { - if t, ok := persistence.AsDuplicateRequestError(err); ok { - if t.RequestType == persistence.WorkflowRequestTypeReset { - return &types.ResetWorkflowExecutionResponse{ - RunID: t.RunID, - }, nil - } - e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType)) - return nil, t - } - return nil, err - } - return &types.ResetWorkflowExecutionResponse{ - RunID: resetRunID, - }, nil -} - -func (e *historyEngineImpl) NotifyNewHistoryEvent(event *events.Notification) { - e.historyEventNotifier.NotifyNewHistoryEvent(event) -} - -func (e *historyEngineImpl) NotifyNewTransferTasks(info *hcommon.NotifyTaskInfo) { - if len(info.Tasks) == 0 { - return - } - - task := info.Tasks[0] - clusterName, err := e.clusterMetadata.ClusterNameForFailoverVersion(task.GetVersion()) - if err == nil { - e.txProcessor.NotifyNewTask(clusterName, info) - } -} - -func (e *historyEngineImpl) NotifyNewTimerTasks(info *hcommon.NotifyTaskInfo) { - if len(info.Tasks) == 0 { - return - } - - task := info.Tasks[0] - clusterName, err := e.clusterMetadata.ClusterNameForFailoverVersion(task.GetVersion()) - if err == nil { - e.timerProcessor.NotifyNewTask(clusterName, info) - } -} - -func (e *historyEngineImpl) NotifyNewCrossClusterTasks(info *hcommon.NotifyTaskInfo) { - taskByTargetCluster := make(map[string][]persistence.Task) - for _, task := range info.Tasks { - // TODO: consider defining a new interface in persistence package - // for cross cluster tasks and add a method for returning the target cluster - var targetCluster string - switch crossClusterTask := task.(type) { - case *persistence.CrossClusterStartChildExecutionTask: - targetCluster = crossClusterTask.TargetCluster - case *persistence.CrossClusterCancelExecutionTask: - targetCluster = crossClusterTask.TargetCluster - case *persistence.CrossClusterSignalExecutionTask: - targetCluster = crossClusterTask.TargetCluster - case *persistence.CrossClusterRecordChildExecutionCompletedTask: - targetCluster = crossClusterTask.TargetCluster - case *persistence.CrossClusterApplyParentClosePolicyTask: - targetCluster = crossClusterTask.TargetCluster - default: - panic("encountered unknown cross cluster task type") - } - taskByTargetCluster[targetCluster] = append(taskByTargetCluster[targetCluster], task) - } - - for targetCluster, tasks := range taskByTargetCluster { - e.crossClusterProcessor.NotifyNewTask(targetCluster, &hcommon.NotifyTaskInfo{ExecutionInfo: info.ExecutionInfo, Tasks: tasks, PersistenceError: 
info.PersistenceError}) - } -} - -func (e *historyEngineImpl) NotifyNewReplicationTasks(info *hcommon.NotifyTaskInfo) { - for _, task := range info.Tasks { - hTask, err := hydrateReplicationTask(task, info.ExecutionInfo, info.VersionHistories, info.Activities, info.History) - if err != nil { - e.logger.Error("failed to preemptively hydrate replication task", tag.Error(err)) - continue - } - e.replicationTaskStore.Put(hTask) - } -} - -func hydrateReplicationTask( - task persistence.Task, - exec *persistence.WorkflowExecutionInfo, - versionHistories *persistence.VersionHistories, - activities map[int64]*persistence.ActivityInfo, - history events.PersistedBlobs, -) (*types.ReplicationTask, error) { - info := persistence.ReplicationTaskInfo{ - DomainID: exec.DomainID, - WorkflowID: exec.WorkflowID, - RunID: exec.RunID, - TaskType: task.GetType(), - CreationTime: task.GetVisibilityTimestamp().UnixNano(), - TaskID: task.GetTaskID(), - Version: task.GetVersion(), - } - - switch t := task.(type) { - case *persistence.HistoryReplicationTask: - info.BranchToken = t.BranchToken - info.NewRunBranchToken = t.NewRunBranchToken - info.FirstEventID = t.FirstEventID - info.NextEventID = t.NextEventID - case *persistence.SyncActivityTask: - info.ScheduledID = t.ScheduledID - case *persistence.FailoverMarkerTask: - // No specific fields, but supported - default: - return nil, errors.New("unknown replication task") - } - - hydrator := replication.NewImmediateTaskHydrator( - exec.IsRunning(), - versionHistories, - activities, - history.Find(info.BranchToken, info.FirstEventID), - history.Find(info.NewRunBranchToken, common.FirstEventID), - ) - - return hydrator.Hydrate(context.Background(), info) -} - -func (e *historyEngineImpl) ResetTransferQueue( - ctx context.Context, - clusterName string, -) error { - _, err := e.txProcessor.HandleAction(ctx, clusterName, queue.NewResetAction()) - return err -} - -func (e *historyEngineImpl) ResetTimerQueue( - ctx context.Context, - clusterName string, -) error { - _, err := e.timerProcessor.HandleAction(ctx, clusterName, queue.NewResetAction()) - return err -} - -func (e *historyEngineImpl) ResetCrossClusterQueue( - ctx context.Context, - clusterName string, -) error { - _, err := e.crossClusterProcessor.HandleAction(ctx, clusterName, queue.NewResetAction()) - return err -} - -func (e *historyEngineImpl) DescribeTransferQueue( - ctx context.Context, - clusterName string, -) (*types.DescribeQueueResponse, error) { - return e.describeQueue(ctx, e.txProcessor, clusterName) -} - -func (e *historyEngineImpl) DescribeTimerQueue( - ctx context.Context, - clusterName string, -) (*types.DescribeQueueResponse, error) { - return e.describeQueue(ctx, e.timerProcessor, clusterName) -} - -func (e *historyEngineImpl) DescribeCrossClusterQueue( - ctx context.Context, - clusterName string, -) (*types.DescribeQueueResponse, error) { - return e.describeQueue(ctx, e.crossClusterProcessor, clusterName) -} - -func (e *historyEngineImpl) describeQueue( - ctx context.Context, - queueProcessor queue.Processor, - clusterName string, -) (*types.DescribeQueueResponse, error) { - resp, err := queueProcessor.HandleAction(ctx, clusterName, queue.NewGetStateAction()) - if err != nil { - return nil, err - } - - serializedStates := make([]string, 0, len(resp.GetStateActionResult.States)) - for _, state := range resp.GetStateActionResult.States { - serializedStates = append(serializedStates, e.serializeQueueState(state)) - } - return &types.DescribeQueueResponse{ - ProcessingQueueStates: serializedStates, 
- }, nil -} - -func (e *historyEngineImpl) serializeQueueState( - state queue.ProcessingQueueState, -) string { - return fmt.Sprintf("%v", state) -} - -func (e *historyEngineImpl) validateStartWorkflowExecutionRequest( - request *types.StartWorkflowExecutionRequest, - metricsScope int, -) error { - - if len(request.GetRequestID()) == 0 { - return &types.BadRequestError{Message: "Missing request ID."} - } - if request.ExecutionStartToCloseTimeoutSeconds == nil || request.GetExecutionStartToCloseTimeoutSeconds() <= 0 { - return &types.BadRequestError{Message: "Missing or invalid ExecutionStartToCloseTimeoutSeconds."} - } - if request.TaskStartToCloseTimeoutSeconds == nil || request.GetTaskStartToCloseTimeoutSeconds() <= 0 { - return &types.BadRequestError{Message: "Missing or invalid TaskStartToCloseTimeoutSeconds."} - } - if request.TaskList == nil || request.TaskList.GetName() == "" { - return &types.BadRequestError{Message: "Missing Tasklist."} - } - if request.WorkflowType == nil || request.WorkflowType.GetName() == "" { - return &types.BadRequestError{Message: "Missing WorkflowType."} - } - - if !common.IsValidIDLength( - request.GetDomain(), - e.metricsClient.Scope(metricsScope), - e.config.MaxIDLengthWarnLimit(), - e.config.DomainNameMaxLength(request.GetDomain()), - metrics.CadenceErrDomainNameExceededWarnLimit, - request.GetDomain(), - e.logger, - tag.IDTypeDomainName) { - return &types.BadRequestError{Message: "Domain exceeds length limit."} - } - - if !common.IsValidIDLength( - request.GetWorkflowID(), - e.metricsClient.Scope(metricsScope), - e.config.MaxIDLengthWarnLimit(), - e.config.WorkflowIDMaxLength(request.GetDomain()), - metrics.CadenceErrWorkflowIDExceededWarnLimit, - request.GetDomain(), - e.logger, - tag.IDTypeWorkflowID) { - return &types.BadRequestError{Message: "WorkflowId exceeds length limit."} - } - if !common.IsValidIDLength( - request.TaskList.GetName(), - e.metricsClient.Scope(metricsScope), - e.config.MaxIDLengthWarnLimit(), - e.config.TaskListNameMaxLength(request.GetDomain()), - metrics.CadenceErrTaskListNameExceededWarnLimit, - request.GetDomain(), - e.logger, - tag.IDTypeTaskListName) { - return &types.BadRequestError{Message: "TaskList exceeds length limit."} - } - if !common.IsValidIDLength( - request.WorkflowType.GetName(), - e.metricsClient.Scope(metricsScope), - e.config.MaxIDLengthWarnLimit(), - e.config.WorkflowTypeMaxLength(request.GetDomain()), - metrics.CadenceErrWorkflowTypeExceededWarnLimit, - request.GetDomain(), - e.logger, - tag.IDTypeWorkflowType) { - return &types.BadRequestError{Message: "WorkflowType exceeds length limit."} - } - - return common.ValidateRetryPolicy(request.RetryPolicy) -} - -func (e *historyEngineImpl) overrideStartWorkflowExecutionRequest( - domainEntry *cache.DomainCacheEntry, - request *types.StartWorkflowExecutionRequest, - metricsScope int, -) { - - domainName := domainEntry.GetInfo().Name - maxDecisionStartToCloseTimeoutSeconds := int32(e.config.MaxDecisionStartToCloseSeconds(domainName)) - - taskStartToCloseTimeoutSecs := request.GetTaskStartToCloseTimeoutSeconds() - taskStartToCloseTimeoutSecs = common.MinInt32(taskStartToCloseTimeoutSecs, maxDecisionStartToCloseTimeoutSeconds) - taskStartToCloseTimeoutSecs = common.MinInt32(taskStartToCloseTimeoutSecs, request.GetExecutionStartToCloseTimeoutSeconds()) - - if taskStartToCloseTimeoutSecs != request.GetTaskStartToCloseTimeoutSeconds() { - request.TaskStartToCloseTimeoutSeconds = &taskStartToCloseTimeoutSecs - e.metricsClient.Scope( - metricsScope, - 
metrics.DomainTag(domainName), - ).IncCounter(metrics.DecisionStartToCloseTimeoutOverrideCount) - } -} - -func getScheduleID( - activityID string, - mutableState execution.MutableState, -) (int64, error) { - - if activityID == "" { - return 0, &types.BadRequestError{Message: "Neither ActivityID nor ScheduleID is provided"} - } - activityInfo, ok := mutableState.GetActivityByActivityID(activityID) - if !ok { - return 0, &types.BadRequestError{Message: "Cannot locate Activity ScheduleID"} - } - return activityInfo.ScheduleID, nil -} - -func getStartRequest( - domainID string, - request *types.SignalWithStartWorkflowExecutionRequest, - partitionConfig map[string]string, -) (*types.HistoryStartWorkflowExecutionRequest, error) { - - req := &types.StartWorkflowExecutionRequest{ - Domain: request.Domain, - WorkflowID: request.WorkflowID, - WorkflowType: request.WorkflowType, - TaskList: request.TaskList, - Input: request.Input, - ExecutionStartToCloseTimeoutSeconds: request.ExecutionStartToCloseTimeoutSeconds, - TaskStartToCloseTimeoutSeconds: request.TaskStartToCloseTimeoutSeconds, - Identity: request.Identity, - RequestID: request.RequestID, - WorkflowIDReusePolicy: request.WorkflowIDReusePolicy, - RetryPolicy: request.RetryPolicy, - CronSchedule: request.CronSchedule, - Memo: request.Memo, - SearchAttributes: request.SearchAttributes, - Header: request.Header, - DelayStartSeconds: request.DelayStartSeconds, - JitterStartSeconds: request.JitterStartSeconds, - } - - return common.CreateHistoryStartWorkflowRequest(domainID, req, time.Now(), partitionConfig) -} - -func (e *historyEngineImpl) applyWorkflowIDReusePolicyForSigWithStart( - prevExecutionInfo *persistence.WorkflowExecutionInfo, - execution types.WorkflowExecution, - wfIDReusePolicy types.WorkflowIDReusePolicy, -) error { - - prevStartRequestID := prevExecutionInfo.CreateRequestID - prevRunID := prevExecutionInfo.RunID - prevState := prevExecutionInfo.State - prevCloseState := prevExecutionInfo.CloseStatus - - return e.applyWorkflowIDReusePolicyHelper( - prevStartRequestID, - prevRunID, - prevState, - prevCloseState, - execution, - wfIDReusePolicy, - ) -} - -func (e *historyEngineImpl) applyWorkflowIDReusePolicyHelper( - prevStartRequestID, - prevRunID string, - prevState int, - prevCloseState int, - execution types.WorkflowExecution, - wfIDReusePolicy types.WorkflowIDReusePolicy, -) error { - - // here we know some information about the prev workflow, i.e. either running right now - // or has history check if the workflow is finished - switch prevState { - case persistence.WorkflowStateCreated, - persistence.WorkflowStateRunning: - msg := "Workflow execution is already running. WorkflowId: %v, RunId: %v." - return getWorkflowAlreadyStartedError(msg, prevStartRequestID, execution.GetWorkflowID(), prevRunID) - case persistence.WorkflowStateCompleted: - // previous workflow completed, proceed - case persistence.WorkflowStateCorrupted: - // ignore workflow ID reuse policy for corrupted workflows, treat as they do not exist - return nil - default: - // persistence.WorkflowStateZombie or unknown type - return &types.InternalServiceError{Message: fmt.Sprintf("Failed to process workflow, workflow has invalid state: %v.", prevState)} - } - - switch wfIDReusePolicy { - case types.WorkflowIDReusePolicyAllowDuplicateFailedOnly: - if _, ok := FailedWorkflowCloseState[prevCloseState]; !ok { - msg := "Workflow execution already finished successfully. WorkflowId: %v, RunId: %v. Workflow ID reuse policy: allow duplicate workflow ID if last run failed." 
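The reuse-policy rules handled in this switch condense to a small decision table for already-closed runs; the following is a minimal standalone sketch with illustrative names (reusePolicy, failedCloseStates and canReuseWorkflowID are stand-ins, not the engine's own types, and TerminateIfRunning behaves like AllowDuplicate here):

package main

import "fmt"

type reusePolicy int

const (
	allowDuplicateFailedOnly reusePolicy = iota
	allowDuplicate
	rejectDuplicate
)

// failedCloseStates mirrors the idea of FailedWorkflowCloseState: only runs
// that ended badly count as "failed" for the AllowDuplicateFailedOnly policy.
var failedCloseStates = map[string]bool{
	"failed": true, "canceled": true, "terminated": true, "timed_out": true,
}

// canReuseWorkflowID returns nil when starting a new run with the same
// workflow ID is allowed, given how the previous (closed) run ended.
func canReuseWorkflowID(prevCloseState string, policy reusePolicy) error {
	switch policy {
	case allowDuplicateFailedOnly:
		if !failedCloseStates[prevCloseState] {
			return fmt.Errorf("previous run finished successfully; reuse allowed only after a failed run")
		}
		return nil
	case allowDuplicate:
		return nil
	case rejectDuplicate:
		return fmt.Errorf("previous run exists; policy rejects duplicate workflow IDs")
	default:
		return fmt.Errorf("unknown reuse policy")
	}
}

func main() {
	fmt.Println(canReuseWorkflowID("completed", allowDuplicateFailedOnly)) // rejected
	fmt.Println(canReuseWorkflowID("failed", allowDuplicateFailedOnly))    // allowed (nil error)
}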
- return getWorkflowAlreadyStartedError(msg, prevStartRequestID, execution.GetWorkflowID(), prevRunID) - } - case types.WorkflowIDReusePolicyAllowDuplicate, - types.WorkflowIDReusePolicyTerminateIfRunning: - // no check need here - case types.WorkflowIDReusePolicyRejectDuplicate: - msg := "Workflow execution already finished. WorkflowId: %v, RunId: %v. Workflow ID reuse policy: reject duplicate workflow ID." - return getWorkflowAlreadyStartedError(msg, prevStartRequestID, execution.GetWorkflowID(), prevRunID) - default: - return &types.InternalServiceError{Message: "Failed to process start workflow reuse policy."} - } - - return nil -} - -func getWorkflowAlreadyStartedError(errMsg string, createRequestID string, workflowID string, runID string) error { - return &types.WorkflowExecutionAlreadyStartedError{ - Message: fmt.Sprintf(errMsg, workflowID, runID), - StartRequestID: createRequestID, - RunID: runID, - } -} - -func (e *historyEngineImpl) GetReplicationMessages( - ctx context.Context, - pollingCluster string, - lastReadMessageID int64, -) (*types.ReplicationMessages, error) { - - scope := metrics.HistoryGetReplicationMessagesScope - sw := e.metricsClient.StartTimer(scope, metrics.GetReplicationMessagesForShardLatency) - defer sw.Stop() - - replicationMessages, err := e.replicationAckManager.GetTasks( - ctx, - pollingCluster, - lastReadMessageID, - ) - if err != nil { - e.logger.Error("Failed to retrieve replication messages.", tag.Error(err)) - return nil, err - } - - // Set cluster status for sync shard info - replicationMessages.SyncShardStatus = &types.SyncShardStatus{ - Timestamp: common.Int64Ptr(e.timeSource.Now().UnixNano()), - } - e.logger.Debug("Successfully fetched replication messages.", tag.Counter(len(replicationMessages.ReplicationTasks))) - return replicationMessages, nil -} - -func (e *historyEngineImpl) GetDLQReplicationMessages( - ctx context.Context, - taskInfos []*types.ReplicationTaskInfo, -) ([]*types.ReplicationTask, error) { - - scope := metrics.HistoryGetDLQReplicationMessagesScope - sw := e.metricsClient.StartTimer(scope, metrics.GetDLQReplicationMessagesLatency) - defer sw.Stop() - - tasks := make([]*types.ReplicationTask, 0, len(taskInfos)) - for _, taskInfo := range taskInfos { - task, err := e.replicationHydrator.Hydrate(ctx, persistence.ReplicationTaskInfo{ - DomainID: taskInfo.DomainID, - WorkflowID: taskInfo.WorkflowID, - RunID: taskInfo.RunID, - TaskID: taskInfo.TaskID, - TaskType: int(taskInfo.TaskType), - FirstEventID: taskInfo.FirstEventID, - NextEventID: taskInfo.NextEventID, - Version: taskInfo.Version, - ScheduledID: taskInfo.ScheduledID, - }) - if err != nil { - e.logger.Error("Failed to fetch DLQ replication messages.", tag.Error(err)) - return nil, err - } - if task != nil { - tasks = append(tasks, task) - } - } - - return tasks, nil -} - -func (e *historyEngineImpl) ReapplyEvents( - ctx context.Context, - domainUUID string, - workflowID string, - runID string, - reapplyEvents []*types.HistoryEvent, -) error { - - domainEntry, err := e.getActiveDomainByID(domainUUID) - if err != nil { - switch { - case domainEntry != nil && domainEntry.IsDomainPendingActive(): - return nil - default: - return err - } - } - domainID := domainEntry.GetInfo().ID - // remove run id from the execution so that reapply events to the current run - currentExecution := types.WorkflowExecution{ - WorkflowID: workflowID, - } - - return workflow.UpdateWithActionFunc( - ctx, - e.executionCache, - domainID, - currentExecution, - e.timeSource.Now(), - func(wfContext 
execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { - // Filter out reapply event from the same cluster - toReapplyEvents := make([]*types.HistoryEvent, 0, len(reapplyEvents)) - lastWriteVersion, err := mutableState.GetLastWriteVersion() - if err != nil { - return nil, err - } - for _, event := range reapplyEvents { - if event.Version == lastWriteVersion { - // The reapply is from the same cluster. Ignoring. - continue - } - dedupResource := definition.NewEventReappliedID(runID, event.ID, event.Version) - if mutableState.IsResourceDuplicated(dedupResource) { - // already apply the signal - continue - } - toReapplyEvents = append(toReapplyEvents, event) - } - if len(toReapplyEvents) == 0 { - return &workflow.UpdateAction{ - Noop: true, - }, nil - } - - if !mutableState.IsWorkflowExecutionRunning() { - // need to reset target workflow (which is also the current workflow) - // to accept events to be reapplied - baseRunID := mutableState.GetExecutionInfo().RunID - resetRunID := uuid.New() - baseRebuildLastEventID := mutableState.GetPreviousStartedEventID() - - // TODO when https://github.com/uber/cadence/issues/2420 is finished, remove this block, - // since cannot reapply event to a finished workflow which had no decisions started - if baseRebuildLastEventID == common.EmptyEventID { - e.logger.Warn("cannot reapply event to a finished workflow", - tag.WorkflowDomainID(domainID), - tag.WorkflowID(currentExecution.GetWorkflowID()), - ) - e.metricsClient.IncCounter(metrics.HistoryReapplyEventsScope, metrics.EventReapplySkippedCount) - return &workflow.UpdateAction{Noop: true}, nil - } - - baseVersionHistories := mutableState.GetVersionHistories() - if baseVersionHistories == nil { - return nil, execution.ErrMissingVersionHistories - } - baseCurrentVersionHistory, err := baseVersionHistories.GetCurrentVersionHistory() - if err != nil { - return nil, err - } - baseRebuildLastEventVersion, err := baseCurrentVersionHistory.GetEventVersion(baseRebuildLastEventID) - if err != nil { - return nil, err - } - baseCurrentBranchToken := baseCurrentVersionHistory.GetBranchToken() - baseNextEventID := mutableState.GetNextEventID() - - if err = e.workflowResetter.ResetWorkflow( - ctx, - domainID, - workflowID, - baseRunID, - baseCurrentBranchToken, - baseRebuildLastEventID, - baseRebuildLastEventVersion, - baseNextEventID, - resetRunID, - uuid.New(), - execution.NewWorkflow( - ctx, - e.shard.GetClusterMetadata(), - wfContext, - mutableState, - execution.NoopReleaseFn, - ), - ndc.EventsReapplicationResetWorkflowReason, - toReapplyEvents, - false, - ); err != nil { - return nil, err - } - return &workflow.UpdateAction{ - Noop: true, - }, nil - } - - postActions := &workflow.UpdateAction{ - CreateDecision: true, - } - // Do not create decision task when the workflow is cron and the cron has not been started yet - if mutableState.GetExecutionInfo().CronSchedule != "" && !mutableState.HasProcessedOrPendingDecision() { - postActions.CreateDecision = false - } - reappliedEvents, err := e.eventsReapplier.ReapplyEvents( - ctx, - mutableState, - toReapplyEvents, - runID, - ) - if err != nil { - e.logger.Error("failed to re-apply stale events", tag.Error(err)) - return nil, &types.InternalServiceError{Message: "unable to re-apply stale events"} - } - if len(reappliedEvents) == 0 { - return &workflow.UpdateAction{ - Noop: true, - }, nil - } - return postActions, nil - }, - ) -} - -func (e *historyEngineImpl) CountDLQMessages(ctx context.Context, forceFetch bool) (map[string]int64, 
error) { - return e.replicationDLQHandler.GetMessageCount(ctx, forceFetch) -} - -func (e *historyEngineImpl) ReadDLQMessages( - ctx context.Context, - request *types.ReadDLQMessagesRequest, -) (*types.ReadDLQMessagesResponse, error) { - - tasks, taskInfo, token, err := e.replicationDLQHandler.ReadMessages( - ctx, - request.GetSourceCluster(), - request.GetInclusiveEndMessageID(), - int(request.GetMaximumPageSize()), - request.GetNextPageToken(), - ) - if err != nil { - return nil, err - } - return &types.ReadDLQMessagesResponse{ - Type: request.GetType().Ptr(), - ReplicationTasks: tasks, - ReplicationTasksInfo: taskInfo, - NextPageToken: token, - }, nil -} - -func (e *historyEngineImpl) PurgeDLQMessages( - ctx context.Context, - request *types.PurgeDLQMessagesRequest, -) error { - - return e.replicationDLQHandler.PurgeMessages( - ctx, - request.GetSourceCluster(), - request.GetInclusiveEndMessageID(), - ) -} - -func (e *historyEngineImpl) MergeDLQMessages( - ctx context.Context, - request *types.MergeDLQMessagesRequest, -) (*types.MergeDLQMessagesResponse, error) { - - token, err := e.replicationDLQHandler.MergeMessages( - ctx, - request.GetSourceCluster(), - request.GetInclusiveEndMessageID(), - int(request.GetMaximumPageSize()), - request.GetNextPageToken(), - ) - if err != nil { - return nil, err - } - return &types.MergeDLQMessagesResponse{ - NextPageToken: token, - }, nil -} - -func (e *historyEngineImpl) RefreshWorkflowTasks( - ctx context.Context, - domainUUID string, - workflowExecution types.WorkflowExecution, -) (retError error) { - domainEntry, err := e.shard.GetDomainCache().GetDomainByID(domainUUID) - if err != nil { - return err - } - domainID := domainEntry.GetInfo().ID - - wfContext, release, err := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, workflowExecution) - if err != nil { - return err - } - defer func() { release(retError) }() - - mutableState, err := wfContext.LoadWorkflowExecution(ctx) - if err != nil { - return err - } - - mutableStateTaskRefresher := execution.NewMutableStateTaskRefresher( - e.shard.GetConfig(), - e.shard.GetClusterMetadata(), - e.shard.GetDomainCache(), - e.shard.GetEventsCache(), - e.shard.GetShardID(), - ) - - err = mutableStateTaskRefresher.RefreshTasks(ctx, mutableState.GetExecutionInfo().StartTimestamp, mutableState) - if err != nil { - return err - } - - err = wfContext.UpdateWorkflowExecutionTasks(ctx, e.shard.GetTimeSource().Now()) - if err != nil { - return err - } - return nil -} - -func (e *historyEngineImpl) GetCrossClusterTasks( - ctx context.Context, - targetCluster string, -) ([]*types.CrossClusterTaskRequest, error) { - actionResult, err := e.crossClusterProcessor.HandleAction(ctx, targetCluster, queue.NewGetTasksAction()) - if err != nil { - return nil, err - } - - return actionResult.GetTasksResult.TaskRequests, nil -} - -func (e *historyEngineImpl) RespondCrossClusterTasksCompleted( - ctx context.Context, - targetCluster string, - responses []*types.CrossClusterTaskResponse, -) error { - _, err := e.crossClusterProcessor.HandleAction(ctx, targetCluster, queue.NewUpdateTasksAction(responses)) - return err -} - -func (e *historyEngineImpl) newChildContext( - parentCtx context.Context, -) (context.Context, context.CancelFunc) { - - ctxTimeout := contextLockTimeout - if deadline, ok := parentCtx.Deadline(); ok { - now := e.shard.GetTimeSource().Now() - parentTimeout := deadline.Sub(now) - if parentTimeout > 0 && parentTimeout < contextLockTimeout { - ctxTimeout = parentTimeout - } - } - return 
context.WithTimeout(context.Background(), ctxTimeout) -} - -func (e *historyEngineImpl) getActiveDomainByID(id string) (*cache.DomainCacheEntry, error) { - return cache.GetActiveDomainByID(e.shard.GetDomainCache(), e.clusterMetadata.GetCurrentClusterName(), id) -} diff --git a/service/history/engine/engineimpl/history_engine.go b/service/history/engine/engineimpl/history_engine.go new file mode 100644 index 00000000000..b15b3f79790 --- /dev/null +++ b/service/history/engine/engineimpl/history_engine.go @@ -0,0 +1,515 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "errors" + "time" + + "go.uber.org/cadence/.gen/go/cadence/workflowserviceclient" + + "github.com/uber/cadence/client/matching" + "github.com/uber/cadence/client/wrappers/retryable" + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/cache" + "github.com/uber/cadence/common/client" + "github.com/uber/cadence/common/clock" + "github.com/uber/cadence/common/cluster" + "github.com/uber/cadence/common/dynamicconfig" + ce "github.com/uber/cadence/common/errors" + "github.com/uber/cadence/common/log" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + cndc "github.com/uber/cadence/common/ndc" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/quotas" + "github.com/uber/cadence/common/reconciliation/invariant" + "github.com/uber/cadence/common/service" + "github.com/uber/cadence/common/types" + hcommon "github.com/uber/cadence/service/history/common" + "github.com/uber/cadence/service/history/config" + "github.com/uber/cadence/service/history/decision" + "github.com/uber/cadence/service/history/engine" + "github.com/uber/cadence/service/history/events" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/failover" + "github.com/uber/cadence/service/history/ndc" + "github.com/uber/cadence/service/history/queue" + "github.com/uber/cadence/service/history/replication" + "github.com/uber/cadence/service/history/reset" + "github.com/uber/cadence/service/history/shard" + "github.com/uber/cadence/service/history/task" + "github.com/uber/cadence/service/history/workflow" + "github.com/uber/cadence/service/history/workflowcache" + warchiver "github.com/uber/cadence/service/worker/archiver" +) + +const ( 
+ defaultQueryFirstDecisionTaskWaitTime = time.Second + queryFirstDecisionTaskCheckInterval = 200 * time.Millisecond + contextLockTimeout = 500 * time.Millisecond + longPollCompletionBuffer = 50 * time.Millisecond + + // TerminateIfRunningReason reason for terminateIfRunning + TerminateIfRunningReason = "TerminateIfRunning Policy" + // TerminateIfRunningDetailsTemplate details template for terminateIfRunning + TerminateIfRunningDetailsTemplate = "New runID: %s" +) + +var ( + errDomainDeprecated = &types.BadRequestError{Message: "Domain is deprecated."} +) + +type ( + historyEngineImpl struct { + currentClusterName string + shard shard.Context + timeSource clock.TimeSource + decisionHandler decision.Handler + clusterMetadata cluster.Metadata + historyV2Mgr persistence.HistoryManager + executionManager persistence.ExecutionManager + visibilityMgr persistence.VisibilityManager + txProcessor queue.Processor + timerProcessor queue.Processor + crossClusterProcessor queue.Processor + nDCReplicator ndc.HistoryReplicator + nDCActivityReplicator ndc.ActivityReplicator + historyEventNotifier events.Notifier + tokenSerializer common.TaskTokenSerializer + executionCache *execution.Cache + metricsClient metrics.Client + logger log.Logger + throttledLogger log.Logger + config *config.Config + archivalClient warchiver.Client + workflowResetter reset.WorkflowResetter + queueTaskProcessor task.Processor + crossClusterTaskProcessors common.Daemon + replicationTaskProcessors []replication.TaskProcessor + replicationAckManager replication.TaskAckManager + replicationTaskStore *replication.TaskStore + replicationHydrator replication.TaskHydrator + replicationMetricsEmitter *replication.MetricsEmitterImpl + publicClient workflowserviceclient.Interface + eventsReapplier ndc.EventsReapplier + matchingClient matching.Client + rawMatchingClient matching.Client + clientChecker client.VersionChecker + replicationDLQHandler replication.DLQHandler + failoverMarkerNotifier failover.MarkerNotifier + wfIDCache workflowcache.WFCache + ratelimitInternalPerWorkflowID dynamicconfig.BoolPropertyFnWithDomainFilter + + updateWithActionFn func(context.Context, *execution.Cache, string, types.WorkflowExecution, bool, time.Time, func(wfContext execution.Context, mutableState execution.MutableState) error) error + } +) + +var _ engine.Engine = (*historyEngineImpl)(nil) + +var ( + // FailedWorkflowCloseState is a set of failed workflow close states, used for start workflow policy + // for start workflow execution API + FailedWorkflowCloseState = map[int]bool{ + persistence.WorkflowCloseStatusFailed: true, + persistence.WorkflowCloseStatusCanceled: true, + persistence.WorkflowCloseStatusTerminated: true, + persistence.WorkflowCloseStatusTimedOut: true, + } +) + +// NewEngineWithShardContext creates an instance of history engine +func NewEngineWithShardContext( + shard shard.Context, + visibilityMgr persistence.VisibilityManager, + matching matching.Client, + publicClient workflowserviceclient.Interface, + historyEventNotifier events.Notifier, + config *config.Config, + crossClusterTaskFetchers task.Fetchers, + replicationTaskFetchers replication.TaskFetchers, + rawMatchingClient matching.Client, + queueTaskProcessor task.Processor, + failoverCoordinator failover.Coordinator, + wfIDCache workflowcache.WFCache, + ratelimitInternalPerWorkflowID dynamicconfig.BoolPropertyFnWithDomainFilter, +) engine.Engine { + currentClusterName := shard.GetService().GetClusterMetadata().GetCurrentClusterName() + + logger := shard.GetLogger() + 
executionManager := shard.GetExecutionManager() + historyV2Manager := shard.GetHistoryManager() + executionCache := execution.NewCache(shard) + failoverMarkerNotifier := failover.NewMarkerNotifier(shard, config, failoverCoordinator) + replicationHydrator := replication.NewDeferredTaskHydrator(shard.GetShardID(), historyV2Manager, executionCache, shard.GetDomainCache()) + replicationTaskStore := replication.NewTaskStore( + shard.GetConfig(), + shard.GetClusterMetadata(), + shard.GetDomainCache(), + shard.GetMetricsClient(), + shard.GetLogger(), + replicationHydrator, + ) + replicationReader := replication.NewDynamicTaskReader(shard.GetShardID(), executionManager, shard.GetTimeSource(), config) + + historyEngImpl := &historyEngineImpl{ + currentClusterName: currentClusterName, + shard: shard, + clusterMetadata: shard.GetClusterMetadata(), + timeSource: shard.GetTimeSource(), + historyV2Mgr: historyV2Manager, + executionManager: executionManager, + visibilityMgr: visibilityMgr, + tokenSerializer: common.NewJSONTaskTokenSerializer(), + executionCache: executionCache, + logger: logger.WithTags(tag.ComponentHistoryEngine), + throttledLogger: shard.GetThrottledLogger().WithTags(tag.ComponentHistoryEngine), + metricsClient: shard.GetMetricsClient(), + historyEventNotifier: historyEventNotifier, + config: config, + archivalClient: warchiver.NewClient( + shard.GetMetricsClient(), + logger, + publicClient, + shard.GetConfig().NumArchiveSystemWorkflows, + quotas.NewDynamicRateLimiter(config.ArchiveRequestRPS.AsFloat64()), + quotas.NewDynamicRateLimiter(func() float64 { + return quotas.PerMember( + service.History, + float64(config.ArchiveInlineHistoryGlobalRPS()), + float64(config.ArchiveInlineHistoryRPS()), + shard.GetService().GetMembershipResolver(), + ) + }), + quotas.NewDynamicRateLimiter(func() float64 { + return quotas.PerMember( + service.History, + float64(config.ArchiveInlineVisibilityGlobalRPS()), + float64(config.ArchiveInlineVisibilityRPS()), + shard.GetService().GetMembershipResolver(), + ) + }), + shard.GetService().GetArchiverProvider(), + config.AllowArchivingIncompleteHistory, + ), + workflowResetter: reset.NewWorkflowResetter( + shard, + executionCache, + logger, + ), + publicClient: publicClient, + matchingClient: matching, + rawMatchingClient: rawMatchingClient, + queueTaskProcessor: queueTaskProcessor, + clientChecker: client.NewVersionChecker(), + failoverMarkerNotifier: failoverMarkerNotifier, + replicationHydrator: replicationHydrator, + replicationAckManager: replication.NewTaskAckManager( + shard.GetShardID(), + shard, + shard.GetMetricsClient(), + shard.GetLogger(), + replicationReader, + replicationTaskStore, + ), + replicationTaskStore: replicationTaskStore, + replicationMetricsEmitter: replication.NewMetricsEmitter( + shard.GetShardID(), shard, replicationReader, shard.GetMetricsClient()), + wfIDCache: wfIDCache, + ratelimitInternalPerWorkflowID: ratelimitInternalPerWorkflowID, + updateWithActionFn: workflow.UpdateWithAction, + } + historyEngImpl.decisionHandler = decision.NewHandler( + shard, + historyEngImpl.executionCache, + historyEngImpl.tokenSerializer, + ) + pRetry := persistence.NewPersistenceRetryer( + shard.GetExecutionManager(), + shard.GetHistoryManager(), + common.CreatePersistenceRetryPolicy(), + ) + openExecutionCheck := invariant.NewConcreteExecutionExists(pRetry, shard.GetDomainCache()) + + historyEngImpl.txProcessor = queue.NewTransferQueueProcessor( + shard, + historyEngImpl, + queueTaskProcessor, + executionCache, + historyEngImpl.workflowResetter, + 
historyEngImpl.archivalClient, + openExecutionCheck, + historyEngImpl.wfIDCache, + historyEngImpl.ratelimitInternalPerWorkflowID, + ) + + historyEngImpl.timerProcessor = queue.NewTimerQueueProcessor( + shard, + historyEngImpl, + queueTaskProcessor, + executionCache, + historyEngImpl.archivalClient, + openExecutionCheck, + ) + + historyEngImpl.crossClusterProcessor = queue.NewCrossClusterQueueProcessor( + shard, + historyEngImpl, + executionCache, + queueTaskProcessor, + ) + + historyEngImpl.eventsReapplier = ndc.NewEventsReapplier(shard.GetMetricsClient(), logger) + + historyEngImpl.nDCReplicator = ndc.NewHistoryReplicator( + shard, + executionCache, + historyEngImpl.eventsReapplier, + logger, + ) + historyEngImpl.nDCActivityReplicator = ndc.NewActivityReplicator( + shard, + executionCache, + logger, + ) + + historyEngImpl.crossClusterTaskProcessors = task.NewCrossClusterTaskProcessors( + shard, + queueTaskProcessor, + crossClusterTaskFetchers, + &task.CrossClusterTaskProcessorOptions{ + Enabled: config.EnableCrossClusterEngine, + MaxPendingTasks: config.CrossClusterTargetProcessorMaxPendingTasks, + TaskMaxRetryCount: config.CrossClusterTargetProcessorMaxRetryCount, + TaskRedispatchInterval: config.ActiveTaskRedispatchInterval, + TaskWaitInterval: config.CrossClusterTargetProcessorTaskWaitInterval, + ServiceBusyBackoffInterval: config.CrossClusterTargetProcessorServiceBusyBackoffInterval, + TimerJitterCoefficient: config.CrossClusterTargetProcessorJitterCoefficient, + }, + ) + + var replicationTaskProcessors []replication.TaskProcessor + replicationTaskExecutors := make(map[string]replication.TaskExecutor) + // Intentionally use the raw client to create its own retry policy + historyRawClient := shard.GetService().GetClientBean().GetHistoryClient() + historyRetryableClient := retryable.NewHistoryClient( + historyRawClient, + common.CreateReplicationServiceBusyRetryPolicy(), + common.IsServiceBusyError, + ) + resendFunc := func(ctx context.Context, request *types.ReplicateEventsV2Request) error { + return historyRetryableClient.ReplicateEventsV2(ctx, request) + } + for _, replicationTaskFetcher := range replicationTaskFetchers.GetFetchers() { + sourceCluster := replicationTaskFetcher.GetSourceCluster() + // Intentionally use the raw client to create its own retry policy + adminClient := shard.GetService().GetClientBean().GetRemoteAdminClient(sourceCluster) + adminRetryableClient := retryable.NewAdminClient( + adminClient, + common.CreateReplicationServiceBusyRetryPolicy(), + common.IsServiceBusyError, + ) + historyResender := cndc.NewHistoryResender( + shard.GetDomainCache(), + adminRetryableClient, + resendFunc, + nil, + openExecutionCheck, + shard.GetLogger(), + ) + replicationTaskExecutor := replication.NewTaskExecutor( + shard, + shard.GetDomainCache(), + historyResender, + historyEngImpl, + shard.GetMetricsClient(), + shard.GetLogger(), + ) + replicationTaskExecutors[sourceCluster] = replicationTaskExecutor + + replicationTaskProcessor := replication.NewTaskProcessor( + shard, + historyEngImpl, + config, + shard.GetMetricsClient(), + replicationTaskFetcher, + replicationTaskExecutor, + ) + replicationTaskProcessors = append(replicationTaskProcessors, replicationTaskProcessor) + } + historyEngImpl.replicationTaskProcessors = replicationTaskProcessors + replicationMessageHandler := replication.NewDLQHandler(shard, replicationTaskExecutors) + historyEngImpl.replicationDLQHandler = replicationMessageHandler + + shard.SetEngine(historyEngImpl) + return historyEngImpl +} + +// Start will 
spin up all the components needed to start serving this shard. +// Make sure all the components are loaded lazily so start can return immediately. This is important because +// ShardController calls start sequentially for all the shards for a given host during startup. +func (e *historyEngineImpl) Start() { + e.logger.Info("History engine state changed", tag.LifeCycleStarting) + defer e.logger.Info("History engine state changed", tag.LifeCycleStarted) + + e.txProcessor.Start() + e.timerProcessor.Start() + e.crossClusterProcessor.Start() + e.replicationDLQHandler.Start() + e.replicationMetricsEmitter.Start() + + // failover callback will try to create a failover queue processor to scan all inflight tasks + // if domain needs to be failovered. However, in the multicursor queue logic, the scan range + // can't be retrieved before the processor is started. If failover callback is registered + // before queue processor is started, it may result in a deadline as to create the failover queue, + // queue processor need to be started. + e.registerDomainFailoverCallback() + + e.crossClusterTaskProcessors.Start() + + for _, replicationTaskProcessor := range e.replicationTaskProcessors { + replicationTaskProcessor.Start() + } + if e.config.EnableGracefulFailover() { + e.failoverMarkerNotifier.Start() + } + +} + +// Stop the service. +func (e *historyEngineImpl) Stop() { + e.logger.Info("History engine state changed", tag.LifeCycleStopping) + defer e.logger.Info("History engine state changed", tag.LifeCycleStopped) + + e.txProcessor.Stop() + e.timerProcessor.Stop() + e.crossClusterProcessor.Stop() + e.replicationDLQHandler.Stop() + e.replicationMetricsEmitter.Stop() + + e.crossClusterTaskProcessors.Stop() + + for _, replicationTaskProcessor := range e.replicationTaskProcessors { + replicationTaskProcessor.Stop() + } + + if e.queueTaskProcessor != nil { + e.queueTaskProcessor.StopShardProcessor(e.shard) + } + + e.failoverMarkerNotifier.Stop() + + // unset the failover callback + e.shard.GetDomainCache().UnregisterDomainChangeCallback(e.shard.GetShardID()) +} + +// ScheduleDecisionTask schedules a decision if no outstanding decision found +func (e *historyEngineImpl) ScheduleDecisionTask(ctx context.Context, req *types.ScheduleDecisionTaskRequest) error { + return e.decisionHandler.HandleDecisionTaskScheduled(ctx, req) +} + +func (e *historyEngineImpl) ReplicateEventsV2(ctx context.Context, replicateRequest *types.ReplicateEventsV2Request) error { + return e.nDCReplicator.ApplyEvents(ctx, replicateRequest) +} + +func (e *historyEngineImpl) SyncShardStatus(ctx context.Context, request *types.SyncShardStatusRequest) error { + + clusterName := request.GetSourceCluster() + now := time.Unix(0, request.GetTimestamp()) + + // here there are 3 main things + // 1. update the view of remote cluster's shard time + // 2. notify the timer gate in the timer queue standby processor + // 3. notify the transfer (essentially a no op, just put it here so it looks symmetric) + // 4. 
notify the cross cluster (essentially a no op, just put it here so it looks symmetric) + e.shard.SetCurrentTime(clusterName, now) + e.txProcessor.NotifyNewTask(clusterName, &hcommon.NotifyTaskInfo{Tasks: []persistence.Task{}}) + e.timerProcessor.NotifyNewTask(clusterName, &hcommon.NotifyTaskInfo{Tasks: []persistence.Task{}}) + e.crossClusterProcessor.NotifyNewTask(clusterName, &hcommon.NotifyTaskInfo{Tasks: []persistence.Task{}}) + return nil +} + +func (e *historyEngineImpl) SyncActivity(ctx context.Context, request *types.SyncActivityRequest) (retError error) { + + return e.nDCActivityReplicator.SyncActivity(ctx, request) +} + +func (e *historyEngineImpl) newDomainNotActiveError( + domainName string, + failoverVersion int64, +) error { + clusterMetadata := e.shard.GetService().GetClusterMetadata() + clusterName, err := clusterMetadata.ClusterNameForFailoverVersion(failoverVersion) + if err != nil { + clusterName = "_unknown_" + } + return ce.NewDomainNotActiveError( + domainName, + clusterMetadata.GetCurrentClusterName(), + clusterName, + ) +} + +func (e *historyEngineImpl) checkForHistoryCorruptions(ctx context.Context, mutableState execution.MutableState) (bool, error) { + domainName := mutableState.GetDomainEntry().GetInfo().Name + if !e.config.EnableHistoryCorruptionCheck(domainName) { + return false, nil + } + + // Ensure that we can obtain start event. Failing to do so means corrupted history or resurrected mutable state record. + _, err := mutableState.GetStartEvent(ctx) + if err != nil { + info := mutableState.GetExecutionInfo() + // Mark workflow as corrupted. So that new one can be restarted. + info.State = persistence.WorkflowStateCorrupted + + e.logger.Error("history corruption check failed", + tag.WorkflowDomainName(domainName), + tag.WorkflowID(info.WorkflowID), + tag.WorkflowRunID(info.RunID), + tag.WorkflowType(info.WorkflowTypeName), + tag.Error(err)) + + if errors.Is(err, execution.ErrMissingWorkflowStartEvent) { + return true, nil + } + return false, err + } + + return false, nil +} + +func getScheduleID(activityID string, mutableState execution.MutableState) (int64, error) { + if activityID == "" { + return 0, &types.BadRequestError{Message: "Neither ActivityID nor ScheduleID is provided"} + } + activityInfo, ok := mutableState.GetActivityByActivityID(activityID) + if !ok { + return 0, &types.BadRequestError{Message: "Cannot locate Activity ScheduleID"} + } + return activityInfo.ScheduleID, nil +} + +func (e *historyEngineImpl) getActiveDomainByID(id string) (*cache.DomainCacheEntry, error) { + return cache.GetActiveDomainByID(e.shard.GetDomainCache(), e.clusterMetadata.GetCurrentClusterName(), id) +} diff --git a/service/history/engine/engineimpl/historyEngine2_test.go b/service/history/engine/engineimpl/history_engine2_test.go similarity index 100% rename from service/history/engine/engineimpl/historyEngine2_test.go rename to service/history/engine/engineimpl/history_engine2_test.go diff --git a/service/history/engine/engineimpl/historyEngine3_eventsv2_test.go b/service/history/engine/engineimpl/history_engine3_eventsv2_test.go similarity index 100% rename from service/history/engine/engineimpl/historyEngine3_eventsv2_test.go rename to service/history/engine/engineimpl/history_engine3_eventsv2_test.go diff --git a/service/history/engine/engineimpl/historyEngine_test.go b/service/history/engine/engineimpl/history_engine_test.go similarity index 100% rename from service/history/engine/engineimpl/historyEngine_test.go rename to 
service/history/engine/engineimpl/history_engine_test.go diff --git a/service/history/engine/engineimpl/notify_tasks.go b/service/history/engine/engineimpl/notify_tasks.go new file mode 100644 index 00000000000..b0c5220315b --- /dev/null +++ b/service/history/engine/engineimpl/notify_tasks.go @@ -0,0 +1,144 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "errors" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + hcommon "github.com/uber/cadence/service/history/common" + "github.com/uber/cadence/service/history/events" + "github.com/uber/cadence/service/history/replication" +) + +func (e *historyEngineImpl) NotifyNewHistoryEvent(event *events.Notification) { + e.historyEventNotifier.NotifyNewHistoryEvent(event) +} + +func (e *historyEngineImpl) NotifyNewTransferTasks(info *hcommon.NotifyTaskInfo) { + if len(info.Tasks) == 0 { + return + } + + task := info.Tasks[0] + clusterName, err := e.clusterMetadata.ClusterNameForFailoverVersion(task.GetVersion()) + if err == nil { + e.txProcessor.NotifyNewTask(clusterName, info) + } +} + +func (e *historyEngineImpl) NotifyNewTimerTasks(info *hcommon.NotifyTaskInfo) { + if len(info.Tasks) == 0 { + return + } + + task := info.Tasks[0] + clusterName, err := e.clusterMetadata.ClusterNameForFailoverVersion(task.GetVersion()) + if err == nil { + e.timerProcessor.NotifyNewTask(clusterName, info) + } +} + +func (e *historyEngineImpl) NotifyNewCrossClusterTasks(info *hcommon.NotifyTaskInfo) { + taskByTargetCluster := make(map[string][]persistence.Task) + for _, task := range info.Tasks { + // TODO: consider defining a new interface in persistence package + // for cross cluster tasks and add a method for returning the target cluster + var targetCluster string + switch crossClusterTask := task.(type) { + case *persistence.CrossClusterStartChildExecutionTask: + targetCluster = crossClusterTask.TargetCluster + case *persistence.CrossClusterCancelExecutionTask: + targetCluster = crossClusterTask.TargetCluster + case *persistence.CrossClusterSignalExecutionTask: + targetCluster = crossClusterTask.TargetCluster + case *persistence.CrossClusterRecordChildExecutionCompletedTask: + targetCluster = crossClusterTask.TargetCluster + case 
*persistence.CrossClusterApplyParentClosePolicyTask: + targetCluster = crossClusterTask.TargetCluster + default: + panic("encountered unknown cross cluster task type") + } + taskByTargetCluster[targetCluster] = append(taskByTargetCluster[targetCluster], task) + } + + for targetCluster, tasks := range taskByTargetCluster { + e.crossClusterProcessor.NotifyNewTask(targetCluster, &hcommon.NotifyTaskInfo{ExecutionInfo: info.ExecutionInfo, Tasks: tasks, PersistenceError: info.PersistenceError}) + } +} + +func (e *historyEngineImpl) NotifyNewReplicationTasks(info *hcommon.NotifyTaskInfo) { + for _, task := range info.Tasks { + hTask, err := hydrateReplicationTask(task, info.ExecutionInfo, info.VersionHistories, info.Activities, info.History) + if err != nil { + e.logger.Error("failed to preemptively hydrate replication task", tag.Error(err)) + continue + } + e.replicationTaskStore.Put(hTask) + } +} + +func hydrateReplicationTask( + task persistence.Task, + exec *persistence.WorkflowExecutionInfo, + versionHistories *persistence.VersionHistories, + activities map[int64]*persistence.ActivityInfo, + history events.PersistedBlobs, +) (*types.ReplicationTask, error) { + info := persistence.ReplicationTaskInfo{ + DomainID: exec.DomainID, + WorkflowID: exec.WorkflowID, + RunID: exec.RunID, + TaskType: task.GetType(), + CreationTime: task.GetVisibilityTimestamp().UnixNano(), + TaskID: task.GetTaskID(), + Version: task.GetVersion(), + } + + switch t := task.(type) { + case *persistence.HistoryReplicationTask: + info.BranchToken = t.BranchToken + info.NewRunBranchToken = t.NewRunBranchToken + info.FirstEventID = t.FirstEventID + info.NextEventID = t.NextEventID + case *persistence.SyncActivityTask: + info.ScheduledID = t.ScheduledID + case *persistence.FailoverMarkerTask: + // No specific fields, but supported + default: + return nil, errors.New("unknown replication task") + } + + hydrator := replication.NewImmediateTaskHydrator( + exec.IsRunning(), + versionHistories, + activities, + history.Find(info.BranchToken, info.FirstEventID), + history.Find(info.NewRunBranchToken, common.FirstEventID), + ) + + return hydrator.Hydrate(context.Background(), info) +} diff --git a/service/history/engine/engineimpl/poll_mutable_state.go b/service/history/engine/engineimpl/poll_mutable_state.go new file mode 100644 index 00000000000..535da71ea96 --- /dev/null +++ b/service/history/engine/engineimpl/poll_mutable_state.go @@ -0,0 +1,248 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "bytes" + "context" + "time" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/definition" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" +) + +// GetMutableState retrieves the mutable state of the workflow execution +func (e *historyEngineImpl) GetMutableState(ctx context.Context, request *types.GetMutableStateRequest) (*types.GetMutableStateResponse, error) { + return e.getMutableStateOrPolling(ctx, request) +} + +// PollMutableState retrieves the mutable state of the workflow execution with long polling +func (e *historyEngineImpl) PollMutableState(ctx context.Context, request *types.PollMutableStateRequest) (*types.PollMutableStateResponse, error) { + response, err := e.getMutableStateOrPolling(ctx, &types.GetMutableStateRequest{ + DomainUUID: request.DomainUUID, + Execution: request.Execution, + ExpectedNextEventID: request.ExpectedNextEventID, + CurrentBranchToken: request.CurrentBranchToken}) + + if err != nil { + return nil, e.updateEntityNotExistsErrorOnPassiveCluster(err, request.GetDomainUUID()) + } + + return &types.PollMutableStateResponse{ + Execution: response.Execution, + WorkflowType: response.WorkflowType, + NextEventID: response.NextEventID, + PreviousStartedEventID: response.PreviousStartedEventID, + LastFirstEventID: response.LastFirstEventID, + TaskList: response.TaskList, + StickyTaskList: response.StickyTaskList, + ClientLibraryVersion: response.ClientLibraryVersion, + ClientFeatureVersion: response.ClientFeatureVersion, + ClientImpl: response.ClientImpl, + StickyTaskListScheduleToStartTimeout: response.StickyTaskListScheduleToStartTimeout, + CurrentBranchToken: response.CurrentBranchToken, + VersionHistories: response.VersionHistories, + WorkflowState: response.WorkflowState, + WorkflowCloseState: response.WorkflowCloseState, + }, nil +} + +func (e *historyEngineImpl) getMutableState( + ctx context.Context, + domainID string, + execution types.WorkflowExecution, +) (retResp *types.GetMutableStateResponse, retError error) { + + wfContext, release, retError := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, execution) + if retError != nil { + return + } + defer func() { release(retError) }() + + mutableState, retError := wfContext.LoadWorkflowExecution(ctx) + if retError != nil { + return + } + + currentBranchToken, err := mutableState.GetCurrentBranchToken() + if err != nil { + return nil, err + } + + executionInfo := mutableState.GetExecutionInfo() + execution.RunID = wfContext.GetExecution().RunID + workflowState, workflowCloseState := mutableState.GetWorkflowStateCloseStatus() + retResp = &types.GetMutableStateResponse{ + Execution: &execution, + WorkflowType: &types.WorkflowType{Name: executionInfo.WorkflowTypeName}, + LastFirstEventID: mutableState.GetLastFirstEventID(), + NextEventID: mutableState.GetNextEventID(), + PreviousStartedEventID: common.Int64Ptr(mutableState.GetPreviousStartedEventID()), + TaskList: &types.TaskList{Name: executionInfo.TaskList}, + StickyTaskList: &types.TaskList{Name: executionInfo.StickyTaskList, Kind: types.TaskListKindSticky.Ptr()}, + ClientLibraryVersion: executionInfo.ClientLibraryVersion, + ClientFeatureVersion: executionInfo.ClientFeatureVersion, 
+ ClientImpl: executionInfo.ClientImpl, + IsWorkflowRunning: mutableState.IsWorkflowExecutionRunning(), + StickyTaskListScheduleToStartTimeout: common.Int32Ptr(executionInfo.StickyScheduleToStartTimeout), + CurrentBranchToken: currentBranchToken, + WorkflowState: common.Int32Ptr(int32(workflowState)), + WorkflowCloseState: common.Int32Ptr(int32(workflowCloseState)), + IsStickyTaskListEnabled: mutableState.IsStickyTaskListEnabled(), + HistorySize: mutableState.GetHistorySize(), + } + versionHistories := mutableState.GetVersionHistories() + if versionHistories != nil { + retResp.VersionHistories = versionHistories.ToInternalType() + } + return +} + +func (e *historyEngineImpl) updateEntityNotExistsErrorOnPassiveCluster(err error, domainID string) error { + switch err.(type) { + case *types.EntityNotExistsError: + domainEntry, domainCacheErr := e.shard.GetDomainCache().GetDomainByID(domainID) + if domainCacheErr != nil { + return err // if could not access domain cache simply return original error + } + + if _, domainNotActiveErr := domainEntry.IsActiveIn(e.clusterMetadata.GetCurrentClusterName()); domainNotActiveErr != nil { + domainNotActiveErrCasted := domainNotActiveErr.(*types.DomainNotActiveError) + return &types.EntityNotExistsError{ + Message: "Workflow execution not found in non-active cluster", + ActiveCluster: domainNotActiveErrCasted.GetActiveCluster(), + CurrentCluster: domainNotActiveErrCasted.GetCurrentCluster(), + } + } + } + return err +} + +func (e *historyEngineImpl) getMutableStateOrPolling( + ctx context.Context, + request *types.GetMutableStateRequest, +) (*types.GetMutableStateResponse, error) { + + if err := common.ValidateDomainUUID(request.DomainUUID); err != nil { + return nil, err + } + domainID := request.DomainUUID + execution := types.WorkflowExecution{ + WorkflowID: request.Execution.WorkflowID, + RunID: request.Execution.RunID, + } + response, err := e.getMutableState(ctx, domainID, execution) + if err != nil { + return nil, err + } + if request.CurrentBranchToken == nil { + request.CurrentBranchToken = response.CurrentBranchToken + } + if !bytes.Equal(request.CurrentBranchToken, response.CurrentBranchToken) { + return nil, &types.CurrentBranchChangedError{ + Message: "current branch token and request branch token doesn't match", + CurrentBranchToken: response.CurrentBranchToken} + } + // set the run id in case query the current running workflow + execution.RunID = response.Execution.RunID + + // expectedNextEventID is 0 when caller want to get the current next event ID without blocking + expectedNextEventID := common.FirstEventID + if request.ExpectedNextEventID != 0 { + expectedNextEventID = request.GetExpectedNextEventID() + } + + // if caller decide to long poll on workflow execution + // and the event ID we are looking for is smaller than current next event ID + if expectedNextEventID >= response.GetNextEventID() && response.GetIsWorkflowRunning() { + subscriberID, channel, err := e.historyEventNotifier.WatchHistoryEvent(definition.NewWorkflowIdentifier(domainID, execution.GetWorkflowID(), execution.GetRunID())) + if err != nil { + return nil, err + } + defer e.historyEventNotifier.UnwatchHistoryEvent(definition.NewWorkflowIdentifier(domainID, execution.GetWorkflowID(), execution.GetRunID()), subscriberID) //nolint:errcheck + // check again in case the next event ID is updated + response, err = e.getMutableState(ctx, domainID, execution) + if err != nil { + return nil, err + } + // check again if the current branch token changed + if 
!bytes.Equal(request.CurrentBranchToken, response.CurrentBranchToken) { + return nil, &types.CurrentBranchChangedError{ + Message: "current branch token and request branch token doesn't match", + CurrentBranchToken: response.CurrentBranchToken} + } + if expectedNextEventID < response.GetNextEventID() || !response.GetIsWorkflowRunning() { + return response, nil + } + + domainName, err := e.shard.GetDomainCache().GetDomainName(domainID) + if err != nil { + return nil, err + } + + expirationInterval := e.shard.GetConfig().LongPollExpirationInterval(domainName) + if deadline, ok := ctx.Deadline(); ok { + remainingTime := deadline.Sub(e.shard.GetTimeSource().Now()) + // Here we return a safeguard error, to ensure that older clients are not stuck in long poll loop until context fully expires. + // Otherwise it results in multiple additional requests being made that returns empty responses. + // Newer clients will not make request with too small timeout remaining. + if remainingTime < longPollCompletionBuffer { + return nil, context.DeadlineExceeded + } + // longPollCompletionBuffer is here to leave some room to finish current request without its timeout. + expirationInterval = common.MinDuration( + expirationInterval, + remainingTime-longPollCompletionBuffer, + ) + } + if expirationInterval <= 0 { + return response, nil + } + timer := time.NewTimer(expirationInterval) + defer timer.Stop() + for { + select { + case event := <-channel: + response.LastFirstEventID = event.LastFirstEventID + response.NextEventID = event.NextEventID + response.IsWorkflowRunning = event.WorkflowCloseState == persistence.WorkflowCloseStatusNone + response.PreviousStartedEventID = common.Int64Ptr(event.PreviousStartedEventID) + response.WorkflowState = common.Int32Ptr(int32(event.WorkflowState)) + response.WorkflowCloseState = common.Int32Ptr(int32(event.WorkflowCloseState)) + if !bytes.Equal(request.CurrentBranchToken, event.CurrentBranchToken) { + return nil, &types.CurrentBranchChangedError{ + Message: "Current branch token and request branch token doesn't match", + CurrentBranchToken: event.CurrentBranchToken} + } + if expectedNextEventID < response.GetNextEventID() || !response.GetIsWorkflowRunning() { + return response, nil + } + case <-timer.C: + return response, nil + } + } + } + + return response, nil +} diff --git a/service/history/engine/engineimpl/query_workflow.go b/service/history/engine/engineimpl/query_workflow.go new file mode 100644 index 00000000000..15fef7f42e8 --- /dev/null +++ b/service/history/engine/engineimpl/query_workflow.go @@ -0,0 +1,330 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "time" + + "go.uber.org/yarpc/yarpcerrors" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/query" + "github.com/uber/cadence/service/history/workflow" +) + +func (e *historyEngineImpl) QueryWorkflow( + ctx context.Context, + request *types.HistoryQueryWorkflowRequest, +) (retResp *types.HistoryQueryWorkflowResponse, retErr error) { + + scope := e.metricsClient.Scope(metrics.HistoryQueryWorkflowScope).Tagged(metrics.DomainTag(request.GetRequest().GetDomain())) + shardMetricScope := e.metricsClient.Scope(metrics.HistoryQueryWorkflowScope, metrics.ShardIDTag(e.shard.GetShardID())) + + consistentQueryEnabled := e.config.EnableConsistentQuery() && e.config.EnableConsistentQueryByDomain(request.GetRequest().GetDomain()) + if request.GetRequest().GetQueryConsistencyLevel() == types.QueryConsistencyLevelStrong { + if !consistentQueryEnabled { + return nil, workflow.ErrConsistentQueryNotEnabled + } + shardMetricScope.IncCounter(metrics.ConsistentQueryPerShard) + e.logger.SampleInfo("History QueryWorkflow called with QueryConsistencyLevelStrong", e.config.SampleLoggingRate(), tag.ShardID(e.shard.GetShardID()), tag.WorkflowID(request.GetRequest().Execution.WorkflowID), tag.WorkflowDomainName(request.GetRequest().Domain)) + } + + execution := *request.GetRequest().GetExecution() + + mutableStateResp, err := e.getMutableState(ctx, request.GetDomainUUID(), execution) + if err != nil { + return nil, err + } + req := request.GetRequest() + if !mutableStateResp.GetIsWorkflowRunning() && req.QueryRejectCondition != nil { + notOpenReject := req.GetQueryRejectCondition() == types.QueryRejectConditionNotOpen + closeStatus := mutableStateResp.GetWorkflowCloseState() + notCompletedCleanlyReject := req.GetQueryRejectCondition() == types.QueryRejectConditionNotCompletedCleanly && closeStatus != persistence.WorkflowCloseStatusCompleted + if notOpenReject || notCompletedCleanlyReject { + return &types.HistoryQueryWorkflowResponse{ + Response: &types.QueryWorkflowResponse{ + QueryRejected: &types.QueryRejected{ + CloseStatus: persistence.ToInternalWorkflowExecutionCloseStatus(int(closeStatus)), + }, + }, + }, nil + } + } + + // query cannot be processed unless at least one decision task has finished + // if first decision task has not finished wait for up to a second for it to complete + queryFirstDecisionTaskWaitTime := defaultQueryFirstDecisionTaskWaitTime + ctxDeadline, ok := ctx.Deadline() + if ok { + ctxWaitTime := time.Until(ctxDeadline) - time.Second + if ctxWaitTime > queryFirstDecisionTaskWaitTime { + queryFirstDecisionTaskWaitTime = ctxWaitTime + } + } + deadline := time.Now().Add(queryFirstDecisionTaskWaitTime) + for mutableStateResp.GetPreviousStartedEventID() <= 0 && time.Now().Before(deadline) { + <-time.After(queryFirstDecisionTaskCheckInterval) + mutableStateResp, err = e.getMutableState(ctx, request.GetDomainUUID(), execution) + if err != nil { + return nil, err + } + } + + if mutableStateResp.GetPreviousStartedEventID() <= 0 { + 
scope.IncCounter(metrics.QueryBeforeFirstDecisionCount) + return nil, workflow.ErrQueryWorkflowBeforeFirstDecision + } + + de, err := e.shard.GetDomainCache().GetDomainByID(request.GetDomainUUID()) + if err != nil { + return nil, err + } + + wfContext, release, err := e.executionCache.GetOrCreateWorkflowExecution(ctx, request.GetDomainUUID(), execution) + if err != nil { + return nil, err + } + defer func() { release(retErr) }() + mutableState, err := wfContext.LoadWorkflowExecution(ctx) + if err != nil { + return nil, err + } + // If history is corrupted, query will be rejected + if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { + return nil, err + } else if corrupted { + return nil, &types.EntityNotExistsError{Message: "Workflow execution corrupted."} + } + + // There are two ways in which queries get dispatched to decider. First, queries can be dispatched on decision tasks. + // These decision tasks potentially contain new events and queries. The events are treated as coming before the query in time. + // The second way in which queries are dispatched to decider is directly through matching; in this approach queries can be + // dispatched to decider immediately even if there are outstanding events that came before the query. The following logic + // is used to determine if a query can be safely dispatched directly through matching or if given the desired consistency + // level must be dispatched on a decision task. There are four cases in which a query can be dispatched directly through + // matching safely, without violating the desired consistency level: + // 1. the domain is not active, in this case history is immutable so a query dispatched at any time is consistent + // 2. the workflow is not running, whenever a workflow is not running dispatching query directly is consistent + // 3. the client requested eventual consistency, in this case there are no consistency requirements so dispatching directly through matching is safe + // 4. if there is no pending or started decision it means no events came before query arrived, so its safe to dispatch directly + isActive, _ := de.IsActiveIn(e.clusterMetadata.GetCurrentClusterName()) + safeToDispatchDirectly := !isActive || + !mutableState.IsWorkflowExecutionRunning() || + req.GetQueryConsistencyLevel() == types.QueryConsistencyLevelEventual || + (!mutableState.HasPendingDecision() && !mutableState.HasInFlightDecision()) + if safeToDispatchDirectly { + release(nil) + msResp, err := e.getMutableState(ctx, request.GetDomainUUID(), execution) + if err != nil { + return nil, err + } + req.Execution.RunID = msResp.Execution.RunID + return e.queryDirectlyThroughMatching(ctx, msResp, request.GetDomainUUID(), req, scope) + } + + // If we get here it means query could not be dispatched through matching directly, so it must block + // until either an result has been obtained on a decision task response or until it is safe to dispatch directly through matching. 
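// Editor's note: the block below is an illustrative, self-contained sketch of the
// blocking pattern described in the comment above (buffer a query, then wait until
// the decider terminates it or the caller's context expires). It is not part of this
// patch; queryRegistry, bufferedQuery and queryResult are hypothetical stand-ins and
// do not reflect Cadence's actual query registry API.
package main

import (
	"context"
	"fmt"
	"time"
)

type queryResult struct {
	answer string
}

type bufferedQuery struct {
	termCh chan struct{} // closed once a result has been recorded
	result queryResult
}

type queryRegistry struct {
	queries map[string]*bufferedQuery
}

func newQueryRegistry() *queryRegistry {
	return &queryRegistry{queries: map[string]*bufferedQuery{}}
}

// bufferQuery registers a pending query and returns a channel that is closed when
// the decider terminates it. A real registry would also need locking and cleanup
// of abandoned queries.
func (r *queryRegistry) bufferQuery(id string) <-chan struct{} {
	q := &bufferedQuery{termCh: make(chan struct{})}
	r.queries[id] = q
	return q.termCh
}

// completeQuery records the decider's answer and unblocks any waiter.
func (r *queryRegistry) completeQuery(id string, res queryResult) {
	if q, ok := r.queries[id]; ok {
		q.result = res
		close(q.termCh)
	}
}

// waitForQuery mirrors the select on termCh / ctx.Done() above: block until the
// query terminates or the caller's context expires.
func (r *queryRegistry) waitForQuery(ctx context.Context, id string, termCh <-chan struct{}) (queryResult, error) {
	select {
	case <-termCh:
		return r.queries[id].result, nil
	case <-ctx.Done():
		return queryResult{}, ctx.Err()
	}
}

func main() {
	reg := newQueryRegistry()
	termCh := reg.bufferQuery("q1")

	// Simulate the decider answering the buffered query on its next decision task.
	go func() {
		time.Sleep(10 * time.Millisecond)
		reg.completeQuery("q1", queryResult{answer: "workflow is open"})
	}()

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	res, err := reg.waitForQuery(ctx, "q1", termCh)
	if err != nil {
		fmt.Println("query failed:", err)
		return
	}
	fmt.Println("query answered:", res.answer)
}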
+ sw := scope.StartTimer(metrics.DecisionTaskQueryLatency) + defer sw.Stop() + queryReg := mutableState.GetQueryRegistry() + if len(queryReg.GetBufferedIDs()) >= e.config.MaxBufferedQueryCount() { + scope.IncCounter(metrics.QueryBufferExceededCount) + return nil, workflow.ErrConsistentQueryBufferExceeded + } + queryID, termCh := queryReg.BufferQuery(req.GetQuery()) + defer queryReg.RemoveQuery(queryID) + release(nil) + select { + case <-termCh: + state, err := queryReg.GetTerminationState(queryID) + if err != nil { + scope.IncCounter(metrics.QueryRegistryInvalidStateCount) + return nil, err + } + switch state.TerminationType { + case query.TerminationTypeCompleted: + result := state.QueryResult + switch result.GetResultType() { + case types.QueryResultTypeAnswered: + return &types.HistoryQueryWorkflowResponse{ + Response: &types.QueryWorkflowResponse{ + QueryResult: result.GetAnswer(), + }, + }, nil + case types.QueryResultTypeFailed: + return nil, &types.QueryFailedError{Message: result.GetErrorMessage()} + default: + scope.IncCounter(metrics.QueryRegistryInvalidStateCount) + return nil, workflow.ErrQueryEnteredInvalidState + } + case query.TerminationTypeUnblocked: + msResp, err := e.getMutableState(ctx, request.GetDomainUUID(), execution) + if err != nil { + return nil, err + } + req.Execution.RunID = msResp.Execution.RunID + return e.queryDirectlyThroughMatching(ctx, msResp, request.GetDomainUUID(), req, scope) + case query.TerminationTypeFailed: + return nil, state.Failure + default: + scope.IncCounter(metrics.QueryRegistryInvalidStateCount) + return nil, workflow.ErrQueryEnteredInvalidState + } + case <-ctx.Done(): + scope.IncCounter(metrics.ConsistentQueryTimeoutCount) + return nil, ctx.Err() + } +} + +func (e *historyEngineImpl) queryDirectlyThroughMatching( + ctx context.Context, + msResp *types.GetMutableStateResponse, + domainID string, + queryRequest *types.QueryWorkflowRequest, + scope metrics.Scope, +) (*types.HistoryQueryWorkflowResponse, error) { + + sw := scope.StartTimer(metrics.DirectQueryDispatchLatency) + defer sw.Stop() + + // Sticky task list is not very useful in the standby cluster because the decider cache is + // not updated by dispatching tasks to it (it is only updated in the case of query). + // Additionally on the standby side we are not even able to clear sticky. + // Stickiness might be outdated if the customer did a restart of their nodes causing a query + // dispatched on the standby side on sticky to hang. We decided it made sense to simply not attempt + // query on sticky task list at all on the passive side. 
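// Editor's note: illustrative, self-contained sketch (not part of this patch) of the
// dispatch strategy implemented below: try the sticky task list with a short timeout
// that is deliberately independent of the caller's deadline, then fall back to the
// non-sticky task list using the caller's context. The queryFn dispatch functions are
// hypothetical stand-ins, not matching-client APIs.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// queryFn stands in for a matching-client QueryWorkflow call.
type queryFn func(ctx context.Context) (string, error)

// queryWithFallback tries the sticky dispatch first, then falls back to non-sticky.
func queryWithFallback(ctx context.Context, sticky, nonSticky queryFn, stickyTimeout time.Duration) (string, error) {
	// Use a fresh context for the sticky attempt so a caller with a very short
	// deadline does not immediately force the sticky path to fail.
	stickyCtx, cancel := context.WithTimeout(context.Background(), stickyTimeout)
	defer cancel()
	if answer, err := sticky(stickyCtx); err == nil {
		return answer, nil
	}

	// Before falling back, make sure the caller still has budget left.
	if err := ctx.Err(); err != nil {
		return "", err
	}
	return nonSticky(ctx)
}

func main() {
	sticky := func(ctx context.Context) (string, error) {
		// Simulate an unavailable sticky worker: block until the short timeout fires.
		<-ctx.Done()
		return "", errors.New("sticky worker unavailable")
	}
	nonSticky := func(ctx context.Context) (string, error) {
		return "answer-from-non-sticky", nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	answer, err := queryWithFallback(ctx, sticky, nonSticky, 50*time.Millisecond)
	fmt.Println(answer, err)
}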
+ de, err := e.shard.GetDomainCache().GetDomainByID(domainID) + if err != nil { + return nil, err + } + supportsStickyQuery := e.clientChecker.SupportsStickyQuery(msResp.GetClientImpl(), msResp.GetClientFeatureVersion()) == nil + domainIsActive, _ := de.IsActiveIn(e.clusterMetadata.GetCurrentClusterName()) + if msResp.GetIsStickyTaskListEnabled() && + len(msResp.GetStickyTaskList().GetName()) != 0 && + supportsStickyQuery && + e.config.EnableStickyQuery(queryRequest.GetDomain()) && + domainIsActive { + + stickyMatchingRequest := &types.MatchingQueryWorkflowRequest{ + DomainUUID: domainID, + QueryRequest: queryRequest, + TaskList: msResp.GetStickyTaskList(), + } + + // using a clean new context in case customer provide a context which has + // a really short deadline, causing we clear the stickiness + stickyContext, cancel := context.WithTimeout(context.Background(), time.Duration(msResp.GetStickyTaskListScheduleToStartTimeout())*time.Second) + stickyStopWatch := scope.StartTimer(metrics.DirectQueryDispatchStickyLatency) + matchingResp, err := e.rawMatchingClient.QueryWorkflow(stickyContext, stickyMatchingRequest) + stickyStopWatch.Stop() + cancel() + if err == nil { + scope.IncCounter(metrics.DirectQueryDispatchStickySuccessCount) + return &types.HistoryQueryWorkflowResponse{Response: matchingResp}, nil + } + switch v := err.(type) { + case *types.StickyWorkerUnavailableError: + case *yarpcerrors.Status: + if v.Code() != yarpcerrors.CodeDeadlineExceeded { + e.logger.Error("query directly though matching on sticky failed, will not attempt query on non-sticky", + tag.WorkflowDomainName(queryRequest.GetDomain()), + tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), + tag.WorkflowRunID(queryRequest.Execution.GetRunID()), + tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), + tag.Error(err)) + return nil, err + } + default: + e.logger.Error("query directly though matching on sticky failed, will not attempt query on non-sticky", + tag.WorkflowDomainName(queryRequest.GetDomain()), + tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), + tag.WorkflowRunID(queryRequest.Execution.GetRunID()), + tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), + tag.Error(err)) + return nil, err + } + if msResp.GetIsWorkflowRunning() { + e.logger.Info("query direct through matching failed on sticky, clearing sticky before attempting on non-sticky", + tag.WorkflowDomainName(queryRequest.GetDomain()), + tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), + tag.WorkflowRunID(queryRequest.Execution.GetRunID()), + tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), + tag.Error(err)) + resetContext, cancel := context.WithTimeout(context.Background(), 5*time.Second) + clearStickinessStopWatch := scope.StartTimer(metrics.DirectQueryDispatchClearStickinessLatency) + _, err := e.ResetStickyTaskList(resetContext, &types.HistoryResetStickyTaskListRequest{ + DomainUUID: domainID, + Execution: queryRequest.GetExecution(), + }) + clearStickinessStopWatch.Stop() + cancel() + if err != nil && err != workflow.ErrAlreadyCompleted && err != workflow.ErrNotExists { + return nil, err + } + scope.IncCounter(metrics.DirectQueryDispatchClearStickinessSuccessCount) + } + } + + if err := common.IsValidContext(ctx); err != nil { + e.logger.Info("query context timed out before query on non-sticky task list could be attempted", + tag.WorkflowDomainName(queryRequest.GetDomain()), + tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), + tag.WorkflowRunID(queryRequest.Execution.GetRunID()), + 
tag.WorkflowQueryType(queryRequest.Query.GetQueryType())) + scope.IncCounter(metrics.DirectQueryDispatchTimeoutBeforeNonStickyCount) + return nil, err + } + + e.logger.Debug("query directly through matching on sticky timed out, attempting to query on non-sticky", + tag.WorkflowDomainName(queryRequest.GetDomain()), + tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), + tag.WorkflowRunID(queryRequest.Execution.GetRunID()), + tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), + tag.WorkflowTaskListName(msResp.GetStickyTaskList().GetName()), + tag.WorkflowNextEventID(msResp.GetNextEventID())) + + nonStickyMatchingRequest := &types.MatchingQueryWorkflowRequest{ + DomainUUID: domainID, + QueryRequest: queryRequest, + TaskList: msResp.TaskList, + } + + nonStickyStopWatch := scope.StartTimer(metrics.DirectQueryDispatchNonStickyLatency) + matchingResp, err := e.matchingClient.QueryWorkflow(ctx, nonStickyMatchingRequest) + nonStickyStopWatch.Stop() + if err != nil { + e.logger.Error("query directly though matching on non-sticky failed", + tag.WorkflowDomainName(queryRequest.GetDomain()), + tag.WorkflowID(queryRequest.Execution.GetWorkflowID()), + tag.WorkflowRunID(queryRequest.Execution.GetRunID()), + tag.WorkflowQueryType(queryRequest.Query.GetQueryType()), + tag.Error(err)) + return nil, err + } + scope.IncCounter(metrics.DirectQueryDispatchNonStickySuccessCount) + return &types.HistoryQueryWorkflowResponse{Response: matchingResp}, err +} diff --git a/service/history/engine/engineimpl/reapply_events.go b/service/history/engine/engineimpl/reapply_events.go new file mode 100644 index 00000000000..a4497752a37 --- /dev/null +++ b/service/history/engine/engineimpl/reapply_events.go @@ -0,0 +1,180 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + + "github.com/pborman/uuid" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/definition" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/ndc" + "github.com/uber/cadence/service/history/workflow" +) + +func (e *historyEngineImpl) ReapplyEvents( + ctx context.Context, + domainUUID string, + workflowID string, + runID string, + reapplyEvents []*types.HistoryEvent, +) error { + + domainEntry, err := e.getActiveDomainByID(domainUUID) + if err != nil { + switch { + case domainEntry != nil && domainEntry.IsDomainPendingActive(): + return nil + default: + return err + } + } + domainID := domainEntry.GetInfo().ID + // remove run id from the execution so that reapply events to the current run + currentExecution := types.WorkflowExecution{ + WorkflowID: workflowID, + } + + return workflow.UpdateWithActionFunc( + ctx, + e.executionCache, + domainID, + currentExecution, + e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { + // Filter out reapply event from the same cluster + toReapplyEvents := make([]*types.HistoryEvent, 0, len(reapplyEvents)) + lastWriteVersion, err := mutableState.GetLastWriteVersion() + if err != nil { + return nil, err + } + for _, event := range reapplyEvents { + if event.Version == lastWriteVersion { + // The reapply is from the same cluster. Ignoring. + continue + } + dedupResource := definition.NewEventReappliedID(runID, event.ID, event.Version) + if mutableState.IsResourceDuplicated(dedupResource) { + // already apply the signal + continue + } + toReapplyEvents = append(toReapplyEvents, event) + } + if len(toReapplyEvents) == 0 { + return &workflow.UpdateAction{ + Noop: true, + }, nil + } + + if !mutableState.IsWorkflowExecutionRunning() { + // need to reset target workflow (which is also the current workflow) + // to accept events to be reapplied + baseRunID := mutableState.GetExecutionInfo().RunID + resetRunID := uuid.New() + baseRebuildLastEventID := mutableState.GetPreviousStartedEventID() + + // TODO when https://github.com/uber/cadence/issues/2420 is finished, remove this block, + // since cannot reapply event to a finished workflow which had no decisions started + if baseRebuildLastEventID == common.EmptyEventID { + e.logger.Warn("cannot reapply event to a finished workflow", + tag.WorkflowDomainID(domainID), + tag.WorkflowID(currentExecution.GetWorkflowID()), + ) + e.metricsClient.IncCounter(metrics.HistoryReapplyEventsScope, metrics.EventReapplySkippedCount) + return &workflow.UpdateAction{Noop: true}, nil + } + + baseVersionHistories := mutableState.GetVersionHistories() + if baseVersionHistories == nil { + return nil, execution.ErrMissingVersionHistories + } + baseCurrentVersionHistory, err := baseVersionHistories.GetCurrentVersionHistory() + if err != nil { + return nil, err + } + baseRebuildLastEventVersion, err := baseCurrentVersionHistory.GetEventVersion(baseRebuildLastEventID) + if err != nil { + return nil, err + } + baseCurrentBranchToken := baseCurrentVersionHistory.GetBranchToken() + baseNextEventID := mutableState.GetNextEventID() + + if err = e.workflowResetter.ResetWorkflow( + ctx, + domainID, + workflowID, + baseRunID, + baseCurrentBranchToken, + baseRebuildLastEventID, + baseRebuildLastEventVersion, + baseNextEventID, + resetRunID, + 
uuid.New(), + execution.NewWorkflow( + ctx, + e.shard.GetClusterMetadata(), + wfContext, + mutableState, + execution.NoopReleaseFn, + ), + ndc.EventsReapplicationResetWorkflowReason, + toReapplyEvents, + false, + ); err != nil { + return nil, err + } + return &workflow.UpdateAction{ + Noop: true, + }, nil + } + + postActions := &workflow.UpdateAction{ + CreateDecision: true, + } + // Do not create decision task when the workflow is cron and the cron has not been started yet + if mutableState.GetExecutionInfo().CronSchedule != "" && !mutableState.HasProcessedOrPendingDecision() { + postActions.CreateDecision = false + } + reappliedEvents, err := e.eventsReapplier.ReapplyEvents( + ctx, + mutableState, + toReapplyEvents, + runID, + ) + if err != nil { + e.logger.Error("failed to re-apply stale events", tag.Error(err)) + return nil, &types.InternalServiceError{Message: "unable to re-apply stale events"} + } + if len(reappliedEvents) == 0 { + return &workflow.UpdateAction{ + Noop: true, + }, nil + } + return postActions, nil + }, + ) +} diff --git a/service/history/engine/engineimpl/record_activity_task_started.go b/service/history/engine/engineimpl/record_activity_task_started.go new file mode 100644 index 00000000000..ccb3d47d430 --- /dev/null +++ b/service/history/engine/engineimpl/record_activity_task_started.go @@ -0,0 +1,169 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
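// Editor's note: illustrative, self-contained sketch (not part of this patch) of the
// dedup filter used by ReapplyEvents above: drop events written by the local cluster
// (same last-write version) and events whose (runID, eventID, version) triple has
// already been applied. It is simplified in that it marks events as applied when they
// are selected; the event type and applied-set below are stand-ins, not Cadence types.
package main

import "fmt"

type historyEvent struct {
	ID      int64
	Version int64
}

type reappliedKey struct {
	runID   string
	eventID int64
	version int64
}

// filterReapplyEvents returns the events that still need to be reapplied and records
// them in the applied set so a later delivery of the same batch becomes a no-op.
func filterReapplyEvents(
	runID string,
	lastWriteVersion int64,
	applied map[reappliedKey]struct{},
	events []historyEvent,
) []historyEvent {
	var toReapply []historyEvent
	for _, event := range events {
		if event.Version == lastWriteVersion {
			// Event originated from this cluster; nothing to reapply.
			continue
		}
		key := reappliedKey{runID: runID, eventID: event.ID, version: event.Version}
		if _, ok := applied[key]; ok {
			// Already reapplied on an earlier delivery.
			continue
		}
		applied[key] = struct{}{}
		toReapply = append(toReapply, event)
	}
	return toReapply
}

func main() {
	applied := map[reappliedKey]struct{}{}
	events := []historyEvent{{ID: 5, Version: 100}, {ID: 6, Version: 200}}

	fmt.Println(filterReapplyEvents("run-1", 100, applied, events)) // only event 6 remains
	fmt.Println(filterReapplyEvents("run-1", 100, applied, events)) // empty: both deduped
}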
+ +package engineimpl + +import ( + "context" + "time" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +func (e *historyEngineImpl) RecordActivityTaskStarted( + ctx context.Context, + request *types.RecordActivityTaskStartedRequest, +) (*types.RecordActivityTaskStartedResponse, error) { + + domainEntry, err := e.getActiveDomainByID(request.DomainUUID) + if err != nil { + return nil, err + } + + domainInfo := domainEntry.GetInfo() + + domainID := domainInfo.ID + domainName := domainInfo.Name + + workflowExecution := types.WorkflowExecution{ + WorkflowID: request.WorkflowExecution.WorkflowID, + RunID: request.WorkflowExecution.RunID, + } + + var resurrectError error + response := &types.RecordActivityTaskStartedResponse{} + err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, false, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) error { + if !mutableState.IsWorkflowExecutionRunning() { + return workflow.ErrNotExists + } + + scheduleID := request.GetScheduleID() + requestID := request.GetRequestID() + ai, isRunning := mutableState.GetActivityInfo(scheduleID) + + // RecordActivityTaskStarted is already past scheduleToClose timeout. + // If at this point pending activity is still in mutable state it may be resurrected. + // Otherwise it would be completed or timed out already. + if isRunning && e.timeSource.Now().After(ai.ScheduledTime.Add(time.Duration(ai.ScheduleToCloseTimeout)*time.Second)) { + resurrectedActivities, err := execution.GetResurrectedActivities(ctx, e.shard, mutableState) + if err != nil { + e.logger.Error("Activity resurrection check failed", tag.Error(err)) + return err + } + + if _, ok := resurrectedActivities[scheduleID]; ok { + // found activity resurrection + domainName := mutableState.GetDomainEntry().GetInfo().Name + e.metricsClient.IncCounter(metrics.HistoryRecordActivityTaskStartedScope, metrics.ActivityResurrectionCounter) + e.logger.Error("Encounter resurrected activity, skip", + tag.WorkflowDomainName(domainName), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + ) + + // remove resurrected activity from mutable state + if err := mutableState.DeleteActivity(scheduleID); err != nil { + return err + } + + // save resurrection error but return nil here, so that mutable state would get updated in DB + resurrectError = workflow.ErrActivityTaskNotFound + return nil + } + } + + // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in + // some extreme cassandra failure cases. 
+ if !isRunning && scheduleID >= mutableState.GetNextEventID() { + e.metricsClient.IncCounter(metrics.HistoryRecordActivityTaskStartedScope, metrics.StaleMutableStateCounter) + e.logger.Error("Encounter stale mutable state in RecordActivityTaskStarted", + tag.WorkflowDomainName(domainName), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return workflow.ErrStaleState + } + + // Check execution state to make sure task is in the list of outstanding tasks and it is not yet started. If + // task is not outstanding than it is most probably a duplicate and complete the task. + if !isRunning { + // Looks like ActivityTask already completed as a result of another call. + // It is OK to drop the task at this point. + e.logger.Debug("Potentially duplicate task.", tag.TaskID(request.GetTaskID()), tag.WorkflowScheduleID(scheduleID), tag.TaskType(persistence.TransferTaskTypeActivityTask)) + return workflow.ErrActivityTaskNotFound + } + + scheduledEvent, err := mutableState.GetActivityScheduledEvent(ctx, scheduleID) + if err != nil { + return err + } + response.ScheduledEvent = scheduledEvent + response.ScheduledTimestampOfThisAttempt = common.Int64Ptr(ai.ScheduledTime.UnixNano()) + + response.Attempt = int64(ai.Attempt) + response.HeartbeatDetails = ai.Details + + response.WorkflowType = mutableState.GetWorkflowType() + response.WorkflowDomain = domainName + + if ai.StartedID != common.EmptyEventID { + // If activity is started as part of the current request scope then return a positive response + if ai.RequestID == requestID { + response.StartedTimestamp = common.Int64Ptr(ai.StartedTime.UnixNano()) + return nil + } + + // Looks like ActivityTask already started as a result of another call. + // It is OK to drop the task at this point. + e.logger.Debug("Potentially duplicate task.", tag.TaskID(request.GetTaskID()), tag.WorkflowScheduleID(scheduleID), tag.TaskType(persistence.TransferTaskTypeActivityTask)) + return &types.EventAlreadyStartedError{Message: "Activity task already started."} + } + + if _, err := mutableState.AddActivityTaskStartedEvent( + ai, scheduleID, requestID, request.PollRequest.GetIdentity(), + ); err != nil { + return err + } + + response.StartedTimestamp = common.Int64Ptr(ai.StartedTime.UnixNano()) + + return nil + }) + + if err != nil { + return nil, err + } + if resurrectError != nil { + return nil, resurrectError + } + + return response, err +} diff --git a/service/history/engine/engineimpl/record_child_execution_completed.go b/service/history/engine/engineimpl/record_child_execution_completed.go new file mode 100644 index 00000000000..9aaef637017 --- /dev/null +++ b/service/history/engine/engineimpl/record_child_execution_completed.go @@ -0,0 +1,118 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// RecordChildExecutionCompleted records the completion of child execution into parent execution history +func (e *historyEngineImpl) RecordChildExecutionCompleted( + ctx context.Context, + completionRequest *types.RecordChildExecutionCompletedRequest, +) error { + + domainEntry, err := e.getActiveDomainByID(completionRequest.DomainUUID) + if err != nil { + return err + } + domainID := domainEntry.GetInfo().ID + + workflowExecution := types.WorkflowExecution{ + WorkflowID: completionRequest.WorkflowExecution.GetWorkflowID(), + RunID: completionRequest.WorkflowExecution.GetRunID(), + } + + return e.updateWithActionFn(ctx, e.executionCache, domainID, workflowExecution, true, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) error { + if !mutableState.IsWorkflowExecutionRunning() { + return workflow.ErrNotExists + } + + initiatedID := completionRequest.InitiatedID + startedID := completionRequest.StartedID + completedExecution := completionRequest.CompletedExecution + completionEvent := completionRequest.CompletionEvent + + // Check mutable state to make sure child execution is in pending child executions + ci, isRunning := mutableState.GetChildExecutionInfo(initiatedID) + if !isRunning { + if initiatedID >= mutableState.GetNextEventID() { + e.metricsClient.IncCounter(metrics.HistoryRecordChildExecutionCompletedScope, metrics.StaleMutableStateCounter) + e.logger.Error("Encounter stale mutable state in RecordChildExecutionCompleted", + tag.WorkflowDomainName(domainEntry.GetInfo().Name), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowInitiatedID(initiatedID), + tag.WorkflowStartedID(startedID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return workflow.ErrStaleState + } + return &types.EntityNotExistsError{Message: "Pending child execution not found."} + } + if ci.StartedID == common.EmptyEventID { + if startedID >= mutableState.GetNextEventID() { + e.metricsClient.IncCounter(metrics.HistoryRecordChildExecutionCompletedScope, metrics.StaleMutableStateCounter) + 
e.logger.Error("Encounter stale mutable state in RecordChildExecutionCompleted", + tag.WorkflowDomainName(domainEntry.GetInfo().Name), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowInitiatedID(initiatedID), + tag.WorkflowStartedID(startedID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return workflow.ErrStaleState + } + return &types.EntityNotExistsError{Message: "Pending child execution not started."} + } + if ci.StartedWorkflowID != completedExecution.GetWorkflowID() { + return &types.EntityNotExistsError{Message: "Pending child execution workflowID mismatch."} + } + + switch *completionEvent.EventType { + case types.EventTypeWorkflowExecutionCompleted: + attributes := completionEvent.WorkflowExecutionCompletedEventAttributes + _, err = mutableState.AddChildWorkflowExecutionCompletedEvent(initiatedID, completedExecution, attributes) + case types.EventTypeWorkflowExecutionFailed: + attributes := completionEvent.WorkflowExecutionFailedEventAttributes + _, err = mutableState.AddChildWorkflowExecutionFailedEvent(initiatedID, completedExecution, attributes) + case types.EventTypeWorkflowExecutionCanceled: + attributes := completionEvent.WorkflowExecutionCanceledEventAttributes + _, err = mutableState.AddChildWorkflowExecutionCanceledEvent(initiatedID, completedExecution, attributes) + case types.EventTypeWorkflowExecutionTerminated: + attributes := completionEvent.WorkflowExecutionTerminatedEventAttributes + _, err = mutableState.AddChildWorkflowExecutionTerminatedEvent(initiatedID, completedExecution, attributes) + case types.EventTypeWorkflowExecutionTimedOut: + attributes := completionEvent.WorkflowExecutionTimedOutEventAttributes + _, err = mutableState.AddChildWorkflowExecutionTimedOutEvent(initiatedID, completedExecution, attributes) + } + return err + }) +} diff --git a/service/history/engine/engineimpl/record_decision_task_started.go b/service/history/engine/engineimpl/record_decision_task_started.go new file mode 100644 index 00000000000..f565211c7a8 --- /dev/null +++ b/service/history/engine/engineimpl/record_decision_task_started.go @@ -0,0 +1,33 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/types" +) + +// RecordDecisionTaskStarted starts a decision +func (e *historyEngineImpl) RecordDecisionTaskStarted(ctx context.Context, request *types.RecordDecisionTaskStartedRequest) (*types.RecordDecisionTaskStartedResponse, error) { + return e.decisionHandler.HandleDecisionTaskStarted(ctx, request) +} diff --git a/service/history/engine/engineimpl/refresh_workflow_tasks.go b/service/history/engine/engineimpl/refresh_workflow_tasks.go new file mode 100644 index 00000000000..462f4c2369b --- /dev/null +++ b/service/history/engine/engineimpl/refresh_workflow_tasks.go @@ -0,0 +1,71 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" +) + +func (e *historyEngineImpl) RefreshWorkflowTasks( + ctx context.Context, + domainUUID string, + workflowExecution types.WorkflowExecution, +) (retError error) { + domainEntry, err := e.shard.GetDomainCache().GetDomainByID(domainUUID) + if err != nil { + return err + } + domainID := domainEntry.GetInfo().ID + + wfContext, release, err := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, workflowExecution) + if err != nil { + return err + } + defer func() { release(retError) }() + + mutableState, err := wfContext.LoadWorkflowExecution(ctx) + if err != nil { + return err + } + + mutableStateTaskRefresher := execution.NewMutableStateTaskRefresher( + e.shard.GetConfig(), + e.shard.GetClusterMetadata(), + e.shard.GetDomainCache(), + e.shard.GetEventsCache(), + e.shard.GetShardID(), + ) + + err = mutableStateTaskRefresher.RefreshTasks(ctx, mutableState.GetExecutionInfo().StartTimestamp, mutableState) + if err != nil { + return err + } + + err = wfContext.UpdateWorkflowExecutionTasks(ctx, e.shard.GetTimeSource().Now()) + if err != nil { + return err + } + return nil +} diff --git a/service/history/engine/engineimpl/register_domain_failover_callback.go b/service/history/engine/engineimpl/register_domain_failover_callback.go new file mode 100644 index 00000000000..e39380eaa82 --- /dev/null +++ b/service/history/engine/engineimpl/register_domain_failover_callback.go @@ -0,0 +1,160 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. 
+// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/cache" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/persistence" + hcommon "github.com/uber/cadence/service/history/common" +) + +func (e *historyEngineImpl) registerDomainFailoverCallback() { + + // NOTE: READ BEFORE MODIFICATION + // + // Tasks, e.g. transfer tasks and timer tasks, are created when holding the shard lock + // meaning tasks -> release of shard lock + // + // Domain change notification follows the following steps, order matters + // 1. lock all task processing. + // 2. domain changes visible to everyone (Note: lock of task processing prevents task processing logic seeing the domain changes). + // 3. failover min and max task levels are calculated, then update to shard. + // 4. failover start & task processing unlock & shard domain version notification update. (order does not matter for this discussion) + // + // The above guarantees that task created during the failover will be processed. + // If the task is created after domain change: + // then active processor will handle it. (simple case) + // If the task is created before domain change: + // task -> release of shard lock + // failover min / max task levels calculated & updated to shard (using shard lock) -> failover start + // above 2 guarantees that failover start is after persistence of the task. 
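// Editor's note: illustrative, self-contained sketch (not part of this patch) of the
// two-phase callback ordering described in the NOTE above: a "prepare" callback locks
// task processing before the domain change becomes visible, and the change callback
// unlocks it only after failover levels have been updated. The cache and processor
// types below are minimal stand-ins, not Cadence APIs.
package main

import (
	"fmt"
	"sync"
)

type taskProcessor struct {
	mu sync.Mutex
}

func (p *taskProcessor) LockTaskProcessing()   { p.mu.Lock() }
func (p *taskProcessor) UnlockTaskProcessing() { p.mu.Unlock() }

// ProcessTask holds the same lock, so it cannot interleave with a domain change.
func (p *taskProcessor) ProcessTask(task string) {
	p.mu.Lock()
	defer p.mu.Unlock()
	fmt.Println("processed", task)
}

type domainCache struct {
	prepare func()
	apply   func(domains []string)
}

// RegisterDomainChangeCallback mirrors the prepare/apply pair registered below.
func (c *domainCache) RegisterDomainChangeCallback(prepare func(), apply func([]string)) {
	c.prepare = prepare
	c.apply = apply
}

// NotifyDomainChange runs prepare before the change is visible and apply afterwards.
func (c *domainCache) NotifyDomainChange(domains []string) {
	c.prepare()
	c.apply(domains)
}

func main() {
	processor := &taskProcessor{}
	cache := &domainCache{}

	cache.RegisterDomainChangeCallback(
		func() { processor.LockTaskProcessing() },
		func(domains []string) {
			defer processor.UnlockTaskProcessing()
			// With processing locked, update failover levels for the changed domains.
			fmt.Println("failover started for", domains)
		},
	)

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		processor.ProcessTask("transfer-task-1") // serialized against the domain change
	}()
	cache.NotifyDomainChange([]string{"domain-a"})
	wg.Wait()
}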
+ + failoverPredicate := func(shardNotificationVersion int64, nextDomain *cache.DomainCacheEntry, action func()) { + domainFailoverNotificationVersion := nextDomain.GetFailoverNotificationVersion() + domainActiveCluster := nextDomain.GetReplicationConfig().ActiveClusterName + + if nextDomain.IsGlobalDomain() && + domainFailoverNotificationVersion >= shardNotificationVersion && + domainActiveCluster == e.currentClusterName { + action() + } + } + + // first set the failover callback + e.shard.GetDomainCache().RegisterDomainChangeCallback( + e.shard.GetShardID(), + e.shard.GetDomainNotificationVersion(), + func() { + e.txProcessor.LockTaskProcessing() + e.timerProcessor.LockTaskProcessing() + // there no lock/unlock for crossClusterProcessor + }, + func(nextDomains []*cache.DomainCacheEntry) { + defer func() { + e.txProcessor.UnlockTaskProcessing() + e.timerProcessor.UnlockTaskProcessing() + // there no lock/unlock for crossClusterProcessor + }() + + if len(nextDomains) == 0 { + return + } + + shardNotificationVersion := e.shard.GetDomainNotificationVersion() + failoverDomainIDs := map[string]struct{}{} + + for _, nextDomain := range nextDomains { + failoverPredicate(shardNotificationVersion, nextDomain, func() { + failoverDomainIDs[nextDomain.GetInfo().ID] = struct{}{} + }) + } + + if len(failoverDomainIDs) > 0 { + e.logger.Info("Domain Failover Start.", tag.WorkflowDomainIDs(failoverDomainIDs)) + + e.txProcessor.FailoverDomain(failoverDomainIDs) + e.timerProcessor.FailoverDomain(failoverDomainIDs) + e.crossClusterProcessor.FailoverDomain(failoverDomainIDs) + + now := e.shard.GetTimeSource().Now() + // the fake tasks will not be actually used, we just need to make sure + // its length > 0 and has correct timestamp, to trigger a db scan + fakeDecisionTask := []persistence.Task{&persistence.DecisionTask{}} + fakeDecisionTimeoutTask := []persistence.Task{&persistence.DecisionTimeoutTask{TaskData: persistence.TaskData{VisibilityTimestamp: now}}} + e.txProcessor.NotifyNewTask(e.currentClusterName, &hcommon.NotifyTaskInfo{Tasks: fakeDecisionTask}) + e.timerProcessor.NotifyNewTask(e.currentClusterName, &hcommon.NotifyTaskInfo{Tasks: fakeDecisionTimeoutTask}) + } + + // handle graceful failover on active to passive + // make sure task processor failover the domain before inserting the failover marker + failoverMarkerTasks := []*persistence.FailoverMarkerTask{} + for _, nextDomain := range nextDomains { + domainFailoverNotificationVersion := nextDomain.GetFailoverNotificationVersion() + domainActiveCluster := nextDomain.GetReplicationConfig().ActiveClusterName + previousFailoverVersion := nextDomain.GetPreviousFailoverVersion() + previousClusterName, err := e.clusterMetadata.ClusterNameForFailoverVersion(previousFailoverVersion) + if err != nil { + e.logger.Error("Failed to handle graceful failover", tag.WorkflowDomainID(nextDomain.GetInfo().ID), tag.Error(err)) + continue + } + + if nextDomain.IsGlobalDomain() && + domainFailoverNotificationVersion >= shardNotificationVersion && + domainActiveCluster != e.currentClusterName && + previousFailoverVersion != common.InitialPreviousFailoverVersion && + previousClusterName == e.currentClusterName { + // the visibility timestamp will be set in shard context + failoverMarkerTasks = append(failoverMarkerTasks, &persistence.FailoverMarkerTask{ + TaskData: persistence.TaskData{ + Version: nextDomain.GetFailoverVersion(), + }, + DomainID: nextDomain.GetInfo().ID, + }) + // This is a debug metric + e.metricsClient.IncCounter(metrics.FailoverMarkerScope, 
metrics.FailoverMarkerCallbackCount) + } + } + + // This is a debug metric + e.metricsClient.IncCounter(metrics.FailoverMarkerScope, metrics.HistoryFailoverCallbackCount) + if len(failoverMarkerTasks) > 0 { + if err := e.shard.ReplicateFailoverMarkers( + context.Background(), + failoverMarkerTasks, + ); err != nil { + e.logger.Error("Failed to insert failover marker to replication queue.", tag.Error(err)) + e.metricsClient.IncCounter(metrics.FailoverMarkerScope, metrics.FailoverMarkerInsertFailure) + // fail this failover callback and it retries on next domain cache refresh + return + } + } + + //nolint:errcheck + e.shard.UpdateDomainNotificationVersion(nextDomains[len(nextDomains)-1].GetNotificationVersion() + 1) + }, + ) +} diff --git a/service/history/engine/engineimpl/remove_signal_mutable_state.go b/service/history/engine/engineimpl/remove_signal_mutable_state.go new file mode 100644 index 00000000000..def01dcfdec --- /dev/null +++ b/service/history/engine/engineimpl/remove_signal_mutable_state.go @@ -0,0 +1,59 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// RemoveSignalMutableState remove the signal request id in signal_requested for deduplicate +func (e *historyEngineImpl) RemoveSignalMutableState( + ctx context.Context, + request *types.RemoveSignalMutableStateRequest, +) error { + + domainEntry, err := e.getActiveDomainByID(request.DomainUUID) + if err != nil { + return err + } + domainID := domainEntry.GetInfo().ID + + workflowExecution := types.WorkflowExecution{ + WorkflowID: request.WorkflowExecution.WorkflowID, + RunID: request.WorkflowExecution.RunID, + } + + return workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, false, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) error { + if !mutableState.IsWorkflowExecutionRunning() { + return workflow.ErrNotExists + } + + mutableState.DeleteSignalRequested(request.GetRequestID()) + + return nil + }) +} diff --git a/service/history/engine/engineimpl/request_cancel_workflow_execution.go b/service/history/engine/engineimpl/request_cancel_workflow_execution.go new file mode 100644 index 00000000000..d9263b79f87 --- /dev/null +++ b/service/history/engine/engineimpl/request_cancel_workflow_execution.go @@ -0,0 +1,112 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// RequestCancelWorkflowExecution records request cancellation event for workflow execution +func (e *historyEngineImpl) RequestCancelWorkflowExecution( + ctx context.Context, + req *types.HistoryRequestCancelWorkflowExecutionRequest, +) error { + + domainEntry, err := e.getActiveDomainByID(req.DomainUUID) + if err != nil { + return err + } + domainID := domainEntry.GetInfo().ID + + request := req.CancelRequest + parentExecution := req.ExternalWorkflowExecution + childWorkflowOnly := req.GetChildWorkflowOnly() + workflowExecution := types.WorkflowExecution{ + WorkflowID: request.WorkflowExecution.WorkflowID, + } + // If firstExecutionRunID is set on the request always try to cancel currently running execution + if request.GetFirstExecutionRunID() == "" { + workflowExecution.RunID = request.WorkflowExecution.RunID + } + + return workflow.UpdateCurrentWithActionFunc(ctx, e.executionCache, e.executionManager, domainID, e.shard.GetDomainCache(), workflowExecution, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { + isCancelRequested, cancelRequestID := mutableState.IsCancelRequested() + if !mutableState.IsWorkflowExecutionRunning() { + _, closeStatus := mutableState.GetWorkflowStateCloseStatus() + if isCancelRequested && closeStatus == persistence.WorkflowCloseStatusCanceled { + cancelRequest := req.CancelRequest + if cancelRequest.RequestID != "" && cancelRequest.RequestID == cancelRequestID { + return &workflow.UpdateAction{Noop: true}, nil + } + } + return nil, workflow.ErrAlreadyCompleted + } + + executionInfo := mutableState.GetExecutionInfo() + if request.GetFirstExecutionRunID() != "" { + firstRunID := executionInfo.FirstExecutionRunID + if firstRunID == "" { + // This is needed for backwards compatibility. Workflow execution create with Cadence release v0.25.0 or earlier + // does not have FirstExecutionRunID stored as part of mutable state. If this is not set then load it from + // workflow execution started event. 
+ startEvent, err := mutableState.GetStartEvent(ctx) + if err != nil { + return nil, err + } + firstRunID = startEvent.GetWorkflowExecutionStartedEventAttributes().GetFirstExecutionRunID() + } + if request.GetFirstExecutionRunID() != firstRunID { + return nil, &types.EntityNotExistsError{Message: "Workflow execution not found"} + } + } + if childWorkflowOnly { + parentWorkflowID := executionInfo.ParentWorkflowID + parentRunID := executionInfo.ParentRunID + if parentExecution.GetWorkflowID() != parentWorkflowID || + parentExecution.GetRunID() != parentRunID { + return nil, workflow.ErrParentMismatch + } + } + + if isCancelRequested { + cancelRequest := req.CancelRequest + if cancelRequest.RequestID != "" && cancelRequest.RequestID == cancelRequestID { + return workflow.UpdateWithNewDecision, nil + } + // if we consider workflow cancellation idempotent, then this error is redundant + // this error maybe useful if this API is invoked by external, not decision from transfer queue + return nil, workflow.ErrCancellationAlreadyRequested + } + + if _, err := mutableState.AddWorkflowExecutionCancelRequestedEvent(req.CancelRequest.Cause, req); err != nil { + return nil, &types.InternalServiceError{Message: "Unable to cancel workflow execution."} + } + + return workflow.UpdateWithNewDecision, nil + }) +} diff --git a/service/history/engine/engineimpl/reset_queues.go b/service/history/engine/engineimpl/reset_queues.go new file mode 100644 index 00000000000..a1b9c38a68c --- /dev/null +++ b/service/history/engine/engineimpl/reset_queues.go @@ -0,0 +1,52 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
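// Editor's note: illustrative, self-contained sketch (not part of this patch) of the
// idempotency rule in RequestCancelWorkflowExecution above: a repeated cancel with the
// same request ID is treated as a duplicate and succeeds, while a second cancel with a
// different request ID is rejected. The state type and error below are simplified
// stand-ins, not Cadence types.
package main

import (
	"errors"
	"fmt"
)

var errCancellationAlreadyRequested = errors.New("cancellation already requested for this workflow execution")

type workflowExecutionState struct {
	cancelRequested bool
	cancelRequestID string
}

// requestCancel applies a cancel request, deduplicating by request ID.
func (s *workflowExecutionState) requestCancel(requestID string) error {
	if s.cancelRequested {
		if requestID != "" && requestID == s.cancelRequestID {
			// Duplicate of the original request: treated as success.
			return nil
		}
		return errCancellationAlreadyRequested
	}
	s.cancelRequested = true
	s.cancelRequestID = requestID
	return nil
}

func main() {
	state := &workflowExecutionState{}
	fmt.Println(state.requestCancel("req-1")) // <nil>: cancel recorded
	fmt.Println(state.requestCancel("req-1")) // <nil>: same request ID, no-op
	fmt.Println(state.requestCancel("req-2")) // cancellation already requested
}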
+ +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/service/history/queue" +) + +func (e *historyEngineImpl) ResetTransferQueue( + ctx context.Context, + clusterName string, +) error { + _, err := e.txProcessor.HandleAction(ctx, clusterName, queue.NewResetAction()) + return err +} + +func (e *historyEngineImpl) ResetTimerQueue( + ctx context.Context, + clusterName string, +) error { + _, err := e.timerProcessor.HandleAction(ctx, clusterName, queue.NewResetAction()) + return err +} + +func (e *historyEngineImpl) ResetCrossClusterQueue( + ctx context.Context, + clusterName string, +) error { + _, err := e.crossClusterProcessor.HandleAction(ctx, clusterName, queue.NewResetAction()) + return err +} diff --git a/service/history/engine/engineimpl/reset_sticky_tasklist.go b/service/history/engine/engineimpl/reset_sticky_tasklist.go new file mode 100644 index 00000000000..8756ef05c6f --- /dev/null +++ b/service/history/engine/engineimpl/reset_sticky_tasklist.go @@ -0,0 +1,64 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// ResetStickyTaskList reset the volatile information in mutable state of a given types. +// Volatile information are the information related to client, such as: +// 1. StickyTaskList +// 2. StickyScheduleToStartTimeout +// 3. ClientLibraryVersion +// 4. ClientFeatureVersion +// 5. 
ClientImpl +func (e *historyEngineImpl) ResetStickyTaskList( + ctx context.Context, + resetRequest *types.HistoryResetStickyTaskListRequest, +) (*types.HistoryResetStickyTaskListResponse, error) { + + if err := common.ValidateDomainUUID(resetRequest.DomainUUID); err != nil { + return nil, err + } + domainID := resetRequest.DomainUUID + + err := workflow.UpdateWithAction(ctx, e.executionCache, domainID, *resetRequest.Execution, false, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) error { + if !mutableState.IsWorkflowExecutionRunning() { + return workflow.ErrAlreadyCompleted + } + mutableState.ClearStickyness() + return nil + }, + ) + + if err != nil { + return nil, err + } + return &types.HistoryResetStickyTaskListResponse{}, nil +} diff --git a/service/history/engine/engineimpl/reset_workflow_execution.go b/service/history/engine/engineimpl/reset_workflow_execution.go new file mode 100644 index 00000000000..274e96aa5d8 --- /dev/null +++ b/service/history/engine/engineimpl/reset_workflow_execution.go @@ -0,0 +1,184 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
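
Editor's note on reset_queues.go and reset_sticky_tasklist.go above: both reset handlers follow the same shape of running a small mutation through a generic update helper and refusing to touch a workflow that has already completed. A rough, self-contained illustration of that shape is below; the state type, updateWithAction, and errAlreadyCompleted are stand-ins, not the real execution.MutableState or workflow helpers.

    package main

    import (
        "context"
        "errors"
        "fmt"
    )

    var errAlreadyCompleted = errors.New("workflow already completed")

    // state is a stand-in for the mutable state the real handler operates on.
    type state struct {
        running        bool
        stickyTaskList string
    }

    func (s *state) clearStickyness() { s.stickyTaskList = "" }

    // updateWithAction applies fn to the loaded state and would persist it on success.
    func updateWithAction(ctx context.Context, s *state, fn func(*state) error) error {
        if err := fn(s); err != nil {
            return err
        }
        // persistence of the updated state would happen here
        return nil
    }

    func main() {
        s := &state{running: true, stickyTaskList: "sticky-tl"}
        err := updateWithAction(context.Background(), s, func(ms *state) error {
            if !ms.running {
                return errAlreadyCompleted
            }
            ms.clearStickyness()
            return nil
        })
        fmt.Println(err, s.stickyTaskList) // <nil> "": sticky task list cleared
    }
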
+ +package engineimpl + +import ( + "context" + + "github.com/pborman/uuid" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" +) + +func (e *historyEngineImpl) ResetWorkflowExecution( + ctx context.Context, + resetRequest *types.HistoryResetWorkflowExecutionRequest, +) (response *types.ResetWorkflowExecutionResponse, retError error) { + + request := resetRequest.ResetRequest + domainID := resetRequest.GetDomainUUID() + workflowID := request.WorkflowExecution.GetWorkflowID() + baseRunID := request.WorkflowExecution.GetRunID() + + baseContext, baseReleaseFn, err := e.executionCache.GetOrCreateWorkflowExecution( + ctx, + domainID, + types.WorkflowExecution{ + WorkflowID: workflowID, + RunID: baseRunID, + }, + ) + if err != nil { + return nil, err + } + defer func() { baseReleaseFn(retError) }() + + baseMutableState, err := baseContext.LoadWorkflowExecution(ctx) + if err != nil { + return nil, err + } + if ok := baseMutableState.HasProcessedOrPendingDecision(); !ok { + return nil, &types.BadRequestError{ + Message: "Cannot reset workflow without a decision task schedule.", + } + } + if request.GetDecisionFinishEventID() <= common.FirstEventID || + request.GetDecisionFinishEventID() > baseMutableState.GetNextEventID() { + return nil, &types.BadRequestError{ + Message: "Decision finish ID must be > 1 && <= workflow next event ID.", + } + } + domainName, err := e.shard.GetDomainCache().GetDomainName(domainID) + if err != nil { + return nil, err + } + // also load the current run of the workflow, it can be different from the base runID + resp, err := e.executionManager.GetCurrentExecution(ctx, &persistence.GetCurrentExecutionRequest{ + DomainID: domainID, + WorkflowID: request.WorkflowExecution.GetWorkflowID(), + DomainName: domainName, + }) + if err != nil { + return nil, err + } + + currentRunID := resp.RunID + var currentContext execution.Context + var currentMutableState execution.MutableState + var currentReleaseFn execution.ReleaseFunc + if currentRunID == baseRunID { + currentContext = baseContext + currentMutableState = baseMutableState + } else { + currentContext, currentReleaseFn, err = e.executionCache.GetOrCreateWorkflowExecution( + ctx, + domainID, + types.WorkflowExecution{ + WorkflowID: workflowID, + RunID: currentRunID, + }, + ) + if err != nil { + return nil, err + } + defer func() { currentReleaseFn(retError) }() + + currentMutableState, err = currentContext.LoadWorkflowExecution(ctx) + if err != nil { + return nil, err + } + } + + // dedup by requestID + if currentMutableState.GetExecutionInfo().CreateRequestID == request.GetRequestID() { + e.logger.Info("Duplicated reset request", + tag.WorkflowID(workflowID), + tag.WorkflowRunID(currentRunID), + tag.WorkflowDomainID(domainID)) + return &types.ResetWorkflowExecutionResponse{ + RunID: currentRunID, + }, nil + } + + resetRunID := uuid.New() + baseRebuildLastEventID := request.GetDecisionFinishEventID() - 1 + baseVersionHistories := baseMutableState.GetVersionHistories() + baseCurrentBranchToken, err := baseMutableState.GetCurrentBranchToken() + if err != nil { + return nil, err + } + baseRebuildLastEventVersion := baseMutableState.GetCurrentVersion() + baseNextEventID := baseMutableState.GetNextEventID() + + if baseVersionHistories != nil { + baseCurrentVersionHistory, err := baseVersionHistories.GetCurrentVersionHistory() + if err != nil { + return nil, err + 
} + baseRebuildLastEventVersion, err = baseCurrentVersionHistory.GetEventVersion(baseRebuildLastEventID) + if err != nil { + return nil, err + } + baseCurrentBranchToken = baseCurrentVersionHistory.GetBranchToken() + } + + if err := e.workflowResetter.ResetWorkflow( + ctx, + domainID, + workflowID, + baseRunID, + baseCurrentBranchToken, + baseRebuildLastEventID, + baseRebuildLastEventVersion, + baseNextEventID, + resetRunID, + request.GetRequestID(), + execution.NewWorkflow( + ctx, + e.shard.GetClusterMetadata(), + currentContext, + currentMutableState, + currentReleaseFn, + ), + request.GetReason(), + nil, + request.GetSkipSignalReapply(), + ); err != nil { + if t, ok := persistence.AsDuplicateRequestError(err); ok { + if t.RequestType == persistence.WorkflowRequestTypeReset { + return &types.ResetWorkflowExecutionResponse{ + RunID: t.RunID, + }, nil + } + e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType)) + return nil, t + } + return nil, err + } + return &types.ResetWorkflowExecutionResponse{ + RunID: resetRunID, + }, nil +} diff --git a/service/history/engine/engineimpl/respond_activity_task_canceled.go b/service/history/engine/engineimpl/respond_activity_task_canceled.go new file mode 100644 index 00000000000..35e9d1c7a68 --- /dev/null +++ b/service/history/engine/engineimpl/respond_activity_task_canceled.go @@ -0,0 +1,121 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "time" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// RespondActivityTaskCanceled completes an activity task failure. 
+func (e *historyEngineImpl) RespondActivityTaskCanceled( + ctx context.Context, + req *types.HistoryRespondActivityTaskCanceledRequest, +) error { + + domainEntry, err := e.getActiveDomainByID(req.DomainUUID) + if err != nil { + return err + } + domainID := domainEntry.GetInfo().ID + domainName := domainEntry.GetInfo().Name + + request := req.CancelRequest + token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) + if err0 != nil { + return workflow.ErrDeserializingToken + } + + workflowExecution := types.WorkflowExecution{ + WorkflowID: token.WorkflowID, + RunID: token.RunID, + } + + var activityStartedTime time.Time + var taskList string + err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, true, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) error { + if !mutableState.IsWorkflowExecutionRunning() { + return workflow.ErrAlreadyCompleted + } + + scheduleID := token.ScheduleID + if scheduleID == common.EmptyEventID { // client call CompleteActivityById, so get scheduleID by activityID + scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) + if err0 != nil { + return err0 + } + } + ai, isRunning := mutableState.GetActivityInfo(scheduleID) + + // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in + // some extreme cassandra failure cases. + if !isRunning && scheduleID >= mutableState.GetNextEventID() { + e.metricsClient.IncCounter(metrics.HistoryRespondActivityTaskCanceledScope, metrics.StaleMutableStateCounter) + e.logger.Error("Encounter stale mutable state in RecordActivityTaskCanceled", + tag.WorkflowDomainName(domainName), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return workflow.ErrStaleState + } + + if !isRunning || ai.StartedID == common.EmptyEventID || + (token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) { + return workflow.ErrActivityTaskNotFound + } + + if _, err := mutableState.AddActivityTaskCanceledEvent( + scheduleID, + ai.StartedID, + ai.CancelRequestID, + request.Details, + request.Identity); err != nil { + // Unable to add ActivityTaskCanceled event to history + return &types.InternalServiceError{Message: "Unable to add ActivityTaskCanceled event to history."} + } + + activityStartedTime = ai.StartedTime + taskList = ai.TaskList + return nil + }) + if err == nil && !activityStartedTime.IsZero() { + scope := e.metricsClient.Scope(metrics.HistoryClientRespondActivityTaskCanceledScope). + Tagged( + metrics.DomainTag(domainName), + metrics.WorkflowTypeTag(token.WorkflowType), + metrics.ActivityTypeTag(token.ActivityType), + metrics.TaskListTag(taskList), + ) + scope.RecordTimer(metrics.ActivityE2ELatency, time.Since(activityStartedTime)) + } + return err +} diff --git a/service/history/engine/engineimpl/respond_activity_task_completed.go b/service/history/engine/engineimpl/respond_activity_task_completed.go new file mode 100644 index 00000000000..b5b6a251f11 --- /dev/null +++ b/service/history/engine/engineimpl/respond_activity_task_completed.go @@ -0,0 +1,125 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "fmt" + "time" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// RespondActivityTaskCompleted completes an activity task. +func (e *historyEngineImpl) RespondActivityTaskCompleted( + ctx context.Context, + req *types.HistoryRespondActivityTaskCompletedRequest, +) error { + + domainEntry, err := e.getActiveDomainByID(req.DomainUUID) + if err != nil { + return err + } + domainID := domainEntry.GetInfo().ID + domainName := domainEntry.GetInfo().Name + + request := req.CompleteRequest + token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) + if err0 != nil { + return workflow.ErrDeserializingToken + } + + workflowExecution := types.WorkflowExecution{ + WorkflowID: token.WorkflowID, + RunID: token.RunID, + } + + var activityStartedTime time.Time + var taskList string + err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, true, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) error { + if !mutableState.IsWorkflowExecutionRunning() { + return workflow.ErrAlreadyCompleted + } + + scheduleID := token.ScheduleID + if scheduleID == common.EmptyEventID { // client call CompleteActivityById, so get scheduleID by activityID + scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) + if err0 != nil { + return err0 + } + } + ai, isRunning := mutableState.GetActivityInfo(scheduleID) + + // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in + // some extreme cassandra failure cases. 
+ if !isRunning && scheduleID >= mutableState.GetNextEventID() { + e.metricsClient.IncCounter(metrics.HistoryRespondActivityTaskCompletedScope, metrics.StaleMutableStateCounter) + e.logger.Error("Encounter stale mutable state in RecordActivityTaskCompleted", + tag.WorkflowDomainName(domainName), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return workflow.ErrStaleState + } + + if !isRunning || ai.StartedID == common.EmptyEventID || + (token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) { + e.logger.Warn(fmt.Sprintf( + "Encounter non existing activity in RecordActivityTaskCompleted: isRunning: %t, ai: %#v, token: %#v.", + isRunning, ai, token), + tag.WorkflowDomainName(domainName), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return workflow.ErrActivityTaskNotFound + } + + if _, err := mutableState.AddActivityTaskCompletedEvent(scheduleID, ai.StartedID, request); err != nil { + // Unable to add ActivityTaskCompleted event to history + return &types.InternalServiceError{Message: "Unable to add ActivityTaskCompleted event to history."} + } + activityStartedTime = ai.StartedTime + taskList = ai.TaskList + return nil + }) + if err == nil && !activityStartedTime.IsZero() { + scope := e.metricsClient.Scope(metrics.HistoryRespondActivityTaskCompletedScope). + Tagged( + metrics.DomainTag(domainName), + metrics.WorkflowTypeTag(token.WorkflowType), + metrics.ActivityTypeTag(token.ActivityType), + metrics.TaskListTag(taskList), + ) + scope.RecordTimer(metrics.ActivityE2ELatency, time.Since(activityStartedTime)) + } + return err +} diff --git a/service/history/engine/engineimpl/respond_activity_task_failed.go b/service/history/engine/engineimpl/respond_activity_task_failed.go new file mode 100644 index 00000000000..eadde2f0f0d --- /dev/null +++ b/service/history/engine/engineimpl/respond_activity_task_failed.go @@ -0,0 +1,141 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
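
Editor's note on the RespondActivityTaskCanceled and RespondActivityTaskCompleted handlers above: both apply the same gatekeeping before writing history. They resolve the schedule ID (falling back to a lookup by activity ID when the token carries the empty event ID), treat a missing activity whose schedule ID is at or beyond the next event ID as stale cached state, and otherwise require a started activity whose attempt matches the token. A simplified sketch of that check with stand-in types follows; validateActivityToken, emptyEventID, and the error values here are illustrative, not the constants from common or workflow.

    package main

    import (
        "errors"
        "fmt"
    )

    var (
        errStaleState           = errors.New("stale mutable state, reload and retry")
        errActivityTaskNotFound = errors.New("activity task not found")
    )

    // emptyEventID is a stand-in sentinel; the real constant lives in the common package.
    const emptyEventID = int64(-23)

    type activityInfo struct {
        startedID int64
        attempt   int64
    }

    // validateActivityToken mirrors the shared checks: stale-state detection first,
    // then existence, started, and attempt matching.
    func validateActivityToken(ai *activityInfo, found bool, scheduleID, nextEventID, tokenScheduleID, tokenAttempt int64) error {
        if !found && scheduleID >= nextEventID {
            return errStaleState
        }
        if !found || ai.startedID == emptyEventID ||
            (tokenScheduleID != emptyEventID && tokenAttempt != ai.attempt) {
            return errActivityTaskNotFound
        }
        return nil
    }

    func main() {
        ai := &activityInfo{startedID: 7, attempt: 0}
        fmt.Println(validateActivityToken(ai, true, 5, 10, 5, 0))     // <nil>: token is valid
        fmt.Println(validateActivityToken(nil, false, 12, 10, 12, 0)) // stale mutable state
    }
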
+ +package engineimpl + +import ( + "context" + "fmt" + "time" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// RespondActivityTaskFailed completes an activity task failure. +func (e *historyEngineImpl) RespondActivityTaskFailed( + ctx context.Context, + req *types.HistoryRespondActivityTaskFailedRequest, +) error { + + domainEntry, err := e.getActiveDomainByID(req.DomainUUID) + if err != nil { + return err + } + domainID := domainEntry.GetInfo().ID + domainName := domainEntry.GetInfo().Name + + request := req.FailedRequest + token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) + if err0 != nil { + return workflow.ErrDeserializingToken + } + + workflowExecution := types.WorkflowExecution{ + WorkflowID: token.WorkflowID, + RunID: token.RunID, + } + + var activityStartedTime time.Time + var taskList string + err = workflow.UpdateWithActionFunc( + ctx, + e.executionCache, + domainID, + workflowExecution, + e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { + if !mutableState.IsWorkflowExecutionRunning() { + return nil, workflow.ErrAlreadyCompleted + } + + scheduleID := token.ScheduleID + if scheduleID == common.EmptyEventID { // client call CompleteActivityById, so get scheduleID by activityID + scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) + if err0 != nil { + return nil, err0 + } + } + ai, isRunning := mutableState.GetActivityInfo(scheduleID) + + // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in + // some extreme cassandra failure cases. 
+ if !isRunning && scheduleID >= mutableState.GetNextEventID() { + e.metricsClient.IncCounter(metrics.HistoryRespondActivityTaskFailedScope, metrics.StaleMutableStateCounter) + e.logger.Error("Encounter stale mutable state in RecordActivityTaskFailed", + tag.WorkflowDomainName(domainName), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return nil, workflow.ErrStaleState + } + + if !isRunning || ai.StartedID == common.EmptyEventID || + (token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) { + e.logger.Warn(fmt.Sprintf( + "Encounter non existing activity in RecordActivityTaskFailed: isRunning: %t, ai: %#v, token: %#v.", + isRunning, ai, token), + tag.WorkflowDomainName(domainName), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return nil, workflow.ErrActivityTaskNotFound + } + + postActions := &workflow.UpdateAction{} + ok, err := mutableState.RetryActivity(ai, req.FailedRequest.GetReason(), req.FailedRequest.GetDetails()) + if err != nil { + return nil, err + } + if !ok { + // no more retry, and we want to record the failure event + if _, err := mutableState.AddActivityTaskFailedEvent(scheduleID, ai.StartedID, request); err != nil { + // Unable to add ActivityTaskFailed event to history + return nil, &types.InternalServiceError{Message: "Unable to add ActivityTaskFailed event to history."} + } + postActions.CreateDecision = true + } + + activityStartedTime = ai.StartedTime + taskList = ai.TaskList + return postActions, nil + }, + ) + if err == nil && !activityStartedTime.IsZero() { + scope := e.metricsClient.Scope(metrics.HistoryRespondActivityTaskFailedScope). + Tagged( + metrics.DomainTag(domainName), + metrics.WorkflowTypeTag(token.WorkflowType), + metrics.ActivityTypeTag(token.ActivityType), + metrics.TaskListTag(taskList), + ) + scope.RecordTimer(metrics.ActivityE2ELatency, time.Since(activityStartedTime)) + } + return err +} diff --git a/service/history/engine/engineimpl/respond_activity_task_heartbeat.go b/service/history/engine/engineimpl/respond_activity_task_heartbeat.go new file mode 100644 index 00000000000..1cf84789a64 --- /dev/null +++ b/service/history/engine/engineimpl/respond_activity_task_heartbeat.go @@ -0,0 +1,124 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "fmt" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// RecordActivityTaskHeartbeat records an heartbeat for a task. +// This method can be used for two purposes. +// - For reporting liveness of the activity. +// - For reporting progress of the activity, this can be done even if the liveness is not configured. +func (e *historyEngineImpl) RecordActivityTaskHeartbeat( + ctx context.Context, + req *types.HistoryRecordActivityTaskHeartbeatRequest, +) (*types.RecordActivityTaskHeartbeatResponse, error) { + + domainEntry, err := e.getActiveDomainByID(req.DomainUUID) + if err != nil { + return nil, err + } + domainID := domainEntry.GetInfo().ID + + request := req.HeartbeatRequest + token, err0 := e.tokenSerializer.Deserialize(request.TaskToken) + if err0 != nil { + return nil, workflow.ErrDeserializingToken + } + + workflowExecution := types.WorkflowExecution{ + WorkflowID: token.WorkflowID, + RunID: token.RunID, + } + + var cancelRequested bool + err = workflow.UpdateWithAction(ctx, e.executionCache, domainID, workflowExecution, false, e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) error { + if !mutableState.IsWorkflowExecutionRunning() { + e.logger.Debug("Heartbeat failed") + return workflow.ErrAlreadyCompleted + } + + scheduleID := token.ScheduleID + if scheduleID == common.EmptyEventID { // client call RecordActivityHeartbeatByID, so get scheduleID by activityID + scheduleID, err0 = getScheduleID(token.ActivityID, mutableState) + if err0 != nil { + return err0 + } + } + ai, isRunning := mutableState.GetActivityInfo(scheduleID) + + // First check to see if cache needs to be refreshed as we could potentially have stale workflow execution in + // some extreme cassandra failure cases. 
+ if !isRunning && scheduleID >= mutableState.GetNextEventID() { + e.metricsClient.IncCounter(metrics.HistoryRecordActivityTaskHeartbeatScope, metrics.StaleMutableStateCounter) + e.logger.Error("Encounter stale mutable state in RecordActivityTaskHeartbeat", + tag.WorkflowDomainName(domainEntry.GetInfo().Name), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + return workflow.ErrStaleState + } + + if !isRunning || ai.StartedID == common.EmptyEventID || + (token.ScheduleID != common.EmptyEventID && token.ScheduleAttempt != int64(ai.Attempt)) { + e.logger.Warn(fmt.Sprintf( + "Encounter non existing activity in RecordActivityTaskHeartbeat: isRunning: %t, ai: %#v, token: %#v.", + isRunning, ai, token), + tag.WorkflowDomainName(domainEntry.GetInfo().Name), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowScheduleID(scheduleID), + tag.WorkflowNextEventID(mutableState.GetNextEventID()), + ) + + return workflow.ErrActivityTaskNotFound + } + + cancelRequested = ai.CancelRequested + + e.logger.Debug(fmt.Sprintf("Activity HeartBeat: scheduleEventID: %v, ActivityInfo: %+v, CancelRequested: %v", + scheduleID, ai, cancelRequested)) + + // Save progress and last HB reported time. + mutableState.UpdateActivityProgress(ai, request) + + return nil + }) + + if err != nil { + return &types.RecordActivityTaskHeartbeatResponse{}, err + } + + return &types.RecordActivityTaskHeartbeatResponse{CancelRequested: cancelRequested}, nil +} diff --git a/service/history/engine/engineimpl/respond_decision_task_completed.go b/service/history/engine/engineimpl/respond_decision_task_completed.go new file mode 100644 index 00000000000..a311008da95 --- /dev/null +++ b/service/history/engine/engineimpl/respond_decision_task_completed.go @@ -0,0 +1,33 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
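
Editor's note on the heartbeat handler above: RecordActivityTaskHeartbeat saves progress via UpdateActivityProgress and hands back only a CancelRequested flag, so the caller is expected to poll and stop the activity when that flag turns true. A rough caller-side loop is sketched below; the heartbeater interface, heartbeatLoop, and fakeClient are stand-ins for illustration, not the generated history client.

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // heartbeater is a stand-in for whatever client exposes the heartbeat RPC.
    type heartbeater interface {
        RecordActivityTaskHeartbeat(ctx context.Context, taskToken, details []byte) (cancelRequested bool, err error)
    }

    // heartbeatLoop reports liveness and progress on an interval and stops once the
    // server signals that cancellation was requested.
    func heartbeatLoop(ctx context.Context, hb heartbeater, taskToken []byte, interval time.Duration, progress func() []byte) error {
        ticker := time.NewTicker(interval)
        defer ticker.Stop()
        for {
            select {
            case <-ctx.Done():
                return ctx.Err()
            case <-ticker.C:
                cancelRequested, err := hb.RecordActivityTaskHeartbeat(ctx, taskToken, progress())
                if err != nil {
                    return err
                }
                if cancelRequested {
                    fmt.Println("cancellation requested; activity should clean up and report canceled")
                    return nil
                }
            }
        }
    }

    type fakeClient struct{ calls int }

    func (f *fakeClient) RecordActivityTaskHeartbeat(ctx context.Context, taskToken, details []byte) (bool, error) {
        f.calls++
        return f.calls >= 3, nil // pretend the third heartbeat observes a cancel request
    }

    func main() {
        err := heartbeatLoop(context.Background(), &fakeClient{}, []byte("token"), 10*time.Millisecond, func() []byte { return []byte("progress") })
        fmt.Println("loop exited:", err)
    }
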
+ +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/types" +) + +// RespondDecisionTaskCompleted completes a decision task +func (e *historyEngineImpl) RespondDecisionTaskCompleted(ctx context.Context, req *types.HistoryRespondDecisionTaskCompletedRequest) (*types.HistoryRespondDecisionTaskCompletedResponse, error) { + return e.decisionHandler.HandleDecisionTaskCompleted(ctx, req) +} diff --git a/service/history/engine/engineimpl/respond_decision_task_failed.go b/service/history/engine/engineimpl/respond_decision_task_failed.go new file mode 100644 index 00000000000..73f05a40a4a --- /dev/null +++ b/service/history/engine/engineimpl/respond_decision_task_failed.go @@ -0,0 +1,33 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/types" +) + +// RespondDecisionTaskFailed fails a decision +func (e *historyEngineImpl) RespondDecisionTaskFailed(ctx context.Context, req *types.HistoryRespondDecisionTaskFailedRequest) error { + return e.decisionHandler.HandleDecisionTaskFailed(ctx, req) +} diff --git a/service/history/engine/engineimpl/signal_workflow_execution.go b/service/history/engine/engineimpl/signal_workflow_execution.go new file mode 100644 index 00000000000..f7363ee7727 --- /dev/null +++ b/service/history/engine/engineimpl/signal_workflow_execution.go @@ -0,0 +1,130 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +func (e *historyEngineImpl) SignalWorkflowExecution( + ctx context.Context, + signalRequest *types.HistorySignalWorkflowExecutionRequest, +) error { + + domainEntry, err := e.getActiveDomainByID(signalRequest.DomainUUID) + if err != nil { + return err + } + if domainEntry.GetInfo().Status != persistence.DomainStatusRegistered { + return errDomainDeprecated + } + domainID := domainEntry.GetInfo().ID + request := signalRequest.SignalRequest + parentExecution := signalRequest.ExternalWorkflowExecution + childWorkflowOnly := signalRequest.GetChildWorkflowOnly() + workflowExecution := types.WorkflowExecution{ + WorkflowID: request.WorkflowExecution.WorkflowID, + RunID: request.WorkflowExecution.RunID, + } + + return workflow.UpdateCurrentWithActionFunc( + ctx, + e.executionCache, + e.executionManager, + domainID, + e.shard.GetDomainCache(), + workflowExecution, + e.timeSource.Now(), + func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) { + // first deduplicate by request id for signal decision + // this is done before workflow running check so that already completed error + // won't be returned for duplicated signals even if the workflow is closed. 
+ if requestID := request.GetRequestID(); requestID != "" { + if mutableState.IsSignalRequested(requestID) { + return &workflow.UpdateAction{ + Noop: true, + CreateDecision: false, + }, nil + } + } + + if !mutableState.IsWorkflowExecutionRunning() { + return nil, workflow.ErrAlreadyCompleted + } + + // If history is corrupted, signal will be rejected + if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { + return nil, err + } else if corrupted { + return nil, &types.EntityNotExistsError{Message: "Workflow execution corrupted."} + } + + executionInfo := mutableState.GetExecutionInfo() + createDecisionTask := true + // Do not create decision task when the workflow is cron and the cron has not been started yet + if mutableState.GetExecutionInfo().CronSchedule != "" && !mutableState.HasProcessedOrPendingDecision() { + createDecisionTask = false + } + + maxAllowedSignals := e.config.MaximumSignalsPerExecution(domainEntry.GetInfo().Name) + if maxAllowedSignals > 0 && int(executionInfo.SignalCount) >= maxAllowedSignals { + e.logger.Info("Execution limit reached for maximum signals", tag.WorkflowSignalCount(executionInfo.SignalCount), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowDomainID(domainID)) + return nil, workflow.ErrSignalsLimitExceeded + } + + if childWorkflowOnly { + parentWorkflowID := executionInfo.ParentWorkflowID + parentRunID := executionInfo.ParentRunID + if parentExecution.GetWorkflowID() != parentWorkflowID || + parentExecution.GetRunID() != parentRunID { + return nil, workflow.ErrParentMismatch + } + } + + if requestID := request.GetRequestID(); requestID != "" { + mutableState.AddSignalRequested(requestID) + } + + if _, err := mutableState.AddWorkflowExecutionSignaled( + request.GetSignalName(), + request.GetInput(), + request.GetIdentity(), + request.GetRequestID(), + ); err != nil { + return nil, &types.InternalServiceError{Message: "Unable to signal workflow execution."} + } + + return &workflow.UpdateAction{ + Noop: false, + CreateDecision: createDecisionTask, + }, nil + }) +} diff --git a/service/history/engine/engineimpl/start_workflow_execution.go b/service/history/engine/engineimpl/start_workflow_execution.go new file mode 100644 index 00000000000..06e58209fe9 --- /dev/null +++ b/service/history/engine/engineimpl/start_workflow_execution.go @@ -0,0 +1,847 @@ +// Copyright (c) 2017-2021 Uber Technologies, Inc. +// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +package engineimpl + +import ( + "context" + "fmt" + "time" + + "github.com/pborman/uuid" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/cache" + "github.com/uber/cadence/common/log/tag" + "github.com/uber/cadence/common/metrics" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/execution" + "github.com/uber/cadence/service/history/workflow" +) + +// for startWorkflowHelper be reused by signalWithStart +type signalWithStartArg struct { + signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest + prevMutableState execution.MutableState +} + +// StartWorkflowExecution starts a workflow execution +func (e *historyEngineImpl) StartWorkflowExecution( + ctx context.Context, + startRequest *types.HistoryStartWorkflowExecutionRequest, +) (resp *types.StartWorkflowExecutionResponse, retError error) { + + domainEntry, err := e.getActiveDomainByID(startRequest.DomainUUID) + if err != nil { + return nil, err + } + + return e.startWorkflowHelper( + ctx, + startRequest, + domainEntry, + metrics.HistoryStartWorkflowExecutionScope, + nil) +} + +func (e *historyEngineImpl) startWorkflowHelper( + ctx context.Context, + startRequest *types.HistoryStartWorkflowExecutionRequest, + domainEntry *cache.DomainCacheEntry, + metricsScope int, + signalWithStartArg *signalWithStartArg, +) (resp *types.StartWorkflowExecutionResponse, retError error) { + + if domainEntry.GetInfo().Status != persistence.DomainStatusRegistered { + return nil, errDomainDeprecated + } + + request := startRequest.StartRequest + err := e.validateStartWorkflowExecutionRequest(request, metricsScope) + if err != nil { + return nil, err + } + e.overrideStartWorkflowExecutionRequest(domainEntry, request, metricsScope) + + workflowID := request.GetWorkflowID() + domainID := domainEntry.GetInfo().ID + domain := domainEntry.GetInfo().Name + + // grab the current context as a lock, nothing more + // use a smaller context timeout to get the lock + childCtx, childCancel := e.newChildContext(ctx) + defer childCancel() + + _, currentRelease, err := e.executionCache.GetOrCreateCurrentWorkflowExecution( + childCtx, + domainID, + workflowID, + ) + if err != nil { + if err == context.DeadlineExceeded { + return nil, workflow.ErrConcurrentStartRequest + } + return nil, err + } + defer func() { currentRelease(retError) }() + + workflowExecution := types.WorkflowExecution{ + WorkflowID: workflowID, + RunID: uuid.New(), + } + curMutableState, err := e.createMutableState(domainEntry, workflowExecution.GetRunID()) + if err != nil { + return nil, err + } + + // preprocess for signalWithStart + var prevMutableState execution.MutableState + var signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest + isSignalWithStart := signalWithStartArg != nil + if isSignalWithStart { + prevMutableState = signalWithStartArg.prevMutableState + signalWithStartRequest = signalWithStartArg.signalWithStartRequest + } + if prevMutableState != nil { + prevLastWriteVersion, err := prevMutableState.GetLastWriteVersion() + if err != nil { + return nil, err + } + if prevLastWriteVersion > curMutableState.GetCurrentVersion() { + return nil, e.newDomainNotActiveError( + 
domainEntry.GetInfo().Name, + prevLastWriteVersion, + ) + } + err = e.applyWorkflowIDReusePolicyForSigWithStart( + prevMutableState.GetExecutionInfo(), + workflowExecution, + request.GetWorkflowIDReusePolicy(), + ) + if err != nil { + return nil, err + } + } else if e.shard.GetConfig().EnableRecordWorkflowExecutionUninitialized(domainEntry.GetInfo().Name) && e.visibilityMgr != nil { + uninitializedRequest := &persistence.RecordWorkflowExecutionUninitializedRequest{ + DomainUUID: domainID, + Domain: domain, + Execution: types.WorkflowExecution{ + WorkflowID: workflowID, + RunID: workflowExecution.RunID, + }, + WorkflowTypeName: request.WorkflowType.Name, + UpdateTimestamp: e.shard.GetTimeSource().Now().UnixNano(), + ShardID: int64(e.shard.GetShardID()), + } + + if err := e.visibilityMgr.RecordWorkflowExecutionUninitialized(ctx, uninitializedRequest); err != nil { + e.logger.Error("Failed to record uninitialized workflow execution", tag.Error(err)) + } + } + + err = e.addStartEventsAndTasks( + curMutableState, + workflowExecution, + startRequest, + signalWithStartRequest, + ) + if err != nil { + if e.shard.GetConfig().EnableRecordWorkflowExecutionUninitialized(domainEntry.GetInfo().Name) && e.visibilityMgr != nil { + // delete the uninitialized workflow execution record since it failed to start the workflow + // uninitialized record is used to find wfs that didn't make a progress or stuck during the start process + if errVisibility := e.visibilityMgr.DeleteWorkflowExecution(ctx, &persistence.VisibilityDeleteWorkflowExecutionRequest{ + DomainID: domainID, + Domain: domain, + RunID: workflowExecution.RunID, + WorkflowID: workflowID, + }); errVisibility != nil { + e.logger.Error("Failed to delete uninitialized workflow execution record", tag.Error(errVisibility)) + } + } + + return nil, err + } + wfContext := execution.NewContext(domainID, workflowExecution, e.shard, e.executionManager, e.logger) + + newWorkflow, newWorkflowEventsSeq, err := curMutableState.CloseTransactionAsSnapshot( + e.timeSource.Now(), + execution.TransactionPolicyActive, + ) + if err != nil { + return nil, err + } + historyBlob, err := wfContext.PersistStartWorkflowBatchEvents(ctx, newWorkflowEventsSeq[0]) + if err != nil { + return nil, err + } + + // create as brand new + createMode := persistence.CreateWorkflowModeBrandNew + prevRunID := "" + prevLastWriteVersion := int64(0) + // overwrite in case of signalWithStart + if prevMutableState != nil { + createMode = persistence.CreateWorkflowModeWorkflowIDReuse + info := prevMutableState.GetExecutionInfo() + // For corrupted workflows use ContinueAsNew mode. + // WorkflowIDReuse mode require workflows to be in completed state, which is not necessarily true for corrupted workflows. 
+ if info.State == persistence.WorkflowStateCorrupted { + createMode = persistence.CreateWorkflowModeContinueAsNew + } + prevRunID = info.RunID + prevLastWriteVersion, err = prevMutableState.GetLastWriteVersion() + if err != nil { + return nil, err + } + } + err = wfContext.CreateWorkflowExecution( + ctx, + newWorkflow, + historyBlob, + createMode, + prevRunID, + prevLastWriteVersion, + persistence.CreateWorkflowRequestModeNew, + ) + if t, ok := persistence.AsDuplicateRequestError(err); ok { + if t.RequestType == persistence.WorkflowRequestTypeStart || (isSignalWithStart && t.RequestType == persistence.WorkflowRequestTypeSignal) { + return &types.StartWorkflowExecutionResponse{ + RunID: t.RunID, + }, nil + } + e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType)) + return nil, t + } + // handle already started error + if t, ok := err.(*persistence.WorkflowExecutionAlreadyStartedError); ok { + + if t.StartRequestID == request.GetRequestID() { + return &types.StartWorkflowExecutionResponse{ + RunID: t.RunID, + }, nil + } + + if isSignalWithStart { + return nil, err + } + + if curMutableState.GetCurrentVersion() < t.LastWriteVersion { + return nil, e.newDomainNotActiveError( + domainEntry.GetInfo().Name, + t.LastWriteVersion, + ) + } + + prevRunID = t.RunID + if shouldTerminateAndStart(startRequest, t.State) { + runningWFCtx, err := workflow.LoadOnce(ctx, e.executionCache, domainID, workflowID, prevRunID) + if err != nil { + return nil, err + } + defer func() { runningWFCtx.GetReleaseFn()(retError) }() + + resp, err = e.terminateAndStartWorkflow( + ctx, + runningWFCtx, + workflowExecution, + domainEntry, + domainID, + startRequest, + nil, + ) + switch err.(type) { + // By the time we try to terminate the workflow, it was already terminated + // So continue as if we didn't need to terminate it in the first place + case *types.WorkflowExecutionAlreadyCompletedError: + e.shard.GetLogger().Warn("Workflow completed while trying to terminate, will continue starting workflow", tag.Error(err)) + default: + return resp, err + } + } + if err = e.applyWorkflowIDReusePolicyHelper( + t.StartRequestID, + prevRunID, + t.State, + t.CloseStatus, + workflowExecution, + startRequest.StartRequest.GetWorkflowIDReusePolicy(), + ); err != nil { + return nil, err + } + // create as ID reuse + createMode = persistence.CreateWorkflowModeWorkflowIDReuse + err = wfContext.CreateWorkflowExecution( + ctx, + newWorkflow, + historyBlob, + createMode, + prevRunID, + t.LastWriteVersion, + persistence.CreateWorkflowRequestModeNew, + ) + if t, ok := persistence.AsDuplicateRequestError(err); ok { + if t.RequestType == persistence.WorkflowRequestTypeStart || (isSignalWithStart && t.RequestType == persistence.WorkflowRequestTypeSignal) { + return &types.StartWorkflowExecutionResponse{ + RunID: t.RunID, + }, nil + } + e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType)) + return nil, t + } + } + if err != nil { + return nil, err + } + + return &types.StartWorkflowExecutionResponse{ + RunID: workflowExecution.RunID, + }, nil +} + +func (e *historyEngineImpl) SignalWithStartWorkflowExecution( + ctx context.Context, + signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest, +) (retResp *types.StartWorkflowExecutionResponse, retError error) { + + domainEntry, err := e.getActiveDomainByID(signalWithStartRequest.DomainUUID) + if err != nil { + return nil, err + } + if domainEntry.GetInfo().Status != 
persistence.DomainStatusRegistered { + return nil, errDomainDeprecated + } + domainID := domainEntry.GetInfo().ID + + sRequest := signalWithStartRequest.SignalWithStartRequest + workflowExecution := types.WorkflowExecution{ + WorkflowID: sRequest.WorkflowID, + } + + var prevMutableState execution.MutableState + attempt := 0 + + wfContext, release, err0 := e.executionCache.GetOrCreateWorkflowExecution(ctx, domainID, workflowExecution) + + if err0 == nil { + defer func() { release(retError) }() + Just_Signal_Loop: + for ; attempt < workflow.ConditionalRetryCount; attempt++ { + // workflow not exist, will create workflow then signal + mutableState, err1 := wfContext.LoadWorkflowExecution(ctx) + if err1 != nil { + if _, ok := err1.(*types.EntityNotExistsError); ok { + break + } + return nil, err1 + } + + if mutableState.IsSignalRequested(sRequest.GetRequestID()) { + return &types.StartWorkflowExecutionResponse{RunID: wfContext.GetExecution().RunID}, nil + } + + // workflow exist but not running, will restart workflow then signal + if !mutableState.IsWorkflowExecutionRunning() { + prevMutableState = mutableState + break + } + + // workflow exists but history is corrupted, will restart workflow then signal + if corrupted, err := e.checkForHistoryCorruptions(ctx, mutableState); err != nil { + return nil, err + } else if corrupted { + prevMutableState = mutableState + break + } + + // workflow is running, if policy is TerminateIfRunning, terminate current run then signalWithStart + if sRequest.GetWorkflowIDReusePolicy() == types.WorkflowIDReusePolicyTerminateIfRunning { + workflowExecution.RunID = uuid.New() + runningWFCtx := workflow.NewContext(wfContext, release, mutableState) + resp, errTerm := e.terminateAndStartWorkflow( + ctx, + runningWFCtx, + workflowExecution, + domainEntry, + domainID, + nil, + signalWithStartRequest, + ) + // By the time we try to terminate the workflow, it was already terminated + // So continue as if we didn't need to terminate it in the first place + if _, ok := errTerm.(*types.WorkflowExecutionAlreadyCompletedError); !ok { + return resp, errTerm + } + } + + executionInfo := mutableState.GetExecutionInfo() + maxAllowedSignals := e.config.MaximumSignalsPerExecution(domainEntry.GetInfo().Name) + if maxAllowedSignals > 0 && int(executionInfo.SignalCount) >= maxAllowedSignals { + e.logger.Info("Execution limit reached for maximum signals", tag.WorkflowSignalCount(executionInfo.SignalCount), + tag.WorkflowID(workflowExecution.GetWorkflowID()), + tag.WorkflowRunID(workflowExecution.GetRunID()), + tag.WorkflowDomainID(domainID)) + return nil, workflow.ErrSignalsLimitExceeded + } + + requestID := sRequest.GetRequestID() + if requestID != "" { + mutableState.AddSignalRequested(requestID) + } + + if _, err := mutableState.AddWorkflowExecutionSignaled( + sRequest.GetSignalName(), + sRequest.GetSignalInput(), + sRequest.GetIdentity(), + sRequest.GetRequestID(), + ); err != nil { + return nil, &types.InternalServiceError{Message: "Unable to signal workflow execution."} + } + + // Create a transfer task to schedule a decision task + if !mutableState.HasPendingDecision() { + _, err := mutableState.AddDecisionTaskScheduledEvent(false) + if err != nil { + return nil, &types.InternalServiceError{Message: "Failed to add decision scheduled event."} + } + } + + // We apply the update to execution using optimistic concurrency. If it fails due to a conflict then reload + // the history and try the operation again. 
+ if err := wfContext.UpdateWorkflowExecutionAsActive(ctx, e.shard.GetTimeSource().Now()); err != nil { + if t, ok := persistence.AsDuplicateRequestError(err); ok { + if t.RequestType == persistence.WorkflowRequestTypeSignal { + return &types.StartWorkflowExecutionResponse{RunID: t.RunID}, nil + } + e.logger.Error("A bug is detected for idempotency improvement", tag.Dynamic("request-type", t.RequestType)) + return nil, t + } + if execution.IsConflictError(err) { + continue Just_Signal_Loop + } + return nil, err + } + return &types.StartWorkflowExecutionResponse{RunID: wfContext.GetExecution().RunID}, nil + } // end for Just_Signal_Loop + if attempt == workflow.ConditionalRetryCount { + return nil, workflow.ErrMaxAttemptsExceeded + } + } else { + if _, ok := err0.(*types.EntityNotExistsError); !ok { + return nil, err0 + } + // workflow not exist, will create workflow then signal + } + + // Start workflow and signal + startRequest, err := getStartRequest(domainID, sRequest, signalWithStartRequest.PartitionConfig) + if err != nil { + return nil, err + } + + sigWithStartArg := &signalWithStartArg{ + signalWithStartRequest: signalWithStartRequest, + prevMutableState: prevMutableState, + } + return e.startWorkflowHelper( + ctx, + startRequest, + domainEntry, + metrics.HistorySignalWithStartWorkflowExecutionScope, + sigWithStartArg, + ) +} + +func getStartRequest( + domainID string, + request *types.SignalWithStartWorkflowExecutionRequest, + partitionConfig map[string]string, +) (*types.HistoryStartWorkflowExecutionRequest, error) { + + req := &types.StartWorkflowExecutionRequest{ + Domain: request.Domain, + WorkflowID: request.WorkflowID, + WorkflowType: request.WorkflowType, + TaskList: request.TaskList, + Input: request.Input, + ExecutionStartToCloseTimeoutSeconds: request.ExecutionStartToCloseTimeoutSeconds, + TaskStartToCloseTimeoutSeconds: request.TaskStartToCloseTimeoutSeconds, + Identity: request.Identity, + RequestID: request.RequestID, + WorkflowIDReusePolicy: request.WorkflowIDReusePolicy, + RetryPolicy: request.RetryPolicy, + CronSchedule: request.CronSchedule, + Memo: request.Memo, + SearchAttributes: request.SearchAttributes, + Header: request.Header, + DelayStartSeconds: request.DelayStartSeconds, + JitterStartSeconds: request.JitterStartSeconds, + } + + return common.CreateHistoryStartWorkflowRequest(domainID, req, time.Now(), partitionConfig) +} + +func shouldTerminateAndStart(startRequest *types.HistoryStartWorkflowExecutionRequest, state int) bool { + return startRequest.StartRequest.GetWorkflowIDReusePolicy() == types.WorkflowIDReusePolicyTerminateIfRunning && + (state == persistence.WorkflowStateRunning || state == persistence.WorkflowStateCreated) +} + +func (e *historyEngineImpl) validateStartWorkflowExecutionRequest(request *types.StartWorkflowExecutionRequest, metricsScope int) error { + if len(request.GetRequestID()) == 0 { + return &types.BadRequestError{Message: "Missing request ID."} + } + if request.ExecutionStartToCloseTimeoutSeconds == nil || request.GetExecutionStartToCloseTimeoutSeconds() <= 0 { + return &types.BadRequestError{Message: "Missing or invalid ExecutionStartToCloseTimeoutSeconds."} + } + if request.TaskStartToCloseTimeoutSeconds == nil || request.GetTaskStartToCloseTimeoutSeconds() <= 0 { + return &types.BadRequestError{Message: "Missing or invalid TaskStartToCloseTimeoutSeconds."} + } + if request.TaskList == nil || request.TaskList.GetName() == "" { + return &types.BadRequestError{Message: "Missing Tasklist."} + } + if request.WorkflowType == nil || 
request.WorkflowType.GetName() == "" { + return &types.BadRequestError{Message: "Missing WorkflowType."} + } + + if !common.IsValidIDLength( + request.GetDomain(), + e.metricsClient.Scope(metricsScope), + e.config.MaxIDLengthWarnLimit(), + e.config.DomainNameMaxLength(request.GetDomain()), + metrics.CadenceErrDomainNameExceededWarnLimit, + request.GetDomain(), + e.logger, + tag.IDTypeDomainName) { + return &types.BadRequestError{Message: "Domain exceeds length limit."} + } + + if !common.IsValidIDLength( + request.GetWorkflowID(), + e.metricsClient.Scope(metricsScope), + e.config.MaxIDLengthWarnLimit(), + e.config.WorkflowIDMaxLength(request.GetDomain()), + metrics.CadenceErrWorkflowIDExceededWarnLimit, + request.GetDomain(), + e.logger, + tag.IDTypeWorkflowID) { + return &types.BadRequestError{Message: "WorkflowId exceeds length limit."} + } + if !common.IsValidIDLength( + request.TaskList.GetName(), + e.metricsClient.Scope(metricsScope), + e.config.MaxIDLengthWarnLimit(), + e.config.TaskListNameMaxLength(request.GetDomain()), + metrics.CadenceErrTaskListNameExceededWarnLimit, + request.GetDomain(), + e.logger, + tag.IDTypeTaskListName) { + return &types.BadRequestError{Message: "TaskList exceeds length limit."} + } + if !common.IsValidIDLength( + request.WorkflowType.GetName(), + e.metricsClient.Scope(metricsScope), + e.config.MaxIDLengthWarnLimit(), + e.config.WorkflowTypeMaxLength(request.GetDomain()), + metrics.CadenceErrWorkflowTypeExceededWarnLimit, + request.GetDomain(), + e.logger, + tag.IDTypeWorkflowType) { + return &types.BadRequestError{Message: "WorkflowType exceeds length limit."} + } + + return common.ValidateRetryPolicy(request.RetryPolicy) +} + +func (e *historyEngineImpl) overrideStartWorkflowExecutionRequest( + domainEntry *cache.DomainCacheEntry, + request *types.StartWorkflowExecutionRequest, + metricsScope int, +) { + domainName := domainEntry.GetInfo().Name + maxDecisionStartToCloseTimeoutSeconds := int32(e.config.MaxDecisionStartToCloseSeconds(domainName)) + + taskStartToCloseTimeoutSecs := request.GetTaskStartToCloseTimeoutSeconds() + taskStartToCloseTimeoutSecs = common.MinInt32(taskStartToCloseTimeoutSecs, maxDecisionStartToCloseTimeoutSeconds) + taskStartToCloseTimeoutSecs = common.MinInt32(taskStartToCloseTimeoutSecs, request.GetExecutionStartToCloseTimeoutSeconds()) + + if taskStartToCloseTimeoutSecs != request.GetTaskStartToCloseTimeoutSeconds() { + request.TaskStartToCloseTimeoutSeconds = &taskStartToCloseTimeoutSecs + e.metricsClient.Scope( + metricsScope, + metrics.DomainTag(domainName), + ).IncCounter(metrics.DecisionStartToCloseTimeoutOverrideCount) + } +} + +// terminate running workflow then start a new run in one transaction +func (e *historyEngineImpl) terminateAndStartWorkflow( + ctx context.Context, + runningWFCtx workflow.Context, + workflowExecution types.WorkflowExecution, + domainEntry *cache.DomainCacheEntry, + domainID string, + startRequest *types.HistoryStartWorkflowExecutionRequest, + signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest, +) (*types.StartWorkflowExecutionResponse, error) { + runningMutableState := runningWFCtx.GetMutableState() +UpdateWorkflowLoop: + for attempt := 0; attempt < workflow.ConditionalRetryCount; attempt++ { + if !runningMutableState.IsWorkflowExecutionRunning() { + return nil, workflow.ErrAlreadyCompleted + } + + if err := execution.TerminateWorkflow( + runningMutableState, + runningMutableState.GetNextEventID(), + TerminateIfRunningReason, + 
getTerminateIfRunningDetails(workflowExecution.GetRunID()), + execution.IdentityHistoryService, + ); err != nil { + if err == workflow.ErrStaleState { + // Handler detected that cached workflow mutable could potentially be stale + // Reload workflow execution history + runningWFCtx.GetContext().Clear() + if attempt != workflow.ConditionalRetryCount-1 { + _, err = runningWFCtx.ReloadMutableState(ctx) + if err != nil { + return nil, err + } + } + continue UpdateWorkflowLoop + } + return nil, err + } + + // new mutable state + newMutableState, err := e.createMutableState(domainEntry, workflowExecution.GetRunID()) + if err != nil { + return nil, err + } + + if signalWithStartRequest != nil { + startRequest, err = getStartRequest(domainID, signalWithStartRequest.SignalWithStartRequest, signalWithStartRequest.PartitionConfig) + if err != nil { + return nil, err + } + } + + err = e.addStartEventsAndTasks( + newMutableState, + workflowExecution, + startRequest, + signalWithStartRequest, + ) + if err != nil { + return nil, err + } + + updateErr := runningWFCtx.GetContext().UpdateWorkflowExecutionWithNewAsActive( + ctx, + e.timeSource.Now(), + execution.NewContext( + domainID, + workflowExecution, + e.shard, + e.shard.GetExecutionManager(), + e.logger, + ), + newMutableState, + ) + if updateErr != nil { + if execution.IsConflictError(updateErr) { + e.metricsClient.IncCounter(metrics.HistoryStartWorkflowExecutionScope, metrics.ConcurrencyUpdateFailureCounter) + continue UpdateWorkflowLoop + } + return nil, updateErr + } + break UpdateWorkflowLoop + } + return &types.StartWorkflowExecutionResponse{ + RunID: workflowExecution.RunID, + }, nil +} + +func (e *historyEngineImpl) addStartEventsAndTasks( + mutableState execution.MutableState, + workflowExecution types.WorkflowExecution, + startRequest *types.HistoryStartWorkflowExecutionRequest, + signalWithStartRequest *types.HistorySignalWithStartWorkflowExecutionRequest, +) error { + // Add WF start event + startEvent, err := mutableState.AddWorkflowExecutionStartedEvent( + workflowExecution, + startRequest, + ) + if err != nil { + return &types.InternalServiceError{ + Message: "Failed to add workflow execution started event.", + } + } + + if signalWithStartRequest != nil { + // Add signal event + sRequest := signalWithStartRequest.SignalWithStartRequest + if sRequest.GetRequestID() != "" { + mutableState.AddSignalRequested(sRequest.GetRequestID()) + } + _, err := mutableState.AddWorkflowExecutionSignaled( + sRequest.GetSignalName(), + sRequest.GetSignalInput(), + sRequest.GetIdentity(), + sRequest.GetRequestID(), + ) + if err != nil { + return &types.InternalServiceError{Message: "Failed to add workflow execution signaled event."} + } + } + + // Generate first decision task event if not child WF and no first decision task backoff + return e.generateFirstDecisionTask( + mutableState, + startRequest.ParentExecutionInfo, + startEvent, + ) +} + +func getTerminateIfRunningDetails(newRunID string) []byte { + return []byte(fmt.Sprintf(TerminateIfRunningDetailsTemplate, newRunID)) +} + +func (e *historyEngineImpl) applyWorkflowIDReusePolicyForSigWithStart( + prevExecutionInfo *persistence.WorkflowExecutionInfo, + execution types.WorkflowExecution, + wfIDReusePolicy types.WorkflowIDReusePolicy, +) error { + + prevStartRequestID := prevExecutionInfo.CreateRequestID + prevRunID := prevExecutionInfo.RunID + prevState := prevExecutionInfo.State + prevCloseState := prevExecutionInfo.CloseStatus + + return e.applyWorkflowIDReusePolicyHelper( + prevStartRequestID, + 
prevRunID,
+		prevState,
+		prevCloseState,
+		execution,
+		wfIDReusePolicy,
+	)
+}
+
+func (e *historyEngineImpl) applyWorkflowIDReusePolicyHelper(
+	prevStartRequestID,
+	prevRunID string,
+	prevState int,
+	prevCloseState int,
+	execution types.WorkflowExecution,
+	wfIDReusePolicy types.WorkflowIDReusePolicy,
+) error {
+
+	// Here we know some information about the previous workflow: it is either running right now
+	// or it has history. Check whether the workflow is finished.
+	switch prevState {
+	case persistence.WorkflowStateCreated,
+		persistence.WorkflowStateRunning:
+		msg := "Workflow execution is already running. WorkflowId: %v, RunId: %v."
+		return getWorkflowAlreadyStartedError(msg, prevStartRequestID, execution.GetWorkflowID(), prevRunID)
+	case persistence.WorkflowStateCompleted:
+		// previous workflow completed, proceed
+	case persistence.WorkflowStateCorrupted:
+		// ignore workflow ID reuse policy for corrupted workflows, treat them as if they do not exist
+		return nil
+	default:
+		// persistence.WorkflowStateZombie or unknown type
+		return &types.InternalServiceError{Message: fmt.Sprintf("Failed to process workflow, workflow has invalid state: %v.", prevState)}
+	}
+
+	switch wfIDReusePolicy {
+	case types.WorkflowIDReusePolicyAllowDuplicateFailedOnly:
+		if _, ok := FailedWorkflowCloseState[prevCloseState]; !ok {
+			msg := "Workflow execution already finished successfully. WorkflowId: %v, RunId: %v. Workflow ID reuse policy: allow duplicate workflow ID if last run failed."
+			return getWorkflowAlreadyStartedError(msg, prevStartRequestID, execution.GetWorkflowID(), prevRunID)
+		}
+	case types.WorkflowIDReusePolicyAllowDuplicate,
+		types.WorkflowIDReusePolicyTerminateIfRunning:
+		// no check needed here
+	case types.WorkflowIDReusePolicyRejectDuplicate:
+		msg := "Workflow execution already finished. WorkflowId: %v, RunId: %v. Workflow ID reuse policy: reject duplicate workflow ID."
+		return getWorkflowAlreadyStartedError(msg, prevStartRequestID, execution.GetWorkflowID(), prevRunID)
+	default:
+		return &types.InternalServiceError{Message: "Failed to process start workflow reuse policy."}
+	}
+
+	return nil
+}
+
+func getWorkflowAlreadyStartedError(errMsg string, createRequestID string, workflowID string, runID string) error {
+	return &types.WorkflowExecutionAlreadyStartedError{
+		Message:        fmt.Sprintf(errMsg, workflowID, runID),
+		StartRequestID: createRequestID,
+		RunID:          runID,
+	}
+}
+
+func (e *historyEngineImpl) newChildContext(
+	parentCtx context.Context,
+) (context.Context, context.CancelFunc) {
+
+	ctxTimeout := contextLockTimeout
+	if deadline, ok := parentCtx.Deadline(); ok {
+		now := e.shard.GetTimeSource().Now()
+		parentTimeout := deadline.Sub(now)
+		if parentTimeout > 0 && parentTimeout < contextLockTimeout {
+			ctxTimeout = parentTimeout
+		}
+	}
+	return context.WithTimeout(context.Background(), ctxTimeout)
+}
+
+func (e *historyEngineImpl) createMutableState(domainEntry *cache.DomainCacheEntry, runID string) (execution.MutableState, error) {
+
+	newMutableState := execution.NewMutableStateBuilderWithVersionHistories(
+		e.shard,
+		e.logger,
+		domainEntry,
+	)
+
+	if err := newMutableState.SetHistoryTree(runID); err != nil {
+		return nil, err
+	}
+
+	return newMutableState, nil
+}
+
+func (e *historyEngineImpl) generateFirstDecisionTask(
+	mutableState execution.MutableState,
+	parentInfo *types.ParentExecutionInfo,
+	startEvent *types.HistoryEvent,
+) error {
+
+	if parentInfo == nil {
+		// DecisionTask is only created when it is not a Child Workflow and no backoff is needed
+		if err := mutableState.AddFirstDecisionTaskScheduled(
+			startEvent,
+		); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/service/history/engine/engineimpl/terminate_workflow_execution.go b/service/history/engine/engineimpl/terminate_workflow_execution.go
new file mode 100644
index 00000000000..73fd6d2782f
--- /dev/null
+++ b/service/history/engine/engineimpl/terminate_workflow_execution.go
@@ -0,0 +1,102 @@
+// Copyright (c) 2017-2021 Uber Technologies, Inc.
+// Portions of the Software are attributed to Copyright (c) 2021 Temporal Technologies Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
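+
+// terminate_workflow_execution.go holds the TerminateWorkflowExecution handler of the history engine.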
+
+package engineimpl
+
+import (
+	"context"
+
+	"github.com/uber/cadence/common/types"
+	"github.com/uber/cadence/service/history/execution"
+	"github.com/uber/cadence/service/history/workflow"
+)
+
+func (e *historyEngineImpl) TerminateWorkflowExecution(
+	ctx context.Context,
+	terminateRequest *types.HistoryTerminateWorkflowExecutionRequest,
+) error {
+
+	domainEntry, err := e.getActiveDomainByID(terminateRequest.DomainUUID)
+	if err != nil {
+		return err
+	}
+	domainID := domainEntry.GetInfo().ID
+
+	request := terminateRequest.TerminateRequest
+	parentExecution := terminateRequest.ExternalWorkflowExecution
+	childWorkflowOnly := terminateRequest.GetChildWorkflowOnly()
+	workflowExecution := types.WorkflowExecution{
+		WorkflowID: request.WorkflowExecution.WorkflowID,
+	}
+	// If firstExecutionRunID is set on the request, always try to terminate the currently running execution
+	if request.GetFirstExecutionRunID() == "" {
+		workflowExecution.RunID = request.WorkflowExecution.RunID
+	}
+
+	return workflow.UpdateCurrentWithActionFunc(
+		ctx,
+		e.executionCache,
+		e.executionManager,
+		domainID,
+		e.shard.GetDomainCache(),
+		workflowExecution,
+		e.timeSource.Now(),
+		func(wfContext execution.Context, mutableState execution.MutableState) (*workflow.UpdateAction, error) {
+			if !mutableState.IsWorkflowExecutionRunning() {
+				return nil, workflow.ErrAlreadyCompleted
+			}
+
+			executionInfo := mutableState.GetExecutionInfo()
+			if request.GetFirstExecutionRunID() != "" {
+				firstRunID := executionInfo.FirstExecutionRunID
+				if firstRunID == "" {
+					// This is needed for backwards compatibility. Workflow executions created with Cadence release v0.25.0 or earlier
+					// do not have FirstExecutionRunID stored as part of mutable state. If it is not set, load it from the
+					// workflow execution started event.
+ startEvent, err := mutableState.GetStartEvent(ctx) + if err != nil { + return nil, err + } + firstRunID = startEvent.GetWorkflowExecutionStartedEventAttributes().GetFirstExecutionRunID() + } + if request.GetFirstExecutionRunID() != firstRunID { + return nil, &types.EntityNotExistsError{Message: "Workflow execution not found"} + } + } + if childWorkflowOnly { + parentWorkflowID := executionInfo.ParentWorkflowID + parentRunID := executionInfo.ParentRunID + if parentExecution.GetWorkflowID() != parentWorkflowID || + parentExecution.GetRunID() != parentRunID { + return nil, workflow.ErrParentMismatch + } + } + + eventBatchFirstEventID := mutableState.GetNextEventID() + return workflow.UpdateWithoutDecision, execution.TerminateWorkflow( + mutableState, + eventBatchFirstEventID, + request.GetReason(), + request.GetDetails(), + request.GetIdentity(), + ) + }) +} From dedc023fa774f99827dd261ae289c1bbdede4cf0 Mon Sep 17 00:00:00 2001 From: Tim Li <47233368+timl3136@users.noreply.github.com> Date: Tue, 7 May 2024 13:11:48 -0700 Subject: [PATCH 05/15] Added unit tests for service/history/handler (#5970) * Added unit tests for history handler --- service/history/handler/handler_test.go | 436 +++++++++++++++++++++++- 1 file changed, 433 insertions(+), 3 deletions(-) diff --git a/service/history/handler/handler_test.go b/service/history/handler/handler_test.go index 9c2c8336ef7..250dd7842d9 100644 --- a/service/history/handler/handler_test.go +++ b/service/history/handler/handler_test.go @@ -764,6 +764,436 @@ func (s *handlerSuite) TestRespondActivityTaskFailed() { } } +func (s *handlerSuite) TestRespondActivityTaskCanceled() { + validInput := &types.HistoryRespondActivityTaskCanceledRequest{ + DomainUUID: testDomainID, + CancelRequest: &types.RespondActivityTaskCanceledRequest{ + TaskToken: []byte("task-token"), + Details: []byte("Details"), + Identity: "identity", + }, + } + testInput := map[string]struct { + input *types.HistoryRespondActivityTaskCanceledRequest + expectedError bool + mockFn func() + }{ + "valid input": { + input: validInput, + expectedError: false, + mockFn: func() { + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondActivityTaskCanceled(gomock.Any(), validInput).Return(nil).Times(1) + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + }, + }, + "empty domainID": { + input: &types.HistoryRespondActivityTaskCanceledRequest{ + DomainUUID: "", + }, + expectedError: true, + mockFn: func() {}, + }, + "ratelimit exceeded": { + input: &types.HistoryRespondActivityTaskCanceledRequest{ + DomainUUID: testDomainID, + CancelRequest: &types.RespondActivityTaskCanceledRequest{ + TaskToken: []byte("task-token"), + Details: []byte("Details"), + Identity: "identity", + }, + }, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(false).Times(1) + }, + }, + "token deserialization error": { + input: &types.HistoryRespondActivityTaskCanceledRequest{ + DomainUUID: testDomainID, + CancelRequest: &types.RespondActivityTaskCanceledRequest{ + TaskToken: []byte("task-token"), + Details: []byte("Details"), + Identity: "identity", + }, + }, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(nil, errors.New("some random 
error")).Times(1) + }, + }, + "invalid task token": { + input: &types.HistoryRespondActivityTaskCanceledRequest{ + DomainUUID: testDomainID, + CancelRequest: &types.RespondActivityTaskCanceledRequest{ + TaskToken: []byte("task-token"), + Details: []byte("Details"), + Identity: "identity", + }, + }, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: "", + RunID: "", + }, nil).Times(1) + }, + }, + "get engine error": { + input: &types.HistoryRespondActivityTaskCanceledRequest{ + DomainUUID: testDomainID, + CancelRequest: &types.RespondActivityTaskCanceledRequest{ + TaskToken: []byte("task-token"), + Details: []byte("Details"), + Identity: "identity", + }, + }, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(nil, errors.New("error")).Times(1) + }, + }, + "engine error": { + input: &types.HistoryRespondActivityTaskCanceledRequest{ + DomainUUID: testDomainID, + CancelRequest: &types.RespondActivityTaskCanceledRequest{ + TaskToken: []byte("task-token"), + Details: []byte("Details"), + Identity: "identity", + }, + }, + expectedError: true, + mockFn: func() { + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondActivityTaskCanceled(gomock.Any(), validInput).Return(errors.New("error")).Times(1) + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + }, + }, + } + + for name, input := range testInput { + s.Run(name, func() { + input.mockFn() + err := s.handler.RespondActivityTaskCanceled(context.Background(), input.input) + if input.expectedError { + s.Error(err) + } else { + s.NoError(err) + } + }) + } +} + +func (s *handlerSuite) TestRespondDecisionTaskCompleted() { + validReq := &types.HistoryRespondDecisionTaskCompletedRequest{ + DomainUUID: testDomainID, + CompleteRequest: &types.RespondDecisionTaskCompletedRequest{ + TaskToken: []byte("task-token"), + Decisions: []*types.Decision{ + { + DecisionType: types.DecisionTypeScheduleActivityTask.Ptr(), + }, + }, + ExecutionContext: nil, + Identity: "identity", + }, + } + validResp := &types.HistoryRespondDecisionTaskCompletedResponse{ + StartedResponse: &types.RecordDecisionTaskStartedResponse{ + WorkflowType: &types.WorkflowType{}, + }, + } + testInput := map[string]struct { + input *types.HistoryRespondDecisionTaskCompletedRequest + expectedError bool + mockFn func() + }{ + "valid input": { + input: validReq, + expectedError: false, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondDecisionTaskCompleted(gomock.Any(), validReq).Return(validResp, nil).Times(1) + }, + }, + "empty domainID": { + input: &types.HistoryRespondDecisionTaskCompletedRequest{ + DomainUUID: "", + }, + expectedError: true, + 
mockFn: func() {}, + }, + "ratelimit exceeded": { + input: validReq, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(false).Times(1) + }, + }, + "token deserialization error": { + input: &types.HistoryRespondDecisionTaskCompletedRequest{ + DomainUUID: testDomainID, + CompleteRequest: &types.RespondDecisionTaskCompletedRequest{ + TaskToken: []byte("task-token"), + Decisions: []*types.Decision{}, + }, + }, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(nil, errors.New("some random error")).Times(1) + }, + }, + "invalid task token": { + input: &types.HistoryRespondDecisionTaskCompletedRequest{ + DomainUUID: testDomainID, + CompleteRequest: &types.RespondDecisionTaskCompletedRequest{ + TaskToken: []byte("task-token"), + Decisions: []*types.Decision{}, + }, + }, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: "", + RunID: "", + }, nil).Times(1) + }, + }, + "get engine error": { + input: &types.HistoryRespondDecisionTaskCompletedRequest{ + DomainUUID: testDomainID, + CompleteRequest: &types.RespondDecisionTaskCompletedRequest{ + TaskToken: []byte("task-token"), + Decisions: []*types.Decision{}, + }, + }, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(nil, errors.New("error")).Times(1) + }, + }, + "engine error": { + input: validReq, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondDecisionTaskCompleted(gomock.Any(), validReq).Return(nil, errors.New("error")).Times(1) + }, + }, + } + + for name, input := range testInput { + s.Run(name, func() { + input.mockFn() + resp, err := s.handler.RespondDecisionTaskCompleted(context.Background(), input.input) + if input.expectedError { + s.Nil(resp) + s.Error(err) + } else { + s.NotNil(resp) + s.NoError(err) + } + }) + + } +} + +func (s *handlerSuite) TestRespondDecisionTaskFailed() { + validInput := &types.HistoryRespondDecisionTaskFailedRequest{ + DomainUUID: testDomainID, + FailedRequest: &types.RespondDecisionTaskFailedRequest{ + TaskToken: []byte("task-token"), + Cause: types.DecisionTaskFailedCauseBadBinary.Ptr(), + Details: []byte("Details"), + Identity: "identity", + }, + } + specialInput := &types.HistoryRespondDecisionTaskFailedRequest{ + DomainUUID: testDomainID, + FailedRequest: &types.RespondDecisionTaskFailedRequest{ + TaskToken: []byte("task-token"), + Cause: types.DecisionTaskFailedCauseUnhandledDecision.Ptr(), + Details: []byte("Details"), + Identity: "identity", + }, + } + testInput := map[string]struct { + input *types.HistoryRespondDecisionTaskFailedRequest + expectedError bool + mockFn func() + }{ + "valid input": { + input: validInput, + expectedError: false, + mockFn: func() { + 
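+				// Happy path: the rate limiter admits the call, the task token deserializes into a valid
+				// workflow identity, the engine is resolved, and RespondDecisionTaskFailed returns no error.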
s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondDecisionTaskFailed(gomock.Any(), validInput).Return(nil).Times(1) + }, + }, + "empty domainID": { + input: &types.HistoryRespondDecisionTaskFailedRequest{ + DomainUUID: "", + }, + expectedError: true, + mockFn: func() {}, + }, + "ratelimit exceeded": { + input: validInput, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(false).Times(1) + }, + }, + "token deserialization error": { + input: validInput, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(nil, errors.New("some random error")).Times(1) + }, + }, + "invalid task token": { + input: validInput, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: "", + RunID: "", + }, nil).Times(1) + }, + }, + "get engine error": { + input: validInput, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(nil, errors.New("error")).Times(1) + }, + }, + "engine error": { + input: validInput, + expectedError: true, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondDecisionTaskFailed(gomock.Any(), validInput).Return(errors.New("error")).Times(1) + }, + }, + "special domain": { + input: specialInput, + expectedError: false, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockResource.DomainCache.EXPECT().GetDomainName(gomock.Any()).Return("name", nil).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondDecisionTaskFailed(gomock.Any(), specialInput).Return(nil).Times(1) + }, + }, + "special domain2": { + input: specialInput, + expectedError: false, + mockFn: func() { + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockTokenSerializer.EXPECT().Deserialize(gomock.Any()).Return(&common.TaskToken{ + WorkflowID: testWorkflowID, + RunID: testValidUUID, + }, nil).Times(1) + s.mockResource.DomainCache.EXPECT().GetDomainName(gomock.Any()).Return("", errors.New("error")).Times(1) + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockEngine.EXPECT().RespondDecisionTaskFailed(gomock.Any(), specialInput).Return(nil).Times(1) + }, + }, + } + + for name, input := range testInput { + s.Run(name, func() { + input.mockFn() + err := 
s.handler.RespondDecisionTaskFailed(context.Background(), input.input) + if input.expectedError { + s.Error(err) + } else { + s.NoError(err) + } + }) + } +} + +func (s *handlerSuite) TestDescribeHistoryHost() { + request := &types.DescribeHistoryHostRequest{ + HostAddress: common.StringPtr("test"), + } + + mockStatus := map[string]int32{ + "initialized": 0, + "started": 1, + "stopped": 2, + } + + for status, value := range mockStatus { + s.mockResource.DomainCache.EXPECT().GetCacheSize().Return(int64(2), int64(3)).Times(1) + s.mockShardController.EXPECT().Status().Return(value).Times(1) + s.mockShardController.EXPECT().NumShards().Return(1) + s.mockShardController.EXPECT().ShardIDs().Return([]int32{0}) + resp, err := s.handler.DescribeHistoryHost(context.Background(), request) + s.NoError(err) + s.Equal(resp.DomainCache, &types.DomainCacheInfo{ + NumOfItemsInCacheByID: 2, + NumOfItemsInCacheByName: 3, + }) + s.Equal(resp.ShardControllerStatus, status) + } +} + func (s *handlerSuite) TestGetCrossClusterTasks() { numShards := 10 targetCluster := cluster.TestAlternativeClusterName @@ -880,9 +1310,9 @@ func (s *handlerSuite) TestStartWorkflowExecution() { RunID: testWorkflowRunID, } - s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).AnyTimes() - s.mockRatelimiter.EXPECT().Allow().Return(true).AnyTimes() - s.mockEngine.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any()).Return(expectedResponse, nil).AnyTimes() + s.mockShardController.EXPECT().GetEngine(testWorkflowID).Return(s.mockEngine, nil).Times(1) + s.mockRatelimiter.EXPECT().Allow().Return(true).Times(1) + s.mockEngine.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any()).Return(expectedResponse, nil).Times(1) response, err := s.handler.StartWorkflowExecution(context.Background(), request) s.Equal(expectedResponse, response) From b28766eb737c610437b7aa7ddea7835b0d6a7f81 Mon Sep 17 00:00:00 2001 From: Zijian Date: Tue, 7 May 2024 13:47:51 -0700 Subject: [PATCH 06/15] Add unit tests for mutable state task refresher (#5971) --- .../execution/mutable_state_task_refresher.go | 177 ++--- .../mutable_state_task_refresher_test.go | 686 ++++++++++++++++++ 2 files changed, 778 insertions(+), 85 deletions(-) create mode 100644 service/history/execution/mutable_state_task_refresher_test.go diff --git a/service/history/execution/mutable_state_task_refresher.go b/service/history/execution/mutable_state_task_refresher.go index 365c041cada..34b815f5a1e 100644 --- a/service/history/execution/mutable_state_task_refresher.go +++ b/service/history/execution/mutable_state_task_refresher.go @@ -49,6 +49,18 @@ type ( domainCache cache.DomainCache eventsCache events.Cache shardID int + + newMutableStateTaskGeneratorFn func(cluster.Metadata, cache.DomainCache, MutableState) MutableStateTaskGenerator + refreshTasksForWorkflowStartFn func(context.Context, time.Time, MutableState, MutableStateTaskGenerator) error + refreshTasksForWorkflowCloseFn func(context.Context, MutableState, MutableStateTaskGenerator, int) error + refreshTasksForRecordWorkflowStartedFn func(context.Context, MutableState, MutableStateTaskGenerator) error + refreshTasksForDecisionFn func(context.Context, MutableState, MutableStateTaskGenerator) error + refreshTasksForActivityFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache, func(MutableState) TimerSequence) error + refreshTasksForTimerFn func(context.Context, MutableState, MutableStateTaskGenerator, func(MutableState) TimerSequence) error + 
refreshTasksForChildWorkflowFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error + refreshTasksForRequestCancelExternalWorkflowFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error + refreshTasksForSignalExternalWorkflowFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error + refreshTasksForWorkflowSearchAttrFn func(context.Context, MutableState, MutableStateTaskGenerator) error } ) @@ -60,13 +72,24 @@ func NewMutableStateTaskRefresher( eventsCache events.Cache, shardID int, ) MutableStateTaskRefresher { - return &mutableStateTaskRefresherImpl{ config: config, clusterMetadata: clusterMetadata, domainCache: domainCache, eventsCache: eventsCache, shardID: shardID, + + newMutableStateTaskGeneratorFn: NewMutableStateTaskGenerator, + refreshTasksForWorkflowStartFn: refreshTasksForWorkflowStart, + refreshTasksForWorkflowCloseFn: refreshTasksForWorkflowClose, + refreshTasksForRecordWorkflowStartedFn: refreshTasksForRecordWorkflowStarted, + refreshTasksForDecisionFn: refreshTasksForDecision, + refreshTasksForActivityFn: refreshTasksForActivity, + refreshTasksForTimerFn: refreshTasksForTimer, + refreshTasksForChildWorkflowFn: refreshTasksForChildWorkflow, + refreshTasksForRequestCancelExternalWorkflowFn: refreshTasksForRequestCancelExternalWorkflow, + refreshTasksForSignalExternalWorkflowFn: refreshTasksForSignalExternalWorkflow, + refreshTasksForWorkflowSearchAttrFn: refreshTasksForWorkflowSearchAttr, } } @@ -75,14 +98,13 @@ func (r *mutableStateTaskRefresherImpl) RefreshTasks( startTime time.Time, mutableState MutableState, ) error { - - taskGenerator := NewMutableStateTaskGenerator( + taskGenerator := r.newMutableStateTaskGeneratorFn( r.clusterMetadata, r.domainCache, mutableState, ) - if err := r.refreshTasksForWorkflowStart( + if err := r.refreshTasksForWorkflowStartFn( ctx, startTime, mutableState, @@ -91,15 +113,16 @@ func (r *mutableStateTaskRefresherImpl) RefreshTasks( return err } - if err := r.refreshTasksForWorkflowClose( + if err := r.refreshTasksForWorkflowCloseFn( ctx, mutableState, taskGenerator, + r.config.WorkflowDeletionJitterRange(mutableState.GetDomainEntry().GetInfo().Name), ); err != nil { return err } - if err := r.refreshTasksForRecordWorkflowStarted( + if err := r.refreshTasksForRecordWorkflowStartedFn( ctx, mutableState, taskGenerator, @@ -107,7 +130,7 @@ func (r *mutableStateTaskRefresherImpl) RefreshTasks( return err } - if err := r.refreshTasksForDecision( + if err := r.refreshTasksForDecisionFn( ctx, mutableState, taskGenerator, @@ -115,48 +138,58 @@ func (r *mutableStateTaskRefresherImpl) RefreshTasks( return err } - if err := r.refreshTasksForActivity( + if err := r.refreshTasksForActivityFn( ctx, mutableState, taskGenerator, + r.shardID, + r.eventsCache, + NewTimerSequence, ); err != nil { return err } - if err := r.refreshTasksForTimer( + if err := r.refreshTasksForTimerFn( ctx, mutableState, taskGenerator, + NewTimerSequence, ); err != nil { return err } - if err := r.refreshTasksForChildWorkflow( + if err := r.refreshTasksForChildWorkflowFn( ctx, mutableState, taskGenerator, + r.shardID, + r.eventsCache, ); err != nil { return err } - if err := r.refreshTasksForRequestCancelExternalWorkflow( + if err := r.refreshTasksForRequestCancelExternalWorkflowFn( ctx, mutableState, taskGenerator, + r.shardID, + r.eventsCache, ); err != nil { return err } - if err := r.refreshTasksForSignalExternalWorkflow( + if err := r.refreshTasksForSignalExternalWorkflowFn( 
ctx, mutableState, taskGenerator, + r.shardID, + r.eventsCache, ); err != nil { return err } if common.IsAdvancedVisibilityWritingEnabled(r.config.AdvancedVisibilityWritingMode(), r.config.IsAdvancedVisConfigExist) { - if err := r.refreshTasksForWorkflowSearchAttr( + if err := r.refreshTasksForWorkflowSearchAttrFn( ctx, mutableState, taskGenerator, @@ -168,13 +201,12 @@ func (r *mutableStateTaskRefresherImpl) RefreshTasks( return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForWorkflowStart( +func refreshTasksForWorkflowStart( ctx context.Context, startTime time.Time, mutableState MutableState, taskGenerator MutableStateTaskGenerator, ) error { - startEvent, err := mutableState.GetStartEvent(ctx) if err != nil { return err @@ -199,12 +231,12 @@ func (r *mutableStateTaskRefresherImpl) refreshTasksForWorkflowStart( return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForWorkflowClose( +func refreshTasksForWorkflowClose( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, + workflowDeletionTaskJitterRange int, ) error { - executionInfo := mutableState.GetExecutionInfo() if executionInfo.CloseStatus != persistence.WorkflowCloseStatusNone { closeEvent, err := mutableState.GetCompletionEvent(ctx) @@ -213,41 +245,35 @@ func (r *mutableStateTaskRefresherImpl) refreshTasksForWorkflowClose( } return taskGenerator.GenerateWorkflowCloseTasks( closeEvent, - r.config.WorkflowDeletionJitterRange(mutableState.GetDomainEntry().GetInfo().Name), + workflowDeletionTaskJitterRange, ) } - return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForRecordWorkflowStarted( +func refreshTasksForRecordWorkflowStarted( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, ) error { - - startEvent, err := mutableState.GetStartEvent(ctx) - if err != nil { - return err - } - executionInfo := mutableState.GetExecutionInfo() - if executionInfo.CloseStatus == persistence.WorkflowCloseStatusNone { + startEvent, err := mutableState.GetStartEvent(ctx) + if err != nil { + return err + } return taskGenerator.GenerateRecordWorkflowStartedTasks( startEvent, ) } - return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForDecision( +func refreshTasksForDecision( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, ) error { - if !mutableState.HasPendingDecision() { // no decision task at all return nil @@ -271,39 +297,35 @@ func (r *mutableStateTaskRefresherImpl) refreshTasksForDecision( ) } -func (r *mutableStateTaskRefresherImpl) refreshTasksForActivity( +func refreshTasksForActivity( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, + shardID int, + eventsCache events.Cache, + newTimerSequenceFn func(MutableState) TimerSequence, ) error { - - executionInfo := mutableState.GetExecutionInfo() - pendingActivityInfos := mutableState.GetPendingActivityInfos() - currentBranchToken, err := mutableState.GetCurrentBranchToken() if err != nil { return err } - -Loop: + executionInfo := mutableState.GetExecutionInfo() + pendingActivityInfos := mutableState.GetPendingActivityInfos() for _, activityInfo := range pendingActivityInfos { // clear all activity timer task mask for later activity timer task re-generation activityInfo.TimerTaskStatus = TimerTaskStatusNone - // need to update activity timer task mask for which task is generated if err := mutableState.UpdateActivity( activityInfo, ); err != nil { return err } - if activityInfo.StartedID != 
common.EmptyEventID { - continue Loop + continue } - - scheduleEvent, err := r.eventsCache.GetEvent( + scheduleEvent, err := eventsCache.GetEvent( ctx, - r.shardID, + shardID, executionInfo.DomainID, executionInfo.WorkflowID, executionInfo.RunID, @@ -314,7 +336,6 @@ Loop: if err != nil { return err } - if err := taskGenerator.GenerateActivityTransferTasks( scheduleEvent, ); err != nil { @@ -322,7 +343,7 @@ Loop: } } - if _, err := NewTimerSequence( + if _, err := newTimerSequenceFn( mutableState, ).CreateNextActivityTimer(); err != nil { return err @@ -331,12 +352,12 @@ Loop: return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForTimer( +func refreshTasksForTimer( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, + newTimerSequenceFn func(MutableState) TimerSequence, ) error { - pendingTimerInfos := mutableState.GetPendingTimerInfos() for _, timerInfo := range pendingTimerInfos { @@ -351,38 +372,33 @@ func (r *mutableStateTaskRefresherImpl) refreshTasksForTimer( } } - if _, err := NewTimerSequence( - mutableState, - ).CreateNextUserTimer(); err != nil { + if _, err := newTimerSequenceFn(mutableState).CreateNextUserTimer(); err != nil { return err } return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForChildWorkflow( +func refreshTasksForChildWorkflow( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, + shardID int, + eventsCache events.Cache, ) error { - - executionInfo := mutableState.GetExecutionInfo() - pendingChildWorkflowInfos := mutableState.GetPendingChildExecutionInfos() - currentBranchToken, err := mutableState.GetCurrentBranchToken() if err != nil { return err } - -Loop: + executionInfo := mutableState.GetExecutionInfo() + pendingChildWorkflowInfos := mutableState.GetPendingChildExecutionInfos() for _, childWorkflowInfo := range pendingChildWorkflowInfos { if childWorkflowInfo.StartedID != common.EmptyEventID { - continue Loop + continue } - - scheduleEvent, err := r.eventsCache.GetEvent( + scheduleEvent, err := eventsCache.GetEvent( ctx, - r.shardID, + shardID, executionInfo.DomainID, executionInfo.WorkflowID, executionInfo.RunID, @@ -393,35 +409,32 @@ Loop: if err != nil { return err } - if err := taskGenerator.GenerateChildWorkflowTasks( scheduleEvent, ); err != nil { return err } } - return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForRequestCancelExternalWorkflow( +func refreshTasksForRequestCancelExternalWorkflow( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, + shardID int, + eventsCache events.Cache, ) error { - - executionInfo := mutableState.GetExecutionInfo() - pendingRequestCancelInfos := mutableState.GetPendingRequestCancelExternalInfos() - currentBranchToken, err := mutableState.GetCurrentBranchToken() if err != nil { return err } - + executionInfo := mutableState.GetExecutionInfo() + pendingRequestCancelInfos := mutableState.GetPendingRequestCancelExternalInfos() for _, requestCancelInfo := range pendingRequestCancelInfos { - initiateEvent, err := r.eventsCache.GetEvent( + initiateEvent, err := eventsCache.GetEvent( ctx, - r.shardID, + shardID, executionInfo.DomainID, executionInfo.WorkflowID, executionInfo.RunID, @@ -432,35 +445,32 @@ func (r *mutableStateTaskRefresherImpl) refreshTasksForRequestCancelExternalWork if err != nil { return err } - if err := taskGenerator.GenerateRequestCancelExternalTasks( initiateEvent, ); err != nil { return err } } - return nil } -func (r 
*mutableStateTaskRefresherImpl) refreshTasksForSignalExternalWorkflow( +func refreshTasksForSignalExternalWorkflow( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, + shardID int, + eventsCache events.Cache, ) error { - - executionInfo := mutableState.GetExecutionInfo() - pendingSignalInfos := mutableState.GetPendingSignalExternalInfos() - currentBranchToken, err := mutableState.GetCurrentBranchToken() if err != nil { return err } - + executionInfo := mutableState.GetExecutionInfo() + pendingSignalInfos := mutableState.GetPendingSignalExternalInfos() for _, signalInfo := range pendingSignalInfos { - initiateEvent, err := r.eventsCache.GetEvent( + initiateEvent, err := eventsCache.GetEvent( ctx, - r.shardID, + shardID, executionInfo.DomainID, executionInfo.WorkflowID, executionInfo.RunID, @@ -471,22 +481,19 @@ func (r *mutableStateTaskRefresherImpl) refreshTasksForSignalExternalWorkflow( if err != nil { return err } - if err := taskGenerator.GenerateSignalExternalTasks( initiateEvent, ); err != nil { return err } } - return nil } -func (r *mutableStateTaskRefresherImpl) refreshTasksForWorkflowSearchAttr( +func refreshTasksForWorkflowSearchAttr( ctx context.Context, mutableState MutableState, taskGenerator MutableStateTaskGenerator, ) error { - return taskGenerator.GenerateWorkflowSearchAttrTasks() } diff --git a/service/history/execution/mutable_state_task_refresher_test.go b/service/history/execution/mutable_state_task_refresher_test.go new file mode 100644 index 00000000000..935da98e8d8 --- /dev/null +++ b/service/history/execution/mutable_state_task_refresher_test.go @@ -0,0 +1,686 @@ +// The MIT License (MIT) + +// Copyright (c) 2017-2020 Uber Technologies Inc. + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
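+
+// The tests below exercise the package-level refresh helpers (workflow start and close, record workflow
+// started, decision, activity, timer, child workflow, request-cancel external and signal external workflow)
+// using gomock-backed MutableState, task generator, and events cache mocks.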
+ +package execution + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/golang/mock/gomock" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/cache" + "github.com/uber/cadence/common/cluster" + "github.com/uber/cadence/common/dynamicconfig" + "github.com/uber/cadence/common/persistence" + "github.com/uber/cadence/common/types" + "github.com/uber/cadence/service/history/config" + "github.com/uber/cadence/service/history/events" +) + +func TestRefreshTasksForWorkflowStart(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator) + wantErr bool + }{ + { + name: "failed to get start event", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetStartEvent(gomock.Any()).Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate start tasks", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetStartEvent(gomock.Any()).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateWorkflowStartTasks(gomock.Any(), &types.HistoryEvent{ID: 1}).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate delayed decision tasks", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + startEvent := &types.HistoryEvent{ + ID: 1, + WorkflowExecutionStartedEventAttributes: &types.WorkflowExecutionStartedEventAttributes{ + FirstDecisionTaskBackoffSeconds: common.Ptr[int32](10), + }, + } + ms.EXPECT().GetStartEvent(gomock.Any()).Return(startEvent, nil) + mtg.EXPECT().GenerateWorkflowStartTasks(gomock.Any(), gomock.Any()).Return(nil) + ms.EXPECT().HasProcessedOrPendingDecision().Return(false) + mtg.EXPECT().GenerateDelayedDecisionTasks(startEvent).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + startEvent := &types.HistoryEvent{ + ID: 1, + WorkflowExecutionStartedEventAttributes: &types.WorkflowExecutionStartedEventAttributes{ + FirstDecisionTaskBackoffSeconds: common.Ptr[int32](10), + }, + } + ms.EXPECT().GetStartEvent(gomock.Any()).Return(startEvent, nil) + mtg.EXPECT().GenerateWorkflowStartTasks(gomock.Any(), gomock.Any()).Return(nil) + ms.EXPECT().HasProcessedOrPendingDecision().Return(false) + mtg.EXPECT().GenerateDelayedDecisionTasks(startEvent).Return(nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + tc.mockSetup(ms, mtg) + err := refreshTasksForWorkflowStart(context.Background(), time.Now(), ms, mtg) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForWorkflowStart err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForWorkflowClose(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator) + wantErr bool + }{ + { + name: "failed to get completion event", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{CloseStatus: persistence.WorkflowCloseStatusCompleted}) + ms.EXPECT().GetCompletionEvent(gomock.Any()).Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate close tasks", + mockSetup: 
func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{CloseStatus: persistence.WorkflowCloseStatusCompleted}) + ms.EXPECT().GetCompletionEvent(gomock.Any()).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateWorkflowCloseTasks(&types.HistoryEvent{ID: 1}, gomock.Any()).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success - open workflow", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{CloseStatus: persistence.WorkflowCloseStatusNone}) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + tc.mockSetup(ms, mtg) + err := refreshTasksForWorkflowClose(context.Background(), ms, mtg, 100) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForWorkflowClose err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForRecordWorkflowStarted(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator) + wantErr bool + }{ + { + name: "failed to get start event", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{CloseStatus: persistence.WorkflowCloseStatusNone}) + ms.EXPECT().GetStartEvent(gomock.Any()).Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate record started tasks", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetStartEvent(gomock.Any()).Return(&types.HistoryEvent{ID: 1}, nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{CloseStatus: persistence.WorkflowCloseStatusNone}) + mtg.EXPECT().GenerateRecordWorkflowStartedTasks(&types.HistoryEvent{ID: 1}).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success - closed workflow", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{CloseStatus: persistence.WorkflowCloseStatusCompleted}) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + tc.mockSetup(ms, mtg) + err := refreshTasksForRecordWorkflowStarted(context.Background(), ms, mtg) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForRecordWorkflowStarted err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForDecision(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator) + wantErr bool + }{ + { + name: "success - no pending decision", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().HasPendingDecision().Return(false) + }, + wantErr: false, + }, + { + name: "bug - cannot get pending decision", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().HasPendingDecision().Return(true) + ms.EXPECT().GetPendingDecision().Return(nil, false) + }, + wantErr: true, + }, + { + name: "success - generate decision started task", + mockSetup: func(ms 
*MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().HasPendingDecision().Return(true) + ms.EXPECT().GetPendingDecision().Return(&DecisionInfo{ScheduleID: 2, StartedID: 3}, true) + mtg.EXPECT().GenerateDecisionStartTasks(int64(2)).Return(nil) + }, + wantErr: false, + }, + { + name: "success - generate decision started task", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator) { + ms.EXPECT().HasPendingDecision().Return(true) + ms.EXPECT().GetPendingDecision().Return(&DecisionInfo{ScheduleID: 2, StartedID: common.EmptyEventID}, true) + mtg.EXPECT().GenerateDecisionScheduleTasks(int64(2)).Return(nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + tc.mockSetup(ms, mtg) + err := refreshTasksForDecision(context.Background(), ms, mtg) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForDecision err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForActivity(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator, *events.MockCache, *MockTimerSequence) + wantErr bool + }{ + { + name: "failed to get current branch token", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache, mt *MockTimerSequence) { + ms.EXPECT().GetCurrentBranchToken().Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to update activity", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache, mt *MockTimerSequence) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{}) + ms.EXPECT().GetPendingActivityInfos().Return(map[int64]*persistence.ActivityInfo{1: {Version: 1, TimerTaskStatus: TimerTaskStatusCreated}}) + ms.EXPECT().UpdateActivity(&persistence.ActivityInfo{Version: 1, TimerTaskStatus: TimerTaskStatusNone}).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to get event", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache, mt *MockTimerSequence) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingActivityInfos().Return(map[int64]*persistence.ActivityInfo{1: {Version: 1, TimerTaskStatus: TimerTaskStatusCreated, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}}) + ms.EXPECT().UpdateActivity(&persistence.ActivityInfo{Version: 1, TimerTaskStatus: TimerTaskStatusNone, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}).Return(nil) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(11), int64(12), []byte("token")).Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate activity tasks", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache, mt *MockTimerSequence) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + 
ms.EXPECT().GetPendingActivityInfos().Return(map[int64]*persistence.ActivityInfo{1: {Version: 1, TimerTaskStatus: TimerTaskStatusCreated, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}}) + ms.EXPECT().UpdateActivity(&persistence.ActivityInfo{Version: 1, TimerTaskStatus: TimerTaskStatusNone, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}).Return(nil) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(11), int64(12), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateActivityTransferTasks(&types.HistoryEvent{ID: 1}).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to create activity timer", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache, mt *MockTimerSequence) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingActivityInfos().Return(map[int64]*persistence.ActivityInfo{1: {Version: 1, TimerTaskStatus: TimerTaskStatusCreated, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}}) + ms.EXPECT().UpdateActivity(&persistence.ActivityInfo{Version: 1, TimerTaskStatus: TimerTaskStatusNone, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}).Return(nil) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(11), int64(12), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateActivityTransferTasks(&types.HistoryEvent{ID: 1}).Return(nil) + mt.EXPECT().CreateNextActivityTimer().Return(false, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache, mt *MockTimerSequence) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingActivityInfos().Return(map[int64]*persistence.ActivityInfo{10: {Version: 0, StartedID: 10}, 1: {Version: 1, TimerTaskStatus: TimerTaskStatusCreated, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}}) + ms.EXPECT().UpdateActivity(&persistence.ActivityInfo{Version: 0, StartedID: 10, TimerTaskStatus: TimerTaskStatusNone}).Return(nil) + ms.EXPECT().UpdateActivity(&persistence.ActivityInfo{Version: 1, TimerTaskStatus: TimerTaskStatusNone, ScheduledEventBatchID: 11, ScheduleID: 12, StartedID: common.EmptyEventID}).Return(nil) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(11), int64(12), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateActivityTransferTasks(&types.HistoryEvent{ID: 1}).Return(nil) + mt.EXPECT().CreateNextActivityTimer().Return(true, nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + mc := events.NewMockCache(ctrl) + mt := NewMockTimerSequence(ctrl) + tc.mockSetup(ms, mtg, mc, mt) + err := refreshTasksForActivity(context.Background(), ms, mtg, 1, mc, func(MutableState) TimerSequence { return mt }) + if (err != nil) != tc.wantErr { + 
t.Errorf("refreshTasksForActivity err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForTimer(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator, *MockTimerSequence) + wantErr bool + }{ + { + name: "failed to update user timer", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mt *MockTimerSequence) { + ms.EXPECT().GetPendingTimerInfos().Return(map[string]*persistence.TimerInfo{"0": &persistence.TimerInfo{}}) + ms.EXPECT().UpdateUserTimer(gomock.Any()).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to create user timer", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mt *MockTimerSequence) { + ms.EXPECT().GetPendingTimerInfos().Return(map[string]*persistence.TimerInfo{"0": &persistence.TimerInfo{}}) + ms.EXPECT().UpdateUserTimer(gomock.Any()).Return(nil) + mt.EXPECT().CreateNextUserTimer().Return(false, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mt *MockTimerSequence) { + ms.EXPECT().GetPendingTimerInfos().Return(map[string]*persistence.TimerInfo{"0": &persistence.TimerInfo{}}) + ms.EXPECT().UpdateUserTimer(gomock.Any()).Return(nil) + mt.EXPECT().CreateNextUserTimer().Return(false, nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + mt := NewMockTimerSequence(ctrl) + tc.mockSetup(ms, mtg, mt) + err := refreshTasksForTimer(context.Background(), ms, mtg, func(MutableState) TimerSequence { return mt }) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForTimer err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForChildWorkflow(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator, *events.MockCache) + wantErr bool + }{ + { + name: "failed to get current branch token", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to get event", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingChildExecutionInfos().Return(map[int64]*persistence.ChildExecutionInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, StartedID: common.EmptyEventID, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate child workflow tasks", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingChildExecutionInfos().Return(map[int64]*persistence.ChildExecutionInfo{1: 
{InitiatedEventBatchID: 1, InitiatedID: 2, StartedID: common.EmptyEventID, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateChildWorkflowTasks(&types.HistoryEvent{ID: 1}).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingChildExecutionInfos().Return(map[int64]*persistence.ChildExecutionInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, StartedID: common.EmptyEventID, Version: 1}, 11: {StartedID: 12}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateChildWorkflowTasks(&types.HistoryEvent{ID: 1}).Return(nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + mc := events.NewMockCache(ctrl) + tc.mockSetup(ms, mtg, mc) + err := refreshTasksForChildWorkflow(context.Background(), ms, mtg, 1, mc) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForChildWorkflow err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForRequestCancelExternalWorkflow(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator, *events.MockCache) + wantErr bool + }{ + { + name: "failed to get current branch token", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to get event", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingRequestCancelExternalInfos().Return(map[int64]*persistence.RequestCancelInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate child workflow tasks", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingRequestCancelExternalInfos().Return(map[int64]*persistence.RequestCancelInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + 
mtg.EXPECT().GenerateRequestCancelExternalTasks(&types.HistoryEvent{ID: 1}).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingRequestCancelExternalInfos().Return(map[int64]*persistence.RequestCancelInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateRequestCancelExternalTasks(&types.HistoryEvent{ID: 1}).Return(nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + mc := events.NewMockCache(ctrl) + tc.mockSetup(ms, mtg, mc) + err := refreshTasksForRequestCancelExternalWorkflow(context.Background(), ms, mtg, 1, mc) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForRequestCancelExternalWorkflow err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForSignalExternalWorkflow(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableState, *MockMutableStateTaskGenerator, *events.MockCache) + wantErr bool + }{ + { + name: "failed to get current branch token", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to get event", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingSignalExternalInfos().Return(map[int64]*persistence.SignalInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(nil, errors.New("some error")) + }, + wantErr: true, + }, + { + name: "failed to generate child workflow tasks", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingSignalExternalInfos().Return(map[int64]*persistence.SignalInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateSignalExternalTasks(&types.HistoryEvent{ID: 1}).Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success", + mockSetup: func(ms *MockMutableState, mtg *MockMutableStateTaskGenerator, mc *events.MockCache) { + ms.EXPECT().GetCurrentBranchToken().Return([]byte("token"), nil) + 
ms.EXPECT().GetExecutionInfo().Return(&persistence.WorkflowExecutionInfo{DomainID: "domain-id", WorkflowID: "wf-id", RunID: "run-id"}) + ms.EXPECT().GetPendingSignalExternalInfos().Return(map[int64]*persistence.SignalInfo{1: {InitiatedEventBatchID: 1, InitiatedID: 2, Version: 1}}) + mc.EXPECT().GetEvent(gomock.Any(), gomock.Any(), "domain-id", "wf-id", "run-id", int64(1), int64(2), []byte("token")).Return(&types.HistoryEvent{ID: 1}, nil) + mtg.EXPECT().GenerateSignalExternalTasks(&types.HistoryEvent{ID: 1}).Return(nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + mc := events.NewMockCache(ctrl) + tc.mockSetup(ms, mtg, mc) + err := refreshTasksForSignalExternalWorkflow(context.Background(), ms, mtg, 1, mc) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForSignalExternalWorkflow err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasksForWorkflowSearchAttr(t *testing.T) { + testCases := []struct { + name string + mockSetup func(*MockMutableStateTaskGenerator) + wantErr bool + }{ + { + name: "failed to generate workflow search attribute tasks", + mockSetup: func(mtg *MockMutableStateTaskGenerator) { + mtg.EXPECT().GenerateWorkflowSearchAttrTasks().Return(errors.New("some error")) + }, + wantErr: true, + }, + { + name: "success", + mockSetup: func(mtg *MockMutableStateTaskGenerator) { + mtg.EXPECT().GenerateWorkflowSearchAttrTasks().Return(nil) + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mtg := NewMockMutableStateTaskGenerator(ctrl) + tc.mockSetup(mtg) + err := refreshTasksForWorkflowSearchAttr(context.Background(), nil, mtg) + if (err != nil) != tc.wantErr { + t.Errorf("refreshTasksForWorkflowSearchAttr err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} + +func TestRefreshTasks(t *testing.T) { + testCases := []struct { + name string + refreshTasksForWorkflowStartFn func(context.Context, time.Time, MutableState, MutableStateTaskGenerator) error + refreshTasksForWorkflowCloseFn func(context.Context, MutableState, MutableStateTaskGenerator, int) error + refreshTasksForRecordWorkflowStartedFn func(context.Context, MutableState, MutableStateTaskGenerator) error + refreshTasksForDecisionFn func(context.Context, MutableState, MutableStateTaskGenerator) error + refreshTasksForActivityFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache, func(MutableState) TimerSequence) error + refreshTasksForTimerFn func(context.Context, MutableState, MutableStateTaskGenerator, func(MutableState) TimerSequence) error + refreshTasksForChildWorkflowFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error + refreshTasksForRequestCancelExternalWorkflowFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error + refreshTasksForSignalExternalWorkflowFn func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error + refreshTasksForWorkflowSearchAttrFn func(context.Context, MutableState, MutableStateTaskGenerator) error + wantErr bool + }{ + { + name: "success", + refreshTasksForWorkflowStartFn: func(context.Context, time.Time, MutableState, MutableStateTaskGenerator) error { return nil }, + refreshTasksForWorkflowCloseFn: func(context.Context, MutableState, MutableStateTaskGenerator, int) error 
{ return nil }, + refreshTasksForRecordWorkflowStartedFn: func(context.Context, MutableState, MutableStateTaskGenerator) error { return nil }, + refreshTasksForDecisionFn: func(context.Context, MutableState, MutableStateTaskGenerator) error { return nil }, + refreshTasksForActivityFn: func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache, func(MutableState) TimerSequence) error { + return nil + }, + refreshTasksForTimerFn: func(context.Context, MutableState, MutableStateTaskGenerator, func(MutableState) TimerSequence) error { + return nil + }, + refreshTasksForChildWorkflowFn: func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error { return nil }, + refreshTasksForRequestCancelExternalWorkflowFn: func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error { return nil }, + refreshTasksForSignalExternalWorkflowFn: func(context.Context, MutableState, MutableStateTaskGenerator, int, events.Cache) error { return nil }, + refreshTasksForWorkflowSearchAttrFn: func(context.Context, MutableState, MutableStateTaskGenerator) error { return nil }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + ms := NewMockMutableState(ctrl) + mtg := NewMockMutableStateTaskGenerator(ctrl) + ms.EXPECT().GetDomainEntry().Return(cache.NewLocalDomainCacheEntryForTest(&persistence.DomainInfo{ID: "domain-id"}, nil, "test")).AnyTimes() + refresher := &mutableStateTaskRefresherImpl{ + config: &config.Config{ + AdvancedVisibilityWritingMode: dynamicconfig.GetStringPropertyFn(common.AdvancedVisibilityWritingModeOn), + WorkflowDeletionJitterRange: dynamicconfig.GetIntPropertyFilteredByDomain(1), + IsAdvancedVisConfigExist: true, + }, + newMutableStateTaskGeneratorFn: func(cluster.Metadata, cache.DomainCache, MutableState) MutableStateTaskGenerator { + return mtg + }, + refreshTasksForWorkflowStartFn: tc.refreshTasksForWorkflowStartFn, + refreshTasksForWorkflowCloseFn: tc.refreshTasksForWorkflowCloseFn, + refreshTasksForRecordWorkflowStartedFn: tc.refreshTasksForRecordWorkflowStartedFn, + refreshTasksForDecisionFn: tc.refreshTasksForDecisionFn, + refreshTasksForActivityFn: tc.refreshTasksForActivityFn, + refreshTasksForTimerFn: tc.refreshTasksForTimerFn, + refreshTasksForChildWorkflowFn: tc.refreshTasksForChildWorkflowFn, + refreshTasksForRequestCancelExternalWorkflowFn: tc.refreshTasksForRequestCancelExternalWorkflowFn, + refreshTasksForSignalExternalWorkflowFn: tc.refreshTasksForSignalExternalWorkflowFn, + refreshTasksForWorkflowSearchAttrFn: tc.refreshTasksForWorkflowSearchAttrFn, + } + err := refresher.RefreshTasks(context.Background(), time.Now(), ms) + if (err != nil) != tc.wantErr { + t.Errorf("RefreshTasks err = %v, wantErr %v", err, tc.wantErr) + } + }) + } +} From d45186dfb58e9e497454f3b18ecfa885df4840a6 Mon Sep 17 00:00:00 2001 From: taylanisikdemir Date: Tue, 7 May 2024 13:58:16 -0700 Subject: [PATCH 07/15] Revert codecov patch threshold to 85% (#5982) --- codecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codecov.yml b/codecov.yml index 18bc0b883c1..ec0a45dae55 100644 --- a/codecov.yml +++ b/codecov.yml @@ -19,7 +19,7 @@ coverage: if_ci_failed: ignore # require the CI to pass before setting the status patch: default: - target: 0% # specify the target coverage for each commit status + target: 85% # specify the target coverage for each commit status # option: "auto" (compare against parent commit or pull request base) 
# option: "X%" a static target percentage to hit threshold: 0% # allow the coverage drop by x% before marking as failure From c21a71a022d15ec9d6f07a83c042539650e1c5f9 Mon Sep 17 00:00:00 2001 From: Josue Alexander Ibarra <1480657+ibarrajo@users.noreply.github.com> Date: Tue, 7 May 2024 14:10:58 -0700 Subject: [PATCH 08/15] Api handler test respond activity task failed alternate (#5980) * TestRespondActivityTaskFailed_Success implementation * Alternate test for TerstRespondActivityTaskFailed * code lint --- service/frontend/api/handler_test.go | 116 ++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 4 deletions(-) diff --git a/service/frontend/api/handler_test.go b/service/frontend/api/handler_test.go index 212146ca776..0e58e945785 100644 --- a/service/frontend/api/handler_test.go +++ b/service/frontend/api/handler_test.go @@ -24,6 +24,7 @@ import ( "context" "encoding/json" "errors" + "fmt" "testing" "time" @@ -56,8 +57,9 @@ import ( ) const ( - numHistoryShards = 10 - + numHistoryShards = 10 + testDomain = "test-domain" + testDomainID = "e4f90ec0-1313-45be-9877-8aa41f72a45a" testWorkflowID = "test-workflow-id" testRunID = "2c8b555f-1f55-4955-9d1c-b980194555c9" testHistoryArchivalURI = "testScheme://history/URI" @@ -105,8 +107,8 @@ func (s *workflowHandlerSuite) TearDownSuite() { func (s *workflowHandlerSuite) SetupTest() { s.Assertions = require.New(s.T()) - s.testDomain = "test-domain" - s.testDomainID = "e4f90ec0-1313-45be-9877-8aa41f72a45a" + s.testDomain = testDomain + s.testDomainID = testDomainID s.controller = gomock.NewController(s.T()) s.mockResource = resource.NewTest(s.T(), s.controller, metrics.Frontend) @@ -722,6 +724,112 @@ func (s *workflowHandlerSuite) TestRespondActivityTaskCompletedByID_Success() { s.NoError(err) } +func buildRespondActivityTaskFailedRequest(taskToken common.TaskToken) *types.RespondActivityTaskFailedRequest { + serializer := common.NewJSONTaskTokenSerializer() + taskTokenBytes, err := serializer.Serialize(&taskToken) + if err != nil { + panic(err) + } + return &types.RespondActivityTaskFailedRequest{ + TaskToken: taskTokenBytes, + } +} + +func TestRespondActivityTaskFailed(t *testing.T) { + failedRequest := buildRespondActivityTaskFailedRequest(common.TaskToken{ + DomainID: testDomainID, + WorkflowID: testWorkflowID, + RunID: testRunID, + ActivityID: "1", + }) + + type fields struct { + shuttingDown int32 + } + + type args struct { + ctx context.Context + failedRequest *types.RespondActivityTaskFailedRequest + } + + tests := []struct { + name string + fields fields + setupMocks func(*resource.Test, *client.VersionCheckerMock) + args args + wantErr assert.ErrorAssertionFunc + }{ + { + name: "Success", + fields: fields{ + shuttingDown: 0, + }, + setupMocks: func(t *resource.Test, mockVersionChecker *client.VersionCheckerMock) { + mockVersionChecker.EXPECT().ClientSupported(gomock.Any(), gomock.Any()).Return(nil) + + t.HistoryClient.EXPECT().RespondActivityTaskFailed(gomock.Any(), &types.HistoryRespondActivityTaskFailedRequest{ + DomainUUID: testDomainID, + FailedRequest: failedRequest, + }).Return(nil) + + t.DomainCache.EXPECT().GetDomainName(gomock.Any()).Return("test-domain-id", nil) + }, + args: args{ + context.Background(), + failedRequest, + }, + wantErr: assert.NoError, + }, + { + name: "Error when shutting down", + fields: fields{shuttingDown: 1}, + setupMocks: func(t *resource.Test, mockVersionChecker *client.VersionCheckerMock) { + + }, + args: args{ + context.Background(), + buildRespondActivityTaskFailedRequest(common.TaskToken{ + 
DomainID: testDomainID, + WorkflowID: testWorkflowID, + RunID: testRunID, + ActivityID: "1", + }), + }, + wantErr: assert.Error, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockCtrl := gomock.NewController(t) + mockResource := resource.NewTest(t, mockCtrl, metrics.Frontend) + mockVersionChecker := client.NewMockVersionChecker(mockCtrl) + + tt.setupMocks(mockResource, mockVersionChecker) + + mockProducerManager := NewMockProducerManager(mockCtrl) + + config := frontendcfg.NewConfig( + dc.NewCollection( + dc.NewInMemoryClient(), + mockResource.GetLogger(), + ), + numHistoryShards, + false, + "hostname", + ) + + wh := NewWorkflowHandler(mockResource, config, mockVersionChecker, nil) + wh.shuttingDown = tt.fields.shuttingDown + wh.producerManager = mockProducerManager + + tt.wantErr(t, wh.RespondActivityTaskFailed(tt.args.ctx, tt.args.failedRequest), + fmt.Sprintf("RespondActivityTaskFailed(%v, %v)", tt.args.ctx, tt.args.failedRequest)) + }) + } + +} + func (s *workflowHandlerSuite) TestRegisterDomain_Failure_MissingDomainDataKey() { dynamicClient := dc.NewInMemoryClient() err := dynamicClient.UpdateValue(dc.RequiredDomainDataKeys, map[string]interface{}{"Tier": true}) From 7eed1c1c818fcff1785d1fea626af716ab64494f Mon Sep 17 00:00:00 2001 From: Nate Mortensen Date: Tue, 7 May 2024 14:11:12 -0700 Subject: [PATCH 09/15] Move shardscanner workflow tests to the shardscanner package (#5981) This ensures they get included in the code coverage metrics. --- .../shardscanner/scanner_workflow_test.go | 12 +- .../shardscanner/shardscannertest/workflow.go | 50 -- .../shardscannertest/workflow_test.go | 478 ----------------- .../scanner/shardscanner/workflows_test.go | 489 ++++++++++++++++++ 4 files changed, 495 insertions(+), 534 deletions(-) delete mode 100644 service/worker/scanner/shardscanner/shardscannertest/workflow.go delete mode 100644 service/worker/scanner/shardscanner/shardscannertest/workflow_test.go create mode 100644 service/worker/scanner/shardscanner/workflows_test.go diff --git a/service/worker/scanner/shardscanner/scanner_workflow_test.go b/service/worker/scanner/shardscanner/scanner_workflow_test.go index 00174f89233..6599874594f 100644 --- a/service/worker/scanner/shardscanner/scanner_workflow_test.go +++ b/service/worker/scanner/shardscanner/scanner_workflow_test.go @@ -35,16 +35,16 @@ import ( "github.com/uber/cadence/common/reconciliation/invariant" ) -type workflowsSuite struct { +type scannerWorkflowsSuite struct { suite.Suite testsuite.WorkflowTestSuite } func TestScannerWorkflowSuite(t *testing.T) { - suite.Run(t, new(workflowsSuite)) + suite.Run(t, new(scannerWorkflowsSuite)) } -func (s *workflowsSuite) TestGetBatchIndices() { +func (s *scannerWorkflowsSuite) TestGetBatchIndices() { testCases := []struct { batchSize int concurrency int @@ -101,7 +101,7 @@ func (s *workflowsSuite) TestGetBatchIndices() { } } -func (s *workflowsSuite) TestGetShardBatches() { +func (s *scannerWorkflowsSuite) TestGetShardBatches() { var shards []int for i := 5; i < 50; i += 2 { shards = append(shards, i) @@ -113,7 +113,7 @@ func (s *workflowsSuite) TestGetShardBatches() { }, batches) } -func (s *workflowsSuite) TestFlattenShards() { +func (s *scannerWorkflowsSuite) TestFlattenShards() { testCases := []struct { input Shards expectedList []int @@ -160,7 +160,7 @@ func (s *workflowsSuite) TestFlattenShards() { } } -func (s *workflowsSuite) TestValidateShards() { +func (s *scannerWorkflowsSuite) TestValidateShards() { testCases := []struct { shards Shards expectErr 
bool diff --git a/service/worker/scanner/shardscanner/shardscannertest/workflow.go b/service/worker/scanner/shardscanner/shardscannertest/workflow.go deleted file mode 100644 index 920b5f46be0..00000000000 --- a/service/worker/scanner/shardscanner/shardscannertest/workflow.go +++ /dev/null @@ -1,50 +0,0 @@ -// The MIT License (MIT) -// -// Copyright (c) 2017-2020 Uber Technologies Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -package shardscannertest - -import ( - "go.uber.org/cadence/workflow" - - "github.com/uber/cadence/service/worker/scanner/shardscanner" -) - -// NewTestWorkflow is a helper, no-op workflow used for testing purposes. -func NewTestWorkflow(ctx workflow.Context, name string, params shardscanner.ScannerWorkflowParams) error { - wf, err := shardscanner.NewScannerWorkflow(ctx, name, params) - if err != nil { - return err - } - - return wf.Start(ctx) -} - -// NewTestFixerWorkflow is a helper, no-op workflow used for testing purposes. -func NewTestFixerWorkflow(ctx workflow.Context, params shardscanner.FixerWorkflowParams) error { - wf, err := shardscanner.NewFixerWorkflow(ctx, "test-fixer", params) - if err != nil { - return err - } - - return wf.Start(ctx) - -} diff --git a/service/worker/scanner/shardscanner/shardscannertest/workflow_test.go b/service/worker/scanner/shardscanner/shardscannertest/workflow_test.go deleted file mode 100644 index a3d3ac97339..00000000000 --- a/service/worker/scanner/shardscanner/shardscannertest/workflow_test.go +++ /dev/null @@ -1,478 +0,0 @@ -// The MIT License (MIT) -// -// Copyright (c) 2017-2020 Uber Technologies Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -package shardscannertest - -import ( - "errors" - "testing" - - "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/suite" - "go.uber.org/cadence/testsuite" - "go.uber.org/cadence/workflow" - - "github.com/uber/cadence/common" - "github.com/uber/cadence/common/reconciliation/invariant" - "github.com/uber/cadence/common/reconciliation/store" - "github.com/uber/cadence/service/worker/scanner/shardscanner" -) - -type workflowsSuite struct { - suite.Suite - testsuite.WorkflowTestSuite -} - -func TestScannerWorkflowSuite(t *testing.T) { - suite.Run(t, new(workflowsSuite)) -} - -func (s *workflowsSuite) SetupSuite() { - workflow.Register(NewTestWorkflow) - workflow.Register(NewTestFixerWorkflow) - workflow.Register(shardscanner.GetCorruptedKeys) -} - -func (s *workflowsSuite) TestScannerWorkflow_Failure_ScanShard() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityScannerConfig, mock.Anything, mock.Anything).Return(shardscanner.ResolvedScannerWorkflowConfig{ - GenericScannerConfig: shardscanner.GenericScannerConfig{ - Enabled: true, - Concurrency: 3, - ActivityBatchSize: 5, - }, - }, nil) - shards := shardscanner.Shards{ - Range: &shardscanner.ShardRange{ - Min: 0, - Max: 30, - }, - } - - batches := [][]int{ - {0, 3, 6, 9, 12}, - {15, 18, 21, 24, 27}, - {1, 4, 7, 10, 13}, - {16, 19, 22, 25, 28}, - {2, 5, 8, 11, 14}, - {17, 20, 23, 26, 29}, - } - - for i, batch := range batches { - - var reports []shardscanner.ScanReport - var err error - if i == len(batches)-1 { - reports = nil - err = errors.New("scan shard activity got error") - } else { - err = nil - for _, s := range batch { - reports = append(reports, shardscanner.ScanReport{ - ShardID: s, - Stats: shardscanner.ScanStats{ - EntitiesCount: 10, - }, - Result: shardscanner.ScanResult{ - ControlFlowFailure: &shardscanner.ControlFlowFailure{ - Info: "got control flow failure", - }, - }, - }) - } - } - env.OnActivity(shardscanner.ActivityScanShard, mock.Anything, shardscanner.ScanShardActivityParams{ - Shards: batch, - }).Return(reports, err) - } - env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", shardscanner.ScannerWorkflowParams{ - Shards: shards, - }) - s.True(env.IsWorkflowCompleted()) - s.Equal("scan shard activity got error", env.GetWorkflowError().Error()) -} - -func (s *workflowsSuite) TestScannerWorkflow_Failure_ScannerConfigActivity() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityScannerConfig, mock.Anything, mock.Anything).Return(shardscanner.ResolvedScannerWorkflowConfig{}, errors.New("got error getting config")) - env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", shardscanner.ScannerWorkflowParams{ - Shards: shardscanner.Shards{ - List: []int{1, 2, 3}, - }, - }) - s.True(env.IsWorkflowCompleted()) - s.Equal("got error getting config", env.GetWorkflowError().Error()) -} - -func (s *workflowsSuite) TestScannerWorkflow_Requires_Name() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityScannerConfig, mock.Anything, mock.Anything).Return(shardscanner.ResolvedScannerWorkflowConfig{}, errors.New("got error getting config")) - env.ExecuteWorkflow(NewTestWorkflow, "", shardscanner.ScannerWorkflowParams{ - Shards: shardscanner.Shards{ - List: []int{1, 2, 
3}, - }, - }) - s.True(env.IsWorkflowCompleted()) - s.Equal("workflow name is not provided", env.GetWorkflowError().Error()) -} - -func (s *workflowsSuite) TestScannerWorkflow_Requires_Valid_ShardConfig() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityScannerConfig, mock.Anything, mock.Anything).Return(shardscanner.ResolvedScannerWorkflowConfig{}, errors.New("got error getting config")) - env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", shardscanner.ScannerWorkflowParams{}) - s.True(env.IsWorkflowCompleted()) - s.Equal("must provide either List or Range", env.GetWorkflowError().Error()) -} - -func (s *workflowsSuite) TestScannerWorkflow_Success_Disabled() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityScannerConfig, mock.Anything, mock.Anything).Return(shardscanner.ResolvedScannerWorkflowConfig{ - GenericScannerConfig: shardscanner.GenericScannerConfig{ - Enabled: false, - }, - }, nil) - - env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", shardscanner.ScannerWorkflowParams{ - Shards: shardscanner.Shards{ - List: []int{1, 2, 3}, - }, - }) - - s.True(env.IsWorkflowCompleted()) - s.NoError(env.GetWorkflowError()) -} - -func (s *workflowsSuite) TestFixerWorkflow_Success() { - env := s.NewTestWorkflowEnvironment() - corruptedKeys := make([]shardscanner.CorruptedKeysEntry, 30) - for i := 0; i < 30; i++ { - corruptedKeys[i] = shardscanner.CorruptedKeysEntry{ - ShardID: i, - } - } - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, mock.Anything).Return(&shardscanner.FixerCorruptedKeysActivityResult{ - CorruptedKeys: corruptedKeys, - MinShard: common.IntPtr(0), - MaxShard: common.IntPtr(29), - ShardQueryPaginationToken: shardscanner.ShardQueryPaginationToken{ - IsDone: true, - NextShardID: nil, - }, - }, nil) - - enabledFixInvariants := shardscanner.CustomScannerConfig{ - // historically enabled by default - invariant.CollectionHistory.String(): "true", - invariant.CollectionMutableState.String(): "true", - // disabled by default - invariant.CollectionStale.String(): "false", - } - env.OnActivity(shardscanner.ActivityFixerConfig, mock.Anything, shardscanner.FixShardConfigParams{ /* no contents currently */ }).Return(&shardscanner.FixShardConfigResults{ - EnabledInvariants: enabledFixInvariants, - }, nil) - - fixerWorkflowConfigOverwrites := shardscanner.FixerWorkflowConfigOverwrites{ - Concurrency: common.IntPtr(3), - BlobstoreFlushThreshold: common.IntPtr(1000), - ActivityBatchSize: common.IntPtr(5), - } - resolvedFixerWorkflowConfig := shardscanner.ResolvedFixerWorkflowConfig{ - Concurrency: 3, - ActivityBatchSize: 5, - BlobstoreFlushThreshold: 1000, - } - batches := [][]int{ - {0, 3, 6, 9, 12}, - {15, 18, 21, 24, 27}, - {1, 4, 7, 10, 13}, - {16, 19, 22, 25, 28}, - {2, 5, 8, 11, 14}, - {17, 20, 23, 26, 29}, - } - - for _, batch := range batches { - var corruptedKeys []shardscanner.CorruptedKeysEntry - for _, shard := range batch { - corruptedKeys = append(corruptedKeys, shardscanner.CorruptedKeysEntry{ - ShardID: shard, - }) - } - var reports []shardscanner.FixReport - for i, s := range batch { - if i == 0 { - reports = append(reports, shardscanner.FixReport{ - ShardID: s, - Stats: shardscanner.FixStats{ - EntitiesCount: 10, - }, - Result: shardscanner.FixResult{ - ControlFlowFailure: &shardscanner.ControlFlowFailure{ - Info: "got control flow failure", - }, - }, - }) - } else { - reports = append(reports, shardscanner.FixReport{ - ShardID: s, - Stats: shardscanner.FixStats{ - EntitiesCount: 10, - 
FixedCount: 2, - SkippedCount: 1, - FailedCount: 1, - }, - Result: shardscanner.FixResult{ - ShardFixKeys: &shardscanner.FixKeys{ - Skipped: &store.Keys{ - UUID: "skipped_keys", - }, - Failed: &store.Keys{ - UUID: "failed_keys", - }, - Fixed: &store.Keys{ - UUID: "fixed_keys", - }, - }, - }, - }) - } - } - env.OnActivity(shardscanner.ActivityFixShard, mock.Anything, shardscanner.FixShardActivityParams{ - CorruptedKeysEntries: corruptedKeys, - ResolvedFixerWorkflowConfig: resolvedFixerWorkflowConfig, - EnabledInvariants: enabledFixInvariants, - }).Return(reports, nil) - } - - env.ExecuteWorkflow(NewTestFixerWorkflow, shardscanner.FixerWorkflowParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - FixerWorkflowConfigOverwrites: fixerWorkflowConfigOverwrites, - }) - s.True(env.IsWorkflowCompleted()) - s.NoError(env.GetWorkflowError()) - - aggValue, err := env.QueryWorkflow(shardscanner.AggregateReportQuery) - s.NoError(err) - var agg shardscanner.AggregateFixReportResult - s.NoError(aggValue.Get(&agg)) - s.Equal(shardscanner.AggregateFixReportResult{ - EntitiesCount: 240, - FixedCount: 48, - FailedCount: 24, - SkippedCount: 24, - }, agg) - - for i := 0; i < 30; i++ { - shardReportValue, err := env.QueryWorkflow(shardscanner.ShardReportQuery, i) - s.NoError(err) - var shardReport *shardscanner.FixReport - s.NoError(shardReportValue.Get(&shardReport)) - if i == 0 || i == 1 || i == 2 || i == 15 || i == 16 || i == 17 { - s.Equal(&shardscanner.FixReport{ - ShardID: i, - Stats: shardscanner.FixStats{ - EntitiesCount: 10, - }, - Result: shardscanner.FixResult{ - ControlFlowFailure: &shardscanner.ControlFlowFailure{ - Info: "got control flow failure", - }, - }, - }, shardReport) - } else { - s.Equal(&shardscanner.FixReport{ - ShardID: i, - Stats: shardscanner.FixStats{ - EntitiesCount: 10, - FixedCount: 2, - FailedCount: 1, - SkippedCount: 1, - }, - Result: shardscanner.FixResult{ - ShardFixKeys: &shardscanner.FixKeys{ - Skipped: &store.Keys{ - UUID: "skipped_keys", - }, - Failed: &store.Keys{ - UUID: "failed_keys", - }, - Fixed: &store.Keys{ - UUID: "fixed_keys", - }, - }, - }, - }, shardReport) - } - } - - statusValue, err := env.QueryWorkflow(shardscanner.ShardStatusQuery, shardscanner.PaginatedShardQueryRequest{}) - s.NoError(err) - var status *shardscanner.ShardStatusQueryResult - s.NoError(statusValue.Get(&status)) - expected := make(map[int]shardscanner.ShardStatus) - for i := 0; i < 30; i++ { - if i == 0 || i == 1 || i == 2 || i == 15 || i == 16 || i == 17 { - expected[i] = shardscanner.ShardStatusControlFlowFailure - } else { - expected[i] = shardscanner.ShardStatusSuccess - } - } - s.Equal(shardscanner.ShardStatusResult(expected), status.Result) - - // check for paginated query result - statusValue, err = env.QueryWorkflow(shardscanner.ShardStatusQuery, shardscanner.PaginatedShardQueryRequest{ - StartingShardID: common.IntPtr(5), - LimitShards: common.IntPtr(10), - }) - s.NoError(err) - status = &shardscanner.ShardStatusQueryResult{} - s.NoError(statusValue.Get(&status)) - expected = make(map[int]shardscanner.ShardStatus) - for i := 5; i < 15; i++ { - if i == 0 || i == 1 || i == 2 || i == 15 || i == 16 || i == 17 { - expected[i] = shardscanner.ShardStatusControlFlowFailure - } else { - expected[i] = shardscanner.ShardStatusSuccess - } - } - s.Equal(shardscanner.ShardStatusResult(expected), status.Result) - s.False(status.ShardQueryPaginationToken.IsDone) - s.Equal(15, *status.ShardQueryPaginationToken.NextShardID) -} - -func (s *workflowsSuite) 
TestGetCorruptedKeys_Success() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, shardscanner.FixerCorruptedKeysActivityParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - StartingShardID: nil, - }).Return(&shardscanner.FixerCorruptedKeysActivityResult{ - CorruptedKeys: []shardscanner.CorruptedKeysEntry{{ShardID: 1}, {ShardID: 5}, {ShardID: 10}}, - MinShard: common.IntPtr(1), - MaxShard: common.IntPtr(10), - ShardQueryPaginationToken: shardscanner.ShardQueryPaginationToken{ - NextShardID: common.IntPtr(11), - IsDone: false, - }, - }, nil) - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, shardscanner.FixerCorruptedKeysActivityParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - StartingShardID: common.IntPtr(11), - }).Return(&shardscanner.FixerCorruptedKeysActivityResult{ - CorruptedKeys: []shardscanner.CorruptedKeysEntry{{ShardID: 11}, {ShardID: 12}}, - MinShard: common.IntPtr(11), - MaxShard: common.IntPtr(12), - ShardQueryPaginationToken: shardscanner.ShardQueryPaginationToken{ - NextShardID: common.IntPtr(13), - IsDone: false, - }, - }, nil) - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, shardscanner.FixerCorruptedKeysActivityParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - StartingShardID: common.IntPtr(13), - }).Return(&shardscanner.FixerCorruptedKeysActivityResult{ - CorruptedKeys: []shardscanner.CorruptedKeysEntry{{ShardID: 20}, {ShardID: 41}}, - MinShard: common.IntPtr(20), - MaxShard: common.IntPtr(41), - ShardQueryPaginationToken: shardscanner.ShardQueryPaginationToken{ - NextShardID: common.IntPtr(42), - IsDone: false, - }, - }, nil) - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, shardscanner.FixerCorruptedKeysActivityParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - StartingShardID: common.IntPtr(42), - }).Return(&shardscanner.FixerCorruptedKeysActivityResult{ - CorruptedKeys: []shardscanner.CorruptedKeysEntry{}, - MinShard: nil, - MaxShard: nil, - ShardQueryPaginationToken: shardscanner.ShardQueryPaginationToken{ - NextShardID: nil, - IsDone: true, - }, - }, nil) - - env.ExecuteWorkflow(shardscanner.GetCorruptedKeys, shardscanner.FixerWorkflowParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - }) - s.True(env.IsWorkflowCompleted()) - s.NoError(env.GetWorkflowError()) - var result *shardscanner.FixerCorruptedKeysActivityResult - s.NoError(env.GetWorkflowResult(&result)) - s.Equal(&shardscanner.FixerCorruptedKeysActivityResult{ - CorruptedKeys: []shardscanner.CorruptedKeysEntry{ - {ShardID: 1}, - {ShardID: 5}, - {ShardID: 10}, - {ShardID: 11}, - {ShardID: 12}, - {ShardID: 20}, - {ShardID: 41}, - }, - MinShard: common.IntPtr(1), - MaxShard: common.IntPtr(41), - ShardQueryPaginationToken: shardscanner.ShardQueryPaginationToken{ - NextShardID: nil, - IsDone: true, - }, - }, result) -} - -func (s *workflowsSuite) TestGetCorruptedKeys_Error() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, shardscanner.FixerCorruptedKeysActivityParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - StartingShardID: nil, - }).Return(&shardscanner.FixerCorruptedKeysActivityResult{ - CorruptedKeys: []shardscanner.CorruptedKeysEntry{{ShardID: 1}, {ShardID: 5}, {ShardID: 10}}, - MinShard: 
common.IntPtr(1), - MaxShard: common.IntPtr(10), - ShardQueryPaginationToken: shardscanner.ShardQueryPaginationToken{ - NextShardID: common.IntPtr(11), - IsDone: false, - }, - }, nil) - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, shardscanner.FixerCorruptedKeysActivityParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - StartingShardID: common.IntPtr(11), - }).Return(nil, errors.New("got error")) - env.ExecuteWorkflow(shardscanner.GetCorruptedKeys, shardscanner.FixerWorkflowParams{ - ScannerWorkflowWorkflowID: "test_wid", - ScannerWorkflowRunID: "test_rid", - }) - s.True(env.IsWorkflowCompleted()) - s.Error(env.GetWorkflowError()) -} - -func (s *workflowsSuite) TestScannerWorkflow_Failure_CorruptedKeysActivity() { - env := s.NewTestWorkflowEnvironment() - env.OnActivity(shardscanner.ActivityFixerCorruptedKeys, mock.Anything, mock.Anything).Return(nil, errors.New("got error getting corrupted keys")) - env.ExecuteWorkflow(NewTestFixerWorkflow, shardscanner.FixerWorkflowParams{}) - s.True(env.IsWorkflowCompleted()) - s.Equal("got error getting corrupted keys", env.GetWorkflowError().Error()) -} diff --git a/service/worker/scanner/shardscanner/workflows_test.go b/service/worker/scanner/shardscanner/workflows_test.go new file mode 100644 index 00000000000..10065081daa --- /dev/null +++ b/service/worker/scanner/shardscanner/workflows_test.go @@ -0,0 +1,489 @@ +// The MIT License (MIT) +// +// Copyright (c) 2017-2020 Uber Technologies Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +package shardscanner + +import ( + "errors" + "testing" + + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/suite" + "go.uber.org/cadence/testsuite" + "go.uber.org/cadence/workflow" + + "github.com/uber/cadence/common" + "github.com/uber/cadence/common/reconciliation/invariant" + "github.com/uber/cadence/common/reconciliation/store" +) + +type workflowsSuite struct { + suite.Suite + testsuite.WorkflowTestSuite + env *testsuite.TestWorkflowEnvironment +} + +func TestWorkflowsSuite(t *testing.T) { + suite.Run(t, new(workflowsSuite)) +} + +func (s *workflowsSuite) SetupTest() { + s.env = s.WorkflowTestSuite.NewTestWorkflowEnvironment() + s.env.RegisterWorkflow(NewTestWorkflow) + s.env.RegisterWorkflow(NewTestFixerWorkflow) + s.env.RegisterWorkflow(GetCorruptedKeys) +} + +func (s *workflowsSuite) TestScannerWorkflow_Failure_ScanShard() { + s.env.OnActivity(ActivityScannerConfig, mock.Anything, mock.Anything).Return(ResolvedScannerWorkflowConfig{ + GenericScannerConfig: GenericScannerConfig{ + Enabled: true, + Concurrency: 3, + ActivityBatchSize: 5, + }, + }, nil) + shards := Shards{ + Range: &ShardRange{ + Min: 0, + Max: 30, + }, + } + + batches := [][]int{ + {0, 3, 6, 9, 12}, + {15, 18, 21, 24, 27}, + {1, 4, 7, 10, 13}, + {16, 19, 22, 25, 28}, + {2, 5, 8, 11, 14}, + {17, 20, 23, 26, 29}, + } + + for i, batch := range batches { + + var reports []ScanReport + var err error + if i == len(batches)-1 { + reports = nil + err = errors.New("scan shard activity got error") + } else { + err = nil + for _, s := range batch { + reports = append(reports, ScanReport{ + ShardID: s, + Stats: ScanStats{ + EntitiesCount: 10, + }, + Result: ScanResult{ + ControlFlowFailure: &ControlFlowFailure{ + Info: "got control flow failure", + }, + }, + }) + } + } + s.env.OnActivity(ActivityScanShard, mock.Anything, ScanShardActivityParams{ + Shards: batch, + }).Return(reports, err) + } + s.env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", ScannerWorkflowParams{ + Shards: shards, + }) + s.True(s.env.IsWorkflowCompleted()) + s.Equal("scan shard activity got error", s.env.GetWorkflowError().Error()) +} + +func (s *workflowsSuite) TestScannerWorkflow_Failure_ScannerConfigActivity() { + s.env.OnActivity(ActivityScannerConfig, mock.Anything, mock.Anything).Return(ResolvedScannerWorkflowConfig{}, errors.New("got error getting config")) + s.env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", ScannerWorkflowParams{ + Shards: Shards{ + List: []int{1, 2, 3}, + }, + }) + s.True(s.env.IsWorkflowCompleted()) + s.Equal("got error getting config", s.env.GetWorkflowError().Error()) +} + +func (s *workflowsSuite) TestScannerWorkflow_Requires_Name() { + s.env.OnActivity(ActivityScannerConfig, mock.Anything, mock.Anything).Return(ResolvedScannerWorkflowConfig{}, errors.New("got error getting config")) + s.env.ExecuteWorkflow(NewTestWorkflow, "", ScannerWorkflowParams{ + Shards: Shards{ + List: []int{1, 2, 3}, + }, + }) + s.True(s.env.IsWorkflowCompleted()) + s.Equal("workflow name is not provided", s.env.GetWorkflowError().Error()) +} + +func (s *workflowsSuite) TestScannerWorkflow_Requires_Valid_ShardConfig() { + s.env.OnActivity(ActivityScannerConfig, mock.Anything, mock.Anything).Return(ResolvedScannerWorkflowConfig{}, errors.New("got error getting config")) + s.env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", ScannerWorkflowParams{}) + s.True(s.env.IsWorkflowCompleted()) + s.Equal("must provide either List or Range", s.env.GetWorkflowError().Error()) +} + +func (s *workflowsSuite) 
TestScannerWorkflow_Success_Disabled() { + s.env.OnActivity(ActivityScannerConfig, mock.Anything, mock.Anything).Return(ResolvedScannerWorkflowConfig{ + GenericScannerConfig: GenericScannerConfig{ + Enabled: false, + }, + }, nil) + + s.env.ExecuteWorkflow(NewTestWorkflow, "test-workflow", ScannerWorkflowParams{ + Shards: Shards{ + List: []int{1, 2, 3}, + }, + }) + + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) +} + +func (s *workflowsSuite) TestFixerWorkflow_Success() { + corruptedKeys := make([]CorruptedKeysEntry, 30) + for i := 0; i < 30; i++ { + corruptedKeys[i] = CorruptedKeysEntry{ + ShardID: i, + } + } + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, mock.Anything).Return(&FixerCorruptedKeysActivityResult{ + CorruptedKeys: corruptedKeys, + MinShard: common.IntPtr(0), + MaxShard: common.IntPtr(29), + ShardQueryPaginationToken: ShardQueryPaginationToken{ + IsDone: true, + NextShardID: nil, + }, + }, nil) + + enabledFixInvariants := CustomScannerConfig{ + // historically enabled by default + invariant.CollectionHistory.String(): "true", + invariant.CollectionMutableState.String(): "true", + // disabled by default + invariant.CollectionStale.String(): "false", + } + s.env.OnActivity(ActivityFixerConfig, mock.Anything, FixShardConfigParams{ /* no contents currently */ }).Return(&FixShardConfigResults{ + EnabledInvariants: enabledFixInvariants, + }, nil) + + fixerWorkflowConfigOverwrites := FixerWorkflowConfigOverwrites{ + Concurrency: common.IntPtr(3), + BlobstoreFlushThreshold: common.IntPtr(1000), + ActivityBatchSize: common.IntPtr(5), + } + resolvedFixerWorkflowConfig := ResolvedFixerWorkflowConfig{ + Concurrency: 3, + ActivityBatchSize: 5, + BlobstoreFlushThreshold: 1000, + } + batches := [][]int{ + {0, 3, 6, 9, 12}, + {15, 18, 21, 24, 27}, + {1, 4, 7, 10, 13}, + {16, 19, 22, 25, 28}, + {2, 5, 8, 11, 14}, + {17, 20, 23, 26, 29}, + } + + for _, batch := range batches { + var corruptedKeys []CorruptedKeysEntry + for _, shard := range batch { + corruptedKeys = append(corruptedKeys, CorruptedKeysEntry{ + ShardID: shard, + }) + } + var reports []FixReport + for i, s := range batch { + if i == 0 { + reports = append(reports, FixReport{ + ShardID: s, + Stats: FixStats{ + EntitiesCount: 10, + }, + Result: FixResult{ + ControlFlowFailure: &ControlFlowFailure{ + Info: "got control flow failure", + }, + }, + }) + } else { + reports = append(reports, FixReport{ + ShardID: s, + Stats: FixStats{ + EntitiesCount: 10, + FixedCount: 2, + SkippedCount: 1, + FailedCount: 1, + }, + Result: FixResult{ + ShardFixKeys: &FixKeys{ + Skipped: &store.Keys{ + UUID: "skipped_keys", + }, + Failed: &store.Keys{ + UUID: "failed_keys", + }, + Fixed: &store.Keys{ + UUID: "fixed_keys", + }, + }, + }, + }) + } + } + s.env.OnActivity(ActivityFixShard, mock.Anything, FixShardActivityParams{ + CorruptedKeysEntries: corruptedKeys, + ResolvedFixerWorkflowConfig: resolvedFixerWorkflowConfig, + EnabledInvariants: enabledFixInvariants, + }).Return(reports, nil) + } + + s.env.ExecuteWorkflow(NewTestFixerWorkflow, FixerWorkflowParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + FixerWorkflowConfigOverwrites: fixerWorkflowConfigOverwrites, + }) + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) + + aggValue, err := s.env.QueryWorkflow(AggregateReportQuery) + s.NoError(err) + var agg AggregateFixReportResult + s.NoError(aggValue.Get(&agg)) + s.Equal(AggregateFixReportResult{ + EntitiesCount: 240, + FixedCount: 48, + FailedCount: 
24, + SkippedCount: 24, + }, agg) + + for i := 0; i < 30; i++ { + shardReportValue, err := s.env.QueryWorkflow(ShardReportQuery, i) + s.NoError(err) + var shardReport *FixReport + s.NoError(shardReportValue.Get(&shardReport)) + if i == 0 || i == 1 || i == 2 || i == 15 || i == 16 || i == 17 { + s.Equal(&FixReport{ + ShardID: i, + Stats: FixStats{ + EntitiesCount: 10, + }, + Result: FixResult{ + ControlFlowFailure: &ControlFlowFailure{ + Info: "got control flow failure", + }, + }, + }, shardReport) + } else { + s.Equal(&FixReport{ + ShardID: i, + Stats: FixStats{ + EntitiesCount: 10, + FixedCount: 2, + FailedCount: 1, + SkippedCount: 1, + }, + Result: FixResult{ + ShardFixKeys: &FixKeys{ + Skipped: &store.Keys{ + UUID: "skipped_keys", + }, + Failed: &store.Keys{ + UUID: "failed_keys", + }, + Fixed: &store.Keys{ + UUID: "fixed_keys", + }, + }, + }, + }, shardReport) + } + } + + statusValue, err := s.env.QueryWorkflow(ShardStatusQuery, PaginatedShardQueryRequest{}) + s.NoError(err) + var status *ShardStatusQueryResult + s.NoError(statusValue.Get(&status)) + expected := make(map[int]ShardStatus) + for i := 0; i < 30; i++ { + if i == 0 || i == 1 || i == 2 || i == 15 || i == 16 || i == 17 { + expected[i] = ShardStatusControlFlowFailure + } else { + expected[i] = ShardStatusSuccess + } + } + s.Equal(ShardStatusResult(expected), status.Result) + + // check for paginated query result + statusValue, err = s.env.QueryWorkflow(ShardStatusQuery, PaginatedShardQueryRequest{ + StartingShardID: common.IntPtr(5), + LimitShards: common.IntPtr(10), + }) + s.NoError(err) + status = &ShardStatusQueryResult{} + s.NoError(statusValue.Get(&status)) + expected = make(map[int]ShardStatus) + for i := 5; i < 15; i++ { + if i == 0 || i == 1 || i == 2 || i == 15 || i == 16 || i == 17 { + expected[i] = ShardStatusControlFlowFailure + } else { + expected[i] = ShardStatusSuccess + } + } + s.Equal(ShardStatusResult(expected), status.Result) + s.False(status.ShardQueryPaginationToken.IsDone) + s.Equal(15, *status.ShardQueryPaginationToken.NextShardID) +} + +func (s *workflowsSuite) TestGetCorruptedKeys_Success() { + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, FixerCorruptedKeysActivityParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + StartingShardID: nil, + }).Return(&FixerCorruptedKeysActivityResult{ + CorruptedKeys: []CorruptedKeysEntry{{ShardID: 1}, {ShardID: 5}, {ShardID: 10}}, + MinShard: common.IntPtr(1), + MaxShard: common.IntPtr(10), + ShardQueryPaginationToken: ShardQueryPaginationToken{ + NextShardID: common.IntPtr(11), + IsDone: false, + }, + }, nil) + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, FixerCorruptedKeysActivityParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + StartingShardID: common.IntPtr(11), + }).Return(&FixerCorruptedKeysActivityResult{ + CorruptedKeys: []CorruptedKeysEntry{{ShardID: 11}, {ShardID: 12}}, + MinShard: common.IntPtr(11), + MaxShard: common.IntPtr(12), + ShardQueryPaginationToken: ShardQueryPaginationToken{ + NextShardID: common.IntPtr(13), + IsDone: false, + }, + }, nil) + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, FixerCorruptedKeysActivityParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + StartingShardID: common.IntPtr(13), + }).Return(&FixerCorruptedKeysActivityResult{ + CorruptedKeys: []CorruptedKeysEntry{{ShardID: 20}, {ShardID: 41}}, + MinShard: common.IntPtr(20), + MaxShard: common.IntPtr(41), + ShardQueryPaginationToken: 
ShardQueryPaginationToken{ + NextShardID: common.IntPtr(42), + IsDone: false, + }, + }, nil) + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, FixerCorruptedKeysActivityParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + StartingShardID: common.IntPtr(42), + }).Return(&FixerCorruptedKeysActivityResult{ + CorruptedKeys: []CorruptedKeysEntry{}, + MinShard: nil, + MaxShard: nil, + ShardQueryPaginationToken: ShardQueryPaginationToken{ + NextShardID: nil, + IsDone: true, + }, + }, nil) + + s.env.ExecuteWorkflow(GetCorruptedKeys, FixerWorkflowParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + }) + s.True(s.env.IsWorkflowCompleted()) + s.NoError(s.env.GetWorkflowError()) + var result *FixerCorruptedKeysActivityResult + s.NoError(s.env.GetWorkflowResult(&result)) + s.Equal(&FixerCorruptedKeysActivityResult{ + CorruptedKeys: []CorruptedKeysEntry{ + {ShardID: 1}, + {ShardID: 5}, + {ShardID: 10}, + {ShardID: 11}, + {ShardID: 12}, + {ShardID: 20}, + {ShardID: 41}, + }, + MinShard: common.IntPtr(1), + MaxShard: common.IntPtr(41), + ShardQueryPaginationToken: ShardQueryPaginationToken{ + NextShardID: nil, + IsDone: true, + }, + }, result) +} + +func (s *workflowsSuite) TestGetCorruptedKeys_Error() { + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, FixerCorruptedKeysActivityParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + StartingShardID: nil, + }).Return(&FixerCorruptedKeysActivityResult{ + CorruptedKeys: []CorruptedKeysEntry{{ShardID: 1}, {ShardID: 5}, {ShardID: 10}}, + MinShard: common.IntPtr(1), + MaxShard: common.IntPtr(10), + ShardQueryPaginationToken: ShardQueryPaginationToken{ + NextShardID: common.IntPtr(11), + IsDone: false, + }, + }, nil) + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, FixerCorruptedKeysActivityParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + StartingShardID: common.IntPtr(11), + }).Return(nil, errors.New("got error")) + s.env.ExecuteWorkflow(GetCorruptedKeys, FixerWorkflowParams{ + ScannerWorkflowWorkflowID: "test_wid", + ScannerWorkflowRunID: "test_rid", + }) + s.True(s.env.IsWorkflowCompleted()) + s.Error(s.env.GetWorkflowError()) +} + +func (s *workflowsSuite) TestScannerWorkflow_Failure_CorruptedKeysActivity() { + s.env.OnActivity(ActivityFixerCorruptedKeys, mock.Anything, mock.Anything).Return(nil, errors.New("got error getting corrupted keys")) + s.env.ExecuteWorkflow(NewTestFixerWorkflow, FixerWorkflowParams{}) + s.True(s.env.IsWorkflowCompleted()) + s.Equal("got error getting corrupted keys", s.env.GetWorkflowError().Error()) +} + +func NewTestWorkflow(ctx workflow.Context, name string, params ScannerWorkflowParams) error { + wf, err := NewScannerWorkflow(ctx, name, params) + if err != nil { + return err + } + + return wf.Start(ctx) +} + +func NewTestFixerWorkflow(ctx workflow.Context, params FixerWorkflowParams) error { + wf, err := NewFixerWorkflow(ctx, "test-fixer", params) + if err != nil { + return err + } + + return wf.Start(ctx) + +} From 1e8c3743f616bbc9852730171cb1321a80095dc8 Mon Sep 17 00:00:00 2001 From: Abhishek Jha Date: Tue, 7 May 2024 14:22:53 -0700 Subject: [PATCH 10/15] Change after review --- common/domain/replication_queue_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/domain/replication_queue_test.go b/common/domain/replication_queue_test.go index 475fb0a1dcb..a8e33c76a5f 100644 --- a/common/domain/replication_queue_test.go +++ 
b/common/domain/replication_queue_test.go @@ -730,8 +730,8 @@ func TestReplicationQueueImpl_purgeProcessor(t *testing.T) { rq.Stop() select { case <-done: - // Pass if the goroutine exits - case <-time.After(1 * time.Millisecond): + // Pass if the goroutine exitsit + case <-time.After(10 * time.Millisecond): t.Error("purgeProcessor did not stop within expected time") } } From c8da432e1a69426fcba69e0f3d7fce6f727783f0 Mon Sep 17 00:00:00 2001 From: Abhishek Jha Date: Tue, 7 May 2024 14:25:58 -0700 Subject: [PATCH 11/15] comment spell check --- common/domain/replication_queue_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/domain/replication_queue_test.go b/common/domain/replication_queue_test.go index a8e33c76a5f..6338350f4ea 100644 --- a/common/domain/replication_queue_test.go +++ b/common/domain/replication_queue_test.go @@ -730,7 +730,7 @@ func TestReplicationQueueImpl_purgeProcessor(t *testing.T) { rq.Stop() select { case <-done: - // Pass if the goroutine exitsit + // Pass if the goroutine exits case <-time.After(10 * time.Millisecond): t.Error("purgeProcessor did not stop within expected time") } From ae89f34edc98256db560e3db4333fddcc6be855e Mon Sep 17 00:00:00 2001 From: Abhishek Jha Date: Tue, 7 May 2024 22:53:12 -0700 Subject: [PATCH 12/15] review changes --- common/domain/replication_queue_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/domain/replication_queue_test.go b/common/domain/replication_queue_test.go index 6338350f4ea..2c1796b9fca 100644 --- a/common/domain/replication_queue_test.go +++ b/common/domain/replication_queue_test.go @@ -715,14 +715,14 @@ func TestReplicationQueueImpl_purgeProcessor(t *testing.T) { ctrl := gomock.NewController(t) mockQueue := persistence.NewMockQueueManager(ctrl) rq := NewReplicationQueue(mockQueue, "testCluster", nil, nil).(*replicationQueueImpl) - atomic.StoreInt32(&rq.status, common.DaemonStatusStarted) + atomic.StoreInt32(&rq.status, common.DaemonStatusInitialized) done := make(chan bool) mockQueue.EXPECT().GetAckLevels(gomock.Any()).Return(map[string]int64{}, nil).AnyTimes() mockQueue.EXPECT().DeleteMessagesBefore(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() go func() { - rq.purgeProcessor() + rq.Start() close(done) }() From 7f63f7ead7c93ede0507a746ebb5bdb9f41d32d6 Mon Sep 17 00:00:00 2001 From: Abhishek Jha Date: Tue, 7 May 2024 22:57:40 -0700 Subject: [PATCH 13/15] Update common/domain/replication_queue_test.go Co-authored-by: Steven L --- common/domain/replication_queue_test.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/common/domain/replication_queue_test.go b/common/domain/replication_queue_test.go index 2c1796b9fca..687173fd680 100644 --- a/common/domain/replication_queue_test.go +++ b/common/domain/replication_queue_test.go @@ -75,12 +75,11 @@ func TestReplicationQueueImpl_Start(t *testing.T) { assert.Equal(t, tt.expectedStatus, atomic.LoadInt32(&rq.status)) if tt.shouldStart { - time.Sleep(1 * time.Nanosecond) select { case <-rq.done: t.Error("purgeProcessor should not have stopped") - default: - // expected no action + case <-time.After(time.Millisecond): + // expected, as the purgeProcessor should still be running } } }) From 4825cd98c23bc4b94dbfb09ecb13e1929ae4a527 Mon Sep 17 00:00:00 2001 From: Abhishek Jha Date: Tue, 7 May 2024 22:59:20 -0700 Subject: [PATCH 14/15] removed sleep --- common/domain/replication_queue_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/common/domain/replication_queue_test.go 
b/common/domain/replication_queue_test.go
index 687173fd680..8dfadb6fef9 100644
--- a/common/domain/replication_queue_test.go
+++ b/common/domain/replication_queue_test.go
@@ -725,7 +725,6 @@ func TestReplicationQueueImpl_purgeProcessor(t *testing.T) {
 		close(done)
 	}()
 
-	time.Sleep(1 * time.Nanosecond)
 	rq.Stop()
 	select {
 	case <-done:
 		// Pass if the goroutine exits
 	case <-time.After(10 * time.Millisecond):
 		t.Error("purgeProcessor did not stop within expected time")
 	}
 }

From b9d24b3a5d042905e040c3b00fe6d907186330ae Mon Sep 17 00:00:00 2001
From: Abhishek Jha
Date: Wed, 5 Jun 2024 11:16:48 -0700
Subject: [PATCH 15/15] Update replication_queue_test.go

---
 common/domain/replication_queue_test.go | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/common/domain/replication_queue_test.go b/common/domain/replication_queue_test.go
index 8dfadb6fef9..9fd7174aab1 100644
--- a/common/domain/replication_queue_test.go
+++ b/common/domain/replication_queue_test.go
@@ -709,27 +709,3 @@ func TestPurgeAckedMessages(t *testing.T) {
 		})
 	}
 }
-
-func TestReplicationQueueImpl_purgeProcessor(t *testing.T) {
-	ctrl := gomock.NewController(t)
-	mockQueue := persistence.NewMockQueueManager(ctrl)
-	rq := NewReplicationQueue(mockQueue, "testCluster", nil, nil).(*replicationQueueImpl)
-	atomic.StoreInt32(&rq.status, common.DaemonStatusInitialized)
-
-	done := make(chan bool)
-	mockQueue.EXPECT().GetAckLevels(gomock.Any()).Return(map[string]int64{}, nil).AnyTimes()
-	mockQueue.EXPECT().DeleteMessagesBefore(gomock.Any(), gomock.Any()).Return(nil).AnyTimes()
-
-	go func() {
-		rq.Start()
-		close(done)
-	}()
-
-	rq.Stop()
-	select {
-	case <-done:
-		// Pass if the goroutine exits
-	case <-time.After(10 * time.Millisecond):
-		t.Error("purgeProcessor did not stop within expected time")
-	}
-}
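
The replication-queue patches above (10 through 15) all circle the same testing question: how to assert that a daemon's background goroutine actually exits after Stop without leaning on time.Sleep, before the series finally drops the purgeProcessor test altogether. The sketch below shows that shutdown-assertion pattern in isolation. It is not Cadence code: toyDaemon, its fields, the package name, and the timeout values are invented for illustration; the only point it makes is that bounding the wait with a select on a completion channel plus time.After keeps the test deterministic while still failing fast if the goroutine leaks.

// daemonpattern_test.go -- a self-contained, hypothetical _test.go sketch.
package daemonpattern

import (
	"sync/atomic"
	"testing"
	"time"
)

// toyDaemon is a hypothetical stand-in for any component with Start/Stop
// semantics and one background goroutine; none of these names exist in Cadence.
type toyDaemon struct {
	status  int32
	done    chan struct{} // closed by Stop to request shutdown
	stopped chan struct{} // closed by the background loop when it returns
}

func newToyDaemon() *toyDaemon {
	return &toyDaemon{
		done:    make(chan struct{}),
		stopped: make(chan struct{}),
	}
}

func (d *toyDaemon) Start() {
	if !atomic.CompareAndSwapInt32(&d.status, 0, 1) {
		return // already started
	}
	go d.loop()
}

func (d *toyDaemon) Stop() {
	if !atomic.CompareAndSwapInt32(&d.status, 1, 2) {
		return // never started or already stopped
	}
	close(d.done)
}

func (d *toyDaemon) loop() {
	defer close(d.stopped)
	ticker := time.NewTicker(time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-d.done:
			return // exit promptly once Stop has closed the channel
		case <-ticker.C:
			// periodic work (purging acked messages, etc.) would go here
		}
	}
}

func TestToyDaemon_StopTerminatesLoop(t *testing.T) {
	d := newToyDaemon()
	d.Start()
	d.Stop()

	// Bound the wait instead of sleeping: the test passes as soon as the
	// loop exits and fails quickly if the goroutine leaks.
	select {
	case <-d.stopped:
	case <-time.After(100 * time.Millisecond):
		t.Fatal("background loop did not exit after Stop")
	}
}

Compared with sleeping for a fixed duration before asserting, the bounded select tolerates CI scheduling jitter and states the intended contract (Stop must cause the loop to return) directly in the test.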