Skip to content

Commit

Permalink
[Wf-Diagnostics] Incorporate retry diagnostics in workflow diagnostic…
Browse files Browse the repository at this point in the history
…s workflow (#6532)

* [Wf-Diagnostics] Incorporate retry diagnostics in workflow diagnostics workflow

* lint updates
  • Loading branch information
sankari165 authored Dec 3, 2024
1 parent 5d60abd commit d51adac
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 0 deletions.
10 changes: 10 additions & 0 deletions service/worker/diagnostics/activities.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/uber/cadence/service/worker/diagnostics/analytics"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand Down Expand Up @@ -82,6 +83,15 @@ func (w *dw) identifyIssues(ctx context.Context, info identifyIssuesParams) ([]i
}
result = append(result, failureIssues...)

retryInvariant := retry.NewInvariant(retry.Params{
WorkflowExecutionHistory: info.History,
})
retryIssues, err := retryInvariant.Check(ctx)
if err != nil {
return nil, err
}
result = append(result, retryIssues...)

return result, nil
}

Expand Down
22 changes: 22 additions & 0 deletions service/worker/diagnostics/activities_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"github.com/uber/cadence/service/worker/diagnostics/analytics"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand Down Expand Up @@ -85,6 +86,10 @@ func Test__identifyIssues(t *testing.T) {
ActivityScheduled: &types.ActivityTaskScheduledEventAttributes{
ActivityID: "101",
ActivityType: &types.ActivityType{Name: "test-activity"},
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
},
ActivityStarted: &types.ActivityTaskStartedEventAttributes{
Identity: "localhost",
Expand All @@ -93,6 +98,14 @@ func Test__identifyIssues(t *testing.T) {
}
actMetadataInBytes, err := json.Marshal(actMetadata)
require.NoError(t, err)
retryMetadata := retry.RetryMetadata{
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
}
retryMetadataInBytes, err := json.Marshal(retryMetadata)
require.NoError(t, err)
expectedResult := []invariant.InvariantCheckResult{
{
InvariantType: timeout.TimeoutTypeExecution.String(),
Expand All @@ -104,6 +117,11 @@ func Test__identifyIssues(t *testing.T) {
Reason: failure.GenericError.String(),
Metadata: actMetadataInBytes,
},
{
InvariantType: retry.ActivityRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadataInBytes,
},
}
result, err := dwtest.identifyIssues(context.Background(), identifyIssuesParams{History: testWorkflowExecutionHistoryResponse()})
require.NoError(t, err)
Expand Down Expand Up @@ -219,6 +237,10 @@ func testWorkflowExecutionHistoryResponse() *types.GetWorkflowExecutionHistoryRe
ActivityTaskScheduledEventAttributes: &types.ActivityTaskScheduledEventAttributes{
ActivityID: "101",
ActivityType: &types.ActivityType{Name: "test-activity"},
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
},
},
{
Expand Down
40 changes: 40 additions & 0 deletions service/worker/diagnostics/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand All @@ -54,6 +55,7 @@ type DiagnosticsWorkflowInput struct {
type DiagnosticsWorkflowResult struct {
Timeouts *timeoutDiagnostics
Failures *failureDiagnostics
Retries *retryDiagnostics
}

type timeoutDiagnostics struct {
Expand Down Expand Up @@ -89,6 +91,17 @@ type failuresIssuesResult struct {
Metadata failure.FailureMetadata
}

type retryDiagnostics struct {
Issues []*retryIssuesResult
Runbooks []string
}

type retryIssuesResult struct {
InvariantType string
Reason string
Metadata retry.RetryMetadata
}

func (w *dw) DiagnosticsWorkflow(ctx workflow.Context, params DiagnosticsWorkflowInput) (*DiagnosticsWorkflowResult, error) {
scope := w.metricsClient.Scope(metrics.DiagnosticsWorkflowScope, metrics.DomainTag(params.Domain))
scope.IncCounter(metrics.DiagnosticsWorkflowStartedCount)
Expand All @@ -97,6 +110,7 @@ func (w *dw) DiagnosticsWorkflow(ctx workflow.Context, params DiagnosticsWorkflo

var timeoutsResult timeoutDiagnostics
var failureResult failureDiagnostics
var retryResult retryDiagnostics
var checkResult []invariant.InvariantCheckResult
var rootCauseResult []invariant.InvariantRootCauseResult

Expand Down Expand Up @@ -158,10 +172,17 @@ func (w *dw) DiagnosticsWorkflow(ctx workflow.Context, params DiagnosticsWorkflo
failureResult.RootCause = retrieveFailureRootCause(rootCauseResult)
failureResult.Runbooks = []string{linkToFailuresRunbook}

retryIssues, err := retrieveRetryIssues(checkResult)
if err != nil {
return nil, fmt.Errorf("RetrieveRetryIssues: %w", err)
}
retryResult.Issues = retryIssues

scope.IncCounter(metrics.DiagnosticsWorkflowSuccess)
return &DiagnosticsWorkflowResult{
Timeouts: &timeoutsResult,
Failures: &failureResult,
Retries: &retryResult,
}, nil
}

Expand Down Expand Up @@ -276,6 +297,25 @@ func retrieveFailureRootCause(rootCause []invariant.InvariantRootCauseResult) []
return result
}

func retrieveRetryIssues(issues []invariant.InvariantCheckResult) ([]*retryIssuesResult, error) {
result := make([]*retryIssuesResult, 0)
for _, issue := range issues {
if issue.InvariantType == retry.WorkflowRetryIssue.String() || issue.InvariantType == retry.WorkflowRetryInfo.String() || issue.InvariantType == retry.ActivityRetryIssue.String() {
var data retry.RetryMetadata
err := json.Unmarshal(issue.Metadata, &data)
if err != nil {
return nil, err
}
result = append(result, &retryIssuesResult{
InvariantType: issue.InvariantType,
Reason: issue.Reason,
Metadata: data,
})
}
}
return result, nil
}

func rootCauseHeartBeatRelated(rootCause invariant.RootCause) bool {
for _, rc := range []invariant.RootCause{invariant.RootCauseTypeNoHeartBeatTimeoutNoRetryPolicy,
invariant.RootCauseTypeHeartBeatingNotEnabledWithRetryPolicy,
Expand Down
39 changes: 39 additions & 0 deletions service/worker/diagnostics/workflow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import (
"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand Down Expand Up @@ -318,3 +319,41 @@ func (s *diagnosticsWorkflowTestSuite) Test__retrieveFailureIssues() {
s.NoError(err)
s.Equal(failureIssues, result)
}

func (s *diagnosticsWorkflowTestSuite) Test__retrieveRetryIssues() {
retryMetadata := retry.RetryMetadata{
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
}
retryMetadataInBytes, err := json.Marshal(retryMetadata)
s.NoError(err)
issues := []invariant.InvariantCheckResult{
{
InvariantType: retry.ActivityRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadataInBytes,
},
{
InvariantType: retry.WorkflowRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadataInBytes,
},
}
retryIssues := []*retryIssuesResult{
{
InvariantType: retry.ActivityRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadata,
},
{
InvariantType: retry.WorkflowRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadata,
},
}
result, err := retrieveRetryIssues(issues)
s.NoError(err)
s.Equal(retryIssues, result)
}

0 comments on commit d51adac

Please sign in to comment.