From 62f41d8c9643628973251d190cdf88f77b835135 Mon Sep 17 00:00:00 2001 From: Christie Wilson Date: Thu, 16 Apr 2020 11:40:08 -0400 Subject: [PATCH] =?UTF-8?q?Only=20mark=20PipelineRun=20as=20cancelled=20if?= =?UTF-8?q?=20actually=20cancelled=20=F0=9F=9B=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on #2369 (flakey tests around cancellation that actually revealed underlying bugs) we were running into a case where trying to cancel a PipelineRun's TaskRuns was failing (due to a race condition). In that case, the PipelineRun would be marked as cancelled and done, even though the PipelineRun was actually still executing in that one or more TaskRuns were actually still running. Now when that happens we will indicate that the PipelineRun is still running and return an error, which will mean that on the next reconcile, the Reconciler will try to reconcile again, and the PipelineRun conditions will reflect what is actually happening. Co-authored-by: Sharon Jerop Kipruto Fixes #2381 --- pkg/reconciler/pipelinerun/cancel.go | 30 ++++++---- pkg/reconciler/pipelinerun/pipelinerun.go | 6 ++ .../pipelinerun/pipelinerun_test.go | 57 ++++++++++++++++++- 3 files changed, 82 insertions(+), 11 deletions(-) diff --git a/pkg/reconciler/pipelinerun/cancel.go b/pkg/reconciler/pipelinerun/cancel.go index aebca6277d8..7d58dab9e9e 100644 --- a/pkg/reconciler/pipelinerun/cancel.go +++ b/pkg/reconciler/pipelinerun/cancel.go @@ -35,14 +35,6 @@ import ( // cancelPipelineRun marks the PipelineRun as cancelled and any resolved TaskRun(s) too. func cancelPipelineRun(logger *zap.SugaredLogger, pr *v1alpha1.PipelineRun, clientSet clientset.Interface) error { - pr.Status.SetCondition(&apis.Condition{ - Type: apis.ConditionSucceeded, - Status: corev1.ConditionFalse, - Reason: "PipelineRunCancelled", - Message: fmt.Sprintf("PipelineRun %q was cancelled", pr.Name), - }) - // update pr completed time - pr.Status.CompletionTime = &metav1.Time{Time: time.Now()} errs := []string{} // Use Patch to update the TaskRuns since the TaskRun controller may be operating on the @@ -62,8 +54,26 @@ func cancelPipelineRun(logger *zap.SugaredLogger, pr *v1alpha1.PipelineRun, clie continue } } - if len(errs) > 0 { - return fmt.Errorf("error(s) from cancelling TaskRun(s) from PipelineRun %s: %s", pr.Name, strings.Join(errs, "\n")) + // If we successfully cancelled all the TaskRuns, we can consider the PipelineRun cancelled. + if len(errs) == 0 { + pr.Status.SetCondition(&apis.Condition{ + Type: apis.ConditionSucceeded, + Status: corev1.ConditionFalse, + Reason: ReasonCancelled, + Message: fmt.Sprintf("PipelineRun %q was cancelled", pr.Name), + }) + // update pr completed time + pr.Status.CompletionTime = &metav1.Time{Time: time.Now()} + } else { + e := strings.Join(errs, "\n") + // Indicate that we failed to cancel the PipelineRun + pr.Status.SetCondition(&apis.Condition{ + Type: apis.ConditionSucceeded, + Status: corev1.ConditionUnknown, + Reason: ReasonCouldntCancel, + Message: fmt.Sprintf("PipelineRun %q was cancelled but had errors trying to cancel TaskRuns: %s", pr.Name, e), + }) + return fmt.Errorf("error(s) from cancelling TaskRun(s) from PipelineRun %s: %s", pr.Name, e) } return nil } diff --git a/pkg/reconciler/pipelinerun/pipelinerun.go b/pkg/reconciler/pipelinerun/pipelinerun.go index 6d261a1a5b3..44f3f93587d 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun.go +++ b/pkg/reconciler/pipelinerun/pipelinerun.go @@ -82,6 +82,12 @@ const ( // ReasonInvalidGraph indicates that the reason for the failure status is that the // associated Pipeline is an invalid graph (a.k.a wrong order, cycle, …) ReasonInvalidGraph = "PipelineInvalidGraph" + // ReasonCancelled indicates that a PipelineRun was cancelled. + ReasonCancelled = "PipelineRunCancelled" + // ReasonCouldntCancel indicates that a PipelineRun was cancelled but attempting to update + // all of the running TaskRuns as cancelled failed. + ReasonCouldntCancel = "PipelineRunCouldntCancel" + // pipelineRunAgentName defines logging agent name for PipelineRun Controller pipelineRunAgentName = "pipeline-controller" diff --git a/pkg/reconciler/pipelinerun/pipelinerun_test.go b/pkg/reconciler/pipelinerun/pipelinerun_test.go index 302a3ed35c5..28837bb86ed 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun_test.go +++ b/pkg/reconciler/pipelinerun/pipelinerun_test.go @@ -38,6 +38,8 @@ import ( test "github.com/tektoncd/pipeline/test/v1alpha1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + k8stesting "k8s.io/client-go/testing" ktesting "k8s.io/client-go/testing" "knative.dev/pkg/apis" duckv1beta1 "knative.dev/pkg/apis/duck/v1beta1" @@ -1020,6 +1022,59 @@ func TestReconcileWithoutPVC(t *testing.T) { } } +func TestReconcileCancelledFailsTaskRunCancellation(t *testing.T) { + names.TestingSeed() + ptName := "hello-world-1" + prName := "test-pipeline-run-with-timeout" + prs := []*v1alpha1.PipelineRun{tb.PipelineRun(prName, tb.PipelineRunNamespace("foo"), + tb.PipelineRunSpec("test-pipeline", + tb.PipelineRunCancelled, + ), + // The reconciler uses the presense of this TaskRun in the status to determine that a TaskRun + // is already running. The TaskRun will not be retrieved at all so we do not need to seed one. + tb.PipelineRunStatus( + tb.PipelineRunTaskRunsStatus(prName+ptName, &v1alpha1.PipelineRunTaskRunStatus{ + PipelineTaskName: ptName, + Status: &v1alpha1.TaskRunStatus{}, + }), + ), + )} + + d := test.Data{ + PipelineRuns: prs, + } + + testAssets, cancel := getPipelineRunController(t, d) + defer cancel() + c := testAssets.Controller + clients := testAssets.Clients + + // Make the patch call fail, i.e. make it so that the controller fails to cancel the TaskRun + clients.Pipeline.PrependReactor("patch", "taskruns", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, nil, fmt.Errorf("i'm sorry Dave, i'm afraid i can't do that") + }) + + err := c.Reconciler.Reconcile(context.Background(), "foo/test-pipeline-run-with-timeout") + if err == nil { + t.Errorf("Expected to see error returned from reconcile after failing to cancel TaskRun but saw none!") + } + + // Check that the PipelineRun is still running with correct error message + reconciledRun, err := clients.Pipeline.TektonV1alpha1().PipelineRuns("foo").Get("test-pipeline-run-with-timeout", metav1.GetOptions{}) + if err != nil { + t.Fatalf("Somehow had error getting reconciled run out of fake client: %s", err) + } + + // The PipelineRun should not be cancelled b/c we couldn't cancel the TaskRun + condition := reconciledRun.Status.GetCondition(apis.ConditionSucceeded) + if !condition.IsUnknown() { + t.Errorf("Expected PipelineRun to still be running since the TaskRun could not be cancelled but succeded condition is %v", condition.Status) + } + if condition.Reason != ReasonCouldntCancel { + t.Errorf("Expected PipelineRun condition to indicate the cancellation failed but reason was %s", condition.Reason) + } +} + func TestReconcileCancelledPipelineRun(t *testing.T) { ps := []*v1alpha1.Pipeline{tb.Pipeline("test-pipeline", tb.PipelineNamespace("foo"), tb.PipelineSpec( tb.PipelineTask("hello-world-1", "hello-world", tb.Retries(1)), @@ -1054,7 +1109,7 @@ func TestReconcileCancelledPipelineRun(t *testing.T) { } // The PipelineRun should be still cancelled. - if reconciledRun.Status.GetCondition(apis.ConditionSucceeded).Reason != "PipelineRunCancelled" { + if reconciledRun.Status.GetCondition(apis.ConditionSucceeded).Reason != ReasonCancelled { t.Errorf("Expected PipelineRun to be cancelled, but condition reason is %s", reconciledRun.Status.GetCondition(apis.ConditionSucceeded)) }