diff --git a/engine/service/types.go b/engine/service/types.go index 4edd404078..d554933bf3 100644 --- a/engine/service/types.go +++ b/engine/service/types.go @@ -67,6 +67,7 @@ type HatcheryCommonConfiguration struct { ExtraValue string `toml:"extraValue" comment:"value for extraKey field. For many keys: valueaaa,valuebbb" json:"-"` } `toml:"graylog" json:"graylog"` } `toml:"workerLogsOptions" comment:"Worker Log Configuration" json:"workerLogsOptions"` + MaxAttemptsNumberBeforeFailure int `toml:"maxAttemptsNumberBeforeFailure" default:"5" commented:"true" comment:"Maximum attempts to start a same job. -1 to disable failing jobs when to many attempts" json:"maxAttemptsNumberBeforeFailure"` } `toml:"provision" json:"provision"` LogOptions struct { SpawnOptions struct { diff --git a/sdk/error.go b/sdk/error.go index 51833cb38c..bbd4b3eb87 100644 --- a/sdk/error.go +++ b/sdk/error.go @@ -200,6 +200,7 @@ var ( ErrConflictData = Error{ID: 192, Status: http.StatusConflict} ErrWebsocketUpgrade = Error{ID: 193, Status: http.StatusUpgradeRequired} ErrMFARequired = Error{ID: 194, Status: http.StatusForbidden} + ErrHatcheryNoResourceAvailable = Error{ID: 195, Status: http.StatusInternalServerError} ) var errorsAmericanEnglish = map[int]string{ @@ -383,6 +384,7 @@ var errorsAmericanEnglish = map[int]string{ ErrConflictData.ID: "Data conflict", ErrWebsocketUpgrade.ID: "Websocket upgrade required", ErrMFARequired.ID: "Multi factor authentication is required", + ErrHatcheryNoResourceAvailable.ID: "No enough resource available to start worker", } // Error type. diff --git a/sdk/hatchery/hatchery.go b/sdk/hatchery/hatchery.go index fd23d64db1..8dadf4c4a4 100644 --- a/sdk/hatchery/hatchery.go +++ b/sdk/hatchery/hatchery.go @@ -20,12 +20,12 @@ import ( var ( // Client is a CDS Client - Client cdsclient.HTTPClient - defaultMaxProvisioning = 10 - models []sdk.Model - MaxAttemptsNumberBeforeFailure = 5 - CacheSpawnIDsTTL = 10 * time.Second - CacheNbAttemptsIDsTTL = 1 * time.Hour + Client cdsclient.HTTPClient + defaultMaxProvisioning = 10 + models []sdk.Model + defaultMaxAttemptsNumberBeforeFailure = 5 + CacheSpawnIDsTTL = 10 * time.Second + CacheNbAttemptsIDsTTL = 1 * time.Hour ) type CacheNbAttemptsJobIDs struct { @@ -87,7 +87,7 @@ func Create(ctx context.Context, h Interface) error { // Create a cache to keep in memory the jobID processed in the last 10s. cacheSpawnIDs := cache.New(CacheSpawnIDsTTL, 2*CacheSpawnIDsTTL) - // Create a cache to only process each jobID only a number of attenmpts before force to fail the job + // Create a cache to only process each jobID only a number of attempts before force to fail the job cacheNbAttemptsIDs := &CacheNbAttemptsJobIDs{ cache: cache.New(CacheNbAttemptsIDsTTL, 2*CacheNbAttemptsIDsTTL), } @@ -226,26 +226,6 @@ func Create(ctx context.Context, h Interface) error { continue } - //Check if we already try to start a worker for this job - nbAttempts := cacheNbAttemptsIDs.NewAttempt(j.ID) - if nbAttempts > MaxAttemptsNumberBeforeFailure { - if err := h.CDSClient(). - QueueSendResult(ctx, - j.ID, - sdk.Result{ - ID: j.ID, - BuildID: j.ID, - Status: sdk.StatusFail, - RemoteTime: time.Now(), - Reason: fmt.Sprintf("hatchery %q failed to start worker after %d attempts", h.Configuration().Name, MaxAttemptsNumberBeforeFailure), - }); err != nil { - log.ErrorWithStackTrace(ctx, err) - } - log.Info(ctx, "hatchery %q failed to start worker after %d attempts", h.Configuration().Name, MaxAttemptsNumberBeforeFailure) - endTrace("maximum attempts") - continue - } - workerRequest := workerStarterRequest{ ctx: currentCtx, cancel: endTrace, @@ -313,6 +293,32 @@ func Create(ctx context.Context, h Interface) error { } } + // Check if we already try to start a worker for this job + maxAttemptsNumberBeforeFailure := h.Configuration().Provision.MaxAttemptsNumberBeforeFailure + if maxAttemptsNumberBeforeFailure > -1 { + nbAttempts := cacheNbAttemptsIDs.NewAttempt(j.ID) + if maxAttemptsNumberBeforeFailure == 0 { + maxAttemptsNumberBeforeFailure = defaultMaxAttemptsNumberBeforeFailure + } + if nbAttempts > maxAttemptsNumberBeforeFailure { + if err := h.CDSClient(). + QueueSendResult(ctx, + j.ID, + sdk.Result{ + ID: j.ID, + BuildID: j.ID, + Status: sdk.StatusFail, + RemoteTime: time.Now(), + Reason: fmt.Sprintf("hatchery %q failed to start worker after %d attempts", h.Configuration().Name, maxAttemptsNumberBeforeFailure), + }); err != nil { + log.ErrorWithStackTrace(ctx, err) + } + log.Info(ctx, "hatchery %q failed to start worker after %d attempts", h.Configuration().Name, maxAttemptsNumberBeforeFailure) + endTrace("maximum attempts") + continue + } + } + //Ask to start log.Debug(ctx, "hatchery> Request a worker for job %d (%.3f seconds elapsed)", j.ID, time.Since(t0).Seconds()) workersStartChan <- workerRequest diff --git a/sdk/hatchery/hatchery_test.go b/sdk/hatchery/hatchery_test.go index fa462d6b81..fb17189984 100644 --- a/sdk/hatchery/hatchery_test.go +++ b/sdk/hatchery/hatchery_test.go @@ -40,8 +40,8 @@ func TestCreate(t *testing.T) { } hatcheryConfig.Provision.MaxWorker = 1 - hatchery.CacheSpawnIDsTTL = 2 * time.Second // decrease this cache TTL to speedup the test - hatchery.MaxAttemptsNumberBeforeFailure = 2 // decrease this value to speedup the test + hatchery.CacheSpawnIDsTTL = 2 * time.Second // decrease this cache TTL to speedup the test + hatcheryConfig.Provision.MaxAttemptsNumberBeforeFailure = 2 // decrease this value to speedup the test mockHatchery.EXPECT().Name().Return(t.Name()).AnyTimes() mockHatchery.EXPECT().Type().Return(sdk.TypeHatchery).AnyTimes() diff --git a/sdk/hatchery/starter.go b/sdk/hatchery/starter.go index 95710226b3..5cf9cec554 100644 --- a/sdk/hatchery/starter.go +++ b/sdk/hatchery/starter.go @@ -205,7 +205,7 @@ func spawnWorkerForJob(ctx context.Context, h Interface, j workerStarterRequest) ID: sdk.MsgSpawnInfoHatcheryErrorSpawn.ID, Args: []interface{}{h.Service().Name, modelName, sdk.Round(time.Since(start), time.Second).String(), sdk.ExtractHTTPError(errSpawn).Error()}, }) - log.Error(ctx, "hatchery %s cannot spawn worker %s for job %d: %v", h.Service().Name, modelName, j.id, errSpawn) + log.ErrorWithStackTrace(ctx, sdk.WrapError(errSpawn, "hatchery %s cannot spawn worker %s for job %d", h.Service().Name, modelName, j.id)) next() return false } diff --git a/sdk/messages.go b/sdk/messages.go index a30544317e..ff76d4a9e2 100644 --- a/sdk/messages.go +++ b/sdk/messages.go @@ -59,7 +59,7 @@ var ( MsgEnvironmentDetached = &Message{"MsgEnvironmentDetached", trad{FR: "L'environnement %s est détaché du repository %s", EN: "The environment %s is detached from repository %s"}, nil, RunInfoTypInfo} MsgWorkflowDetached = &Message{"MsgWorkflowDetached", trad{FR: "Le workflow %s est détaché du repository %s", EN: "The workflow %s is detached from repository %s"}, nil, RunInfoTypInfo} MsgSpawnInfoHatcheryStarts = &Message{"MsgSpawnInfoHatcheryStarts", trad{FR: "La Hatchery %s a démarré le lancement du worker avec le modèle %s", EN: "Hatchery %s starts spawn worker with model %s"}, nil, RunInfoTypInfo} - MsgSpawnInfoHatcheryErrorSpawn = &Message{"MsgSpawnInfoHatcheryErrorSpawn", trad{FR: "Une erreur est survenue lorsque la Hatchery %s a démarré un worker avec le modèle %s après %s, err:%s", EN: "Error while Hatchery %s spawns worker with model %s after %s, err:%s"}, nil, RunInfoTypeError} + MsgSpawnInfoHatcheryErrorSpawn = &Message{"MsgSpawnInfoHatcheryErrorSpawn", trad{FR: "Une erreur est survenue lorsque la Hatchery %s a démarré un worker avec le modèle %s après %s, err: %s", EN: "Error while Hatchery %s spawns worker with model %s after %s, err: %s"}, nil, RunInfoTypeError} MsgSpawnInfoHatcheryStartDockerPull = &Message{"MsgSpawnInfoHatcheryStartDockerPull", trad{FR: "La Hatchery %s a démarré le docker pull de l'image %s...", EN: "Hatchery %s starts docker pull %s..."}, nil, RunInfoTypInfo} MsgSpawnInfoHatcheryEndDockerPull = &Message{"MsgSpawnInfoHatcheryEndDockerPull", trad{FR: "La Hatchery %s a terminé le docker pull de l'image %s", EN: "Hatchery %s docker pull %s done"}, nil, RunInfoTypInfo} MsgSpawnInfoHatcheryEndDockerPullErr = &Message{"MsgSpawnInfoHatcheryEndDockerPullErr", trad{FR: "⚠ La Hatchery %s a terminé le docker pull de l'image %s en erreur: %s", EN: "⚠ Hatchery %s - docker pull %s done with error: %v"}, nil, RunInfoTypeError}