-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix 1.21 regression: GET_32G_MAX_CONCURRENT + mixed prepared/executing leads to stuck scheduler #10633
Closed
Closed
Fix 1.21 regression: GET_32G_MAX_CONCURRENT + mixed prepared/executing leads to stuck scheduler #10633
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -294,14 +294,14 @@ func (sw *schedWorker) workerCompactWindows() { | |||||
|
||||||
for ti, todo := range window.Todo { | ||||||
needRes := worker.Info.Resources.ResourceSpec(todo.Sector.ProofType, todo.TaskType) | ||||||
if !lower.Allocated.CanHandleRequest(todo.SealTask(), needRes, sw.wid, "compactWindows", worker.Info) { | ||||||
if !lower.Allocated.CanHandleRequest(todo.SchedId, todo.SealTask(), needRes, sw.wid, "compactWindows", worker.Info) { | ||||||
continue | ||||||
} | ||||||
|
||||||
moved = append(moved, ti) | ||||||
lower.Todo = append(lower.Todo, todo) | ||||||
lower.Allocated.Add(todo.SealTask(), worker.Info.Resources, needRes) | ||||||
window.Allocated.Free(todo.SealTask(), worker.Info.Resources, needRes) | ||||||
lower.Allocated.Add(todo.SchedId, todo.SealTask(), worker.Info.Resources, needRes) | ||||||
window.Allocated.Free(todo.SchedId, todo.SealTask(), worker.Info.Resources, needRes) | ||||||
} | ||||||
|
||||||
if len(moved) > 0 { | ||||||
|
@@ -355,7 +355,7 @@ assignLoop: | |||||
worker.lk.Lock() | ||||||
for t, todo := range firstWindow.Todo { | ||||||
needResPrep := worker.Info.Resources.PrepResourceSpec(todo.Sector.ProofType, todo.TaskType, todo.prepare.PrepType) | ||||||
if worker.preparing.CanHandleRequest(todo.PrepSealTask(), needResPrep, sw.wid, "startPreparing", worker.Info) { | ||||||
if worker.preparing.CanHandleRequest(todo.SchedId, todo.PrepSealTask(), needResPrep, sw.wid, "startPreparing", worker.Info) { | ||||||
tidx = t | ||||||
break | ||||||
} | ||||||
|
@@ -416,7 +416,7 @@ assignLoop: | |||||
} | ||||||
|
||||||
needRes := worker.Info.Resources.ResourceSpec(todo.Sector.ProofType, todo.TaskType) | ||||||
if worker.active.CanHandleRequest(todo.SealTask(), needRes, sw.wid, "startPreparing", worker.Info) { | ||||||
if worker.active.CanHandleRequest(todo.SchedId, todo.SealTask(), needRes, sw.wid, "startPreparing", worker.Info) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not related to this PR, but should be
Suggested change
|
||||||
tidx = t | ||||||
break | ||||||
} | ||||||
|
@@ -457,7 +457,7 @@ func (sw *schedWorker) startProcessingTask(req *WorkerRequest) error { | |||||
needResPrep := w.Info.Resources.PrepResourceSpec(req.Sector.ProofType, req.TaskType, req.prepare.PrepType) | ||||||
|
||||||
w.lk.Lock() | ||||||
w.preparing.Add(req.PrepSealTask(), w.Info.Resources, needResPrep) | ||||||
w.preparing.Add(req.SchedId, req.PrepSealTask(), w.Info.Resources, needResPrep) | ||||||
w.lk.Unlock() | ||||||
|
||||||
go func() { | ||||||
|
@@ -468,7 +468,7 @@ func (sw *schedWorker) startProcessingTask(req *WorkerRequest) error { | |||||
w.lk.Lock() | ||||||
|
||||||
if err != nil { | ||||||
w.preparing.Free(req.PrepSealTask(), w.Info.Resources, needResPrep) | ||||||
w.preparing.Free(req.SchedId, req.PrepSealTask(), w.Info.Resources, needResPrep) | ||||||
w.lk.Unlock() | ||||||
|
||||||
select { | ||||||
|
@@ -497,8 +497,8 @@ func (sw *schedWorker) startProcessingTask(req *WorkerRequest) error { | |||||
}() | ||||||
|
||||||
// wait (if needed) for resources in the 'active' window | ||||||
err = w.active.withResources(sw.wid, w.Info, req.SealTask(), needRes, &w.lk, func() error { | ||||||
w.preparing.Free(req.PrepSealTask(), w.Info.Resources, needResPrep) | ||||||
err = w.active.withResources(req.SchedId, sw.wid, w.Info, req.SealTask(), needRes, &w.lk, func() error { | ||||||
w.preparing.Free(req.SchedId, req.PrepSealTask(), w.Info.Resources, needResPrep) | ||||||
w.lk.Unlock() | ||||||
defer w.lk.Lock() // we MUST return locked from this function | ||||||
|
||||||
|
@@ -539,7 +539,7 @@ func (sw *schedWorker) startProcessingReadyTask(req *WorkerRequest) error { | |||||
|
||||||
needRes := w.Info.Resources.ResourceSpec(req.Sector.ProofType, req.TaskType) | ||||||
|
||||||
w.active.Add(req.SealTask(), w.Info.Resources, needRes) | ||||||
w.active.Add(req.SchedId, req.SealTask(), w.Info.Resources, needRes) | ||||||
|
||||||
go func() { | ||||||
// Do the work! | ||||||
|
@@ -557,7 +557,7 @@ func (sw *schedWorker) startProcessingReadyTask(req *WorkerRequest) error { | |||||
|
||||||
w.lk.Lock() | ||||||
|
||||||
w.active.Free(req.SealTask(), w.Info.Resources, needRes) | ||||||
w.active.Free(req.SchedId, req.SealTask(), w.Info.Resources, needRes) | ||||||
|
||||||
select { | ||||||
case sw.taskDone <- struct{}{}: | ||||||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I believe this has a bug which makes it not work as we'd expect:
This can be fixed by swapping
map[sealtasks.SealTaskType]map[uuid.UUID]bool
formap[sealtasks.SealTaskType]map[uuid.UUID]int
and count how many times Add/Free was called.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah its allowing more than it should as-is.
Trying that and counting +1/-1 for each Add/Free invocation leads to wrong stuck behavior again:
Thats interesting, since only reasonable idea I have is that there's an invocation missing somewhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@magik6k
Apparently the issue is the last Free for a SchedId happens after the last
WINDOW assignPreparingWork
(which in turn says
not scheduling on worker for startPreparing
).A quick test of returning
update=true
inschedWorker::waitForUpdates
to force a regular invocation of that leads to the "GET backlog" getting fully resolved over time as well.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@magik6k Any update on this? This is now broken in 1.21, 1.22 and 1.23.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How did you implement
Free
with the counter? Did youdelete
from the map when the counter reached zero? If not thetasks := a.taskCounters.Get(tt) [...] if len(tasks) >= needRes.MaxConcurrent
below would not work correctly as it still counted the zero enties.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@magik6k Yeah I've accounted for that:
steffengy@a3b7ec2
(I think else it also wouldnt work with making "scheduler entry" more often by returning
update=true
inschedWorker::waitForUpdates
as that just evaluates the same at a later time; so its a timing/ordering issue last Free happening after the next scheduler entry for startPreparing and stuck there until next scheduling happening)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've spent some time playing with it (in #10850), and I think I found the issue with integer counters - 240fc6d
Would be great if you could see if it works for you
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great! Yeah in a quick test that looks working and fix-wise it seems plausible that that was the issue.
Closing this PR in favor of yours.