Fix 1.21 regression: GET_32G_MAX_CONCURRENT + mixed prepared/executing leads to stuck scheduler #10633

Closed
2 changes: 1 addition & 1 deletion storage/sealer/sched_assigner_common.go
@@ -103,7 +103,7 @@ func (a *AssignerCommon) TrySched(sh *Scheduler) {
needRes := worker.Info.Resources.ResourceSpec(task.Sector.ProofType, task.TaskType)

// TODO: allow bigger windows
- if !windows[wnd].Allocated.CanHandleRequest(task.SealTask(), needRes, windowRequest.Worker, "schedAcceptable", worker.Info) {
+ if !windows[wnd].Allocated.CanHandleRequest(task.SchedId, task.SealTask(), needRes, windowRequest.Worker, "schedAcceptable", worker.Info) {
continue
}

4 changes: 2 additions & 2 deletions storage/sealer/sched_assigner_darts.go
@@ -37,7 +37,7 @@ func RandomWS(sh *Scheduler, queueLen int, acceptableWindows [][]int, windows []

log.Debugf("SCHED try assign sqi:%d sector %d to window %d (awi:%d)", sqi, task.Sector.ID.Number, wnd, i)

- if !windows[wnd].Allocated.CanHandleRequest(task.SealTask(), res, wid, "schedAssign", w.Info) {
+ if !windows[wnd].Allocated.CanHandleRequest(task.SchedId, task.SealTask(), res, wid, "schedAssign", w.Info) {
continue
}

@@ -71,7 +71,7 @@ func RandomWS(sh *Scheduler, queueLen int, acceptableWindows [][]int, windows []
"worker", bestWid,
"choices", len(choices))

- windows[selectedWindow].Allocated.Add(task.SealTask(), info.Resources, needRes)
+ windows[selectedWindow].Allocated.Add(task.SchedId, task.SealTask(), info.Resources, needRes)
windows[selectedWindow].Todo = append(windows[selectedWindow].Todo, task)

rmQueue = append(rmQueue, sqi)
4 changes: 2 additions & 2 deletions storage/sealer/sched_assigner_spread.go
@@ -35,7 +35,7 @@ func SpreadWS(queued bool) func(sh *Scheduler, queueLen int, acceptableWindows [

log.Debugf("SCHED try assign sqi:%d sector %d to window %d (awi:%d)", sqi, task.Sector.ID.Number, wnd, i)

- if !windows[wnd].Allocated.CanHandleRequest(task.SealTask(), res, wid, "schedAssign", w.Info) {
+ if !windows[wnd].Allocated.CanHandleRequest(task.SchedId, task.SealTask(), res, wid, "schedAssign", w.Info) {
continue
}

@@ -71,7 +71,7 @@ func SpreadWS(queued bool) func(sh *Scheduler, queueLen int, acceptableWindows [
"assigned", bestAssigned)

workerAssigned[bestWid]++
- windows[selectedWindow].Allocated.Add(task.SealTask(), info.Resources, needRes)
+ windows[selectedWindow].Allocated.Add(task.SchedId, task.SealTask(), info.Resources, needRes)
windows[selectedWindow].Todo = append(windows[selectedWindow].Todo, task)

rmQueue = append(rmQueue, sqi)
4 changes: 2 additions & 2 deletions storage/sealer/sched_assigner_spread_tasks.go
@@ -41,7 +41,7 @@ func SpreadTasksWS(queued bool) func(sh *Scheduler, queueLen int, acceptableWind

log.Debugf("SCHED try assign sqi:%d sector %d to window %d (awi:%d)", sqi, task.Sector.ID.Number, wnd, i)

- if !windows[wnd].Allocated.CanHandleRequest(task.SealTask(), res, wid, "schedAssign", w.Info) {
+ if !windows[wnd].Allocated.CanHandleRequest(task.SchedId, task.SealTask(), res, wid, "schedAssign", w.Info) {
continue
}

@@ -80,7 +80,7 @@ func SpreadTasksWS(queued bool) func(sh *Scheduler, queueLen int, acceptableWind
"assigned", bestAssigned)

workerAssigned[bestWid]++
- windows[selectedWindow].Allocated.Add(task.SealTask(), info.Resources, needRes)
+ windows[selectedWindow].Allocated.Add(task.SchedId, task.SealTask(), info.Resources, needRes)
windows[selectedWindow].Todo = append(windows[selectedWindow].Todo, task)

rmQueue = append(rmQueue, sqi)
4 changes: 2 additions & 2 deletions storage/sealer/sched_assigner_utilization.go
@@ -35,7 +35,7 @@ func LowestUtilizationWS(sh *Scheduler, queueLen int, acceptableWindows [][]int,
log.Debugf("SCHED try assign sqi:%d sector %d to window %d (awi:%d)", sqi, task.Sector.ID.Number, wnd, i)

// TODO: allow bigger windows
- if !windows[wnd].Allocated.CanHandleRequest(task.SealTask(), res, wid, "schedAssign", w.Info) {
+ if !windows[wnd].Allocated.CanHandleRequest(task.SchedId, task.SealTask(), res, wid, "schedAssign", w.Info) {
continue
}

@@ -82,7 +82,7 @@ func LowestUtilizationWS(sh *Scheduler, queueLen int, acceptableWindows [][]int,
"worker", bestWid,
"utilization", bestUtilization)

- workerUtil[bestWid] += windows[selectedWindow].Allocated.Add(task.SealTask(), info.Resources, needRes)
+ workerUtil[bestWid] += windows[selectedWindow].Allocated.Add(task.SchedId, task.SealTask(), info.Resources, needRes)
windows[selectedWindow].Todo = append(windows[selectedWindow].Todo, task)

rmQueue = append(rmQueue, sqi)
5 changes: 3 additions & 2 deletions storage/sealer/sched_post.go
@@ -7,6 +7,7 @@ import (
"sync"
"time"

"github.com/google/uuid"
"github.com/hashicorp/go-multierror"
"golang.org/x/xerrors"

@@ -110,7 +111,7 @@ func (ps *poStScheduler) Schedule(ctx context.Context, primary bool, spt abi.Reg
for i, selected := range candidates {
worker := ps.workers[selected.id]

- err := worker.active.withResources(selected.id, worker.Info, ps.postType.SealTask(spt), selected.res, &ps.lk, func() error {
+ err := worker.active.withResources(uuid.UUID{}, selected.id, worker.Info, ps.postType.SealTask(spt), selected.res, &ps.lk, func() error {
ps.lk.Unlock()
defer ps.lk.Lock()

@@ -148,7 +149,7 @@ func (ps *poStScheduler) readyWorkers(spt abi.RegisteredSealProof) (bool, []cand
continue
}

- if !wr.active.CanHandleRequest(ps.postType.SealTask(spt), needRes, wid, "post-readyWorkers", wr.Info) {
+ if !wr.active.CanHandleRequest(uuid.UUID{}, ps.postType.SealTask(spt), needRes, wid, "post-readyWorkers", wr.Info) {
continue
}

56 changes: 34 additions & 22 deletions storage/sealer/sched_resources.go
@@ -3,6 +3,8 @@ package sealer
import (
"sync"

"github.com/google/uuid"

"github.com/filecoin-project/lotus/storage/sealer/sealtasks"
"github.com/filecoin-project/lotus/storage/sealer/storiface"
)
@@ -20,7 +22,7 @@ type ActiveResources struct {
}

type taskCounter struct {
- taskCounters map[sealtasks.SealTaskType]int
+ taskCounters map[sealtasks.SealTaskType]map[uuid.UUID]bool

// this lock is technically redundant, as ActiveResources is always accessed
// with the worker lock, but let's not panic if we ever change that
@@ -29,43 +31,52 @@ type taskCounter struct {

func newTaskCounter() *taskCounter {
return &taskCounter{
- taskCounters: map[sealtasks.SealTaskType]int{},
+ taskCounters: make(map[sealtasks.SealTaskType]map[uuid.UUID]bool),
}
}

- func (tc *taskCounter) Add(tt sealtasks.SealTaskType) {
+ func (tc *taskCounter) Add(tt sealtasks.SealTaskType, schedID uuid.UUID) {
tc.lk.Lock()
defer tc.lk.Unlock()
- tc.taskCounters[tt]++
+ tc.getUnlocked(tt)[schedID] = true
}

- func (tc *taskCounter) Free(tt sealtasks.SealTaskType) {
+ func (tc *taskCounter) Free(tt sealtasks.SealTaskType, schedID uuid.UUID) {
tc.lk.Lock()
defer tc.lk.Unlock()
- tc.taskCounters[tt]--
+ delete(tc.getUnlocked(tt), schedID)

Contributor:

So I believe this has a bug which makes it not work as we'd expect:

  • First we call .Add for preparing
  • Then we call .Add on active in .withResources
  • In .withResources we call .Free on preparing
  • Now the task runs - and we don't count the resources
  • Finally we call .Free on active

This can be fixed by swapping map[sealtasks.SealTaskType]map[uuid.UUID]bool for map[sealtasks.SealTaskType]map[uuid.UUID]int and counting how many times Add/Free was called.
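A minimal sketch of the counting variant suggested above (illustrative only — not code from this PR or from #10850; locking is omitted, and the delete-on-zero detail discussed further down is included so the map only contains live SchedIds):

```go
package sealer

import (
	"github.com/google/uuid"

	"github.com/filecoin-project/lotus/storage/sealer/sealtasks"
)

// countingTaskCounter is a hypothetical reference-counting variant: each SchedId
// keeps a count of outstanding Add calls, so the prepare->active handoff
// (Add, Add, Free) leaves the task counted while it is still running.
type countingTaskCounter struct {
	taskCounters map[sealtasks.SealTaskType]map[uuid.UUID]int
}

func (tc *countingTaskCounter) Add(tt sealtasks.SealTaskType, schedID uuid.UUID) {
	if tc.taskCounters[tt] == nil {
		tc.taskCounters[tt] = make(map[uuid.UUID]int)
	}
	tc.taskCounters[tt][schedID]++
}

func (tc *countingTaskCounter) Free(tt sealtasks.SealTaskType, schedID uuid.UUID) {
	m := tc.taskCounters[tt]
	if m == nil {
		return
	}
	m[schedID]--
	if m[schedID] <= 0 {
		// drop fully-freed entries so the map only holds SchedIds that still
		// have at least one outstanding Add
		delete(m, schedID)
	}
}
```

With this shape, the MaxConcurrent check could keep comparing the number of live SchedIds against the limit, since an entry only disappears after its final Free.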


Contributor Author:

Yeah, it's allowing more than it should as-is.
Trying that and counting +1/-1 for each Add/Free invocation brings back the wrong stuck behavior:

ID        Sector  Worker    Hostname    Task  State        Time
00000000  23      4154622e  x  GET   assigned(1)  21.2s
00000000  24      4154622e  x  GET   assigned(2)  21.2s

That's interesting, since the only reasonable idea I have is that there's an invocation missing somewhere.


Contributor Author:

@magik6k Apparently the issue is that the last Free for a SchedId happens after the last WINDOW assignPreparingWork (which in turn logs "not scheduling on worker" for startPreparing).
A quick test of returning update=true in schedWorker::waitForUpdates, to force that scheduling path to run regularly, leads to the "GET backlog" getting fully resolved over time as well.


Contributor Author:

@magik6k Any update on this? This is now broken in 1.21, 1.22 and 1.23.


Contributor:

> only reasonable idea I have is that there's an invocation missing somewhere.

How did you implement Free with the counter? Did you delete from the map when the counter reached zero? If not, the tasks := a.taskCounters.Get(tt) [...] if len(tasks) >= needRes.MaxConcurrent check below would not work correctly, as it would still count the zero entries.
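To make the zero-entry point concrete, here is a small, self-contained toy trace (the `counter` type and IDs are made up for illustration, not the lotus API) of the prepare→active handoff against a shared per-task counter that deletes entries when their count reaches zero:

```go
package main

import "fmt"

// counter is a toy stand-in for the shared per-task-type counter, keyed by a
// scheduling ID with a reference count per ID.
type counter map[string]int

func (c counter) add(id string) { c[id]++ }

func (c counter) free(id string) {
	c[id]--
	if c[id] <= 0 {
		delete(c, id) // no stale zero entries left behind for len() to count
	}
}

func main() {
	shared := counter{} // preparing and active share one counter per task type
	id := "sched-id-1"

	shared.add(id)  // preparing.Add
	shared.add(id)  // active.Add (inside withResources)
	shared.free(id) // preparing.Free (inside withResources, before the work runs)
	fmt.Println("while task runs:", len(shared)) // 1: still occupies a MaxConcurrent slot

	shared.free(id) // active.Free when the work finishes
	fmt.Println("after task finishes:", len(shared)) // 0
}
```

With a plain boolean set instead of counts, the preparing.Free step would already drop the entry, so the running task would stop counting against MaxConcurrent — the over-admission described earlier in this thread.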


Contributor Author:

@magik6k Yeah, I've accounted for that:
steffengy@a3b7ec2
(I think otherwise it also wouldn't work with forcing a "scheduler entry" more often by returning update=true in schedWorker::waitForUpdates, as that just evaluates the same state at a later time; so it's a timing/ordering issue: the last Free happens after the next scheduler entry for startPreparing, and things stay stuck until the next scheduling pass.)


Contributor:

I've spent some time playing with it (in #10850), and I think I found the issue with integer counters - 240fc6d

Would be great if you could check whether it works for you.


Contributor Author:

Great! Yeah, in a quick test that looks to be working, and fix-wise it seems plausible that that was the issue.
Closing this PR in favor of yours.

}

+ func (tc *taskCounter) getUnlocked(tt sealtasks.SealTaskType) map[uuid.UUID]bool {
+ if tc.taskCounters[tt] == nil {
+ tc.taskCounters[tt] = make(map[uuid.UUID]bool)
+ }
+
+ return tc.taskCounters[tt]
+ }

- func (tc *taskCounter) Get(tt sealtasks.SealTaskType) int {
+ func (tc *taskCounter) Get(tt sealtasks.SealTaskType) map[uuid.UUID]bool {
tc.lk.Lock()
defer tc.lk.Unlock()
- return tc.taskCounters[tt]
+
+ return tc.getUnlocked(tt)
}

func (tc *taskCounter) Sum() int {
tc.lk.Lock()
defer tc.lk.Unlock()
sum := 0
for _, v := range tc.taskCounters {
- sum += v
+ sum += len(v)
}
return sum
}

func (tc *taskCounter) ForEach(cb func(tt sealtasks.SealTaskType, count int)) {
tc.lk.Lock()
defer tc.lk.Unlock()
- for tt, count := range tc.taskCounters {
- cb(tt, count)
+ for tt, v := range tc.taskCounters {
+ cb(tt, len(v))
}
}

@@ -75,8 +86,8 @@ func NewActiveResources(tc *taskCounter) *ActiveResources {
}
}

- func (a *ActiveResources) withResources(id storiface.WorkerID, wr storiface.WorkerInfo, tt sealtasks.SealTaskType, r storiface.Resources, locker sync.Locker, cb func() error) error {
- for !a.CanHandleRequest(tt, r, id, "withResources", wr) {
+ func (a *ActiveResources) withResources(schedID uuid.UUID, id storiface.WorkerID, wr storiface.WorkerInfo, tt sealtasks.SealTaskType, r storiface.Resources, locker sync.Locker, cb func() error) error {
+ for !a.CanHandleRequest(schedID, tt, r, id, "withResources", wr) {
if a.cond == nil {
a.cond = sync.NewCond(locker)
}
@@ -85,11 +96,11 @@ func (a *ActiveResources) withResources(id storiface.WorkerID, wr storiface.Work
a.waiting--
}

- a.Add(tt, wr.Resources, r)
+ a.Add(schedID, tt, wr.Resources, r)

err := cb()

- a.Free(tt, wr.Resources, r)
+ a.Free(schedID, tt, wr.Resources, r)

return err
}
@@ -100,7 +111,7 @@ func (a *ActiveResources) hasWorkWaiting() bool {
}

// add task resources to ActiveResources and return utilization difference
- func (a *ActiveResources) Add(tt sealtasks.SealTaskType, wr storiface.WorkerResources, r storiface.Resources) float64 {
+ func (a *ActiveResources) Add(schedID uuid.UUID, tt sealtasks.SealTaskType, wr storiface.WorkerResources, r storiface.Resources) float64 {
startUtil := a.utilization(wr)

if r.GPUUtilization > 0 {
@@ -109,19 +120,19 @@ func (a *ActiveResources) Add(tt sealtasks.SealTaskType, wr storiface.WorkerReso
a.cpuUse += r.Threads(wr.CPUs, len(wr.GPUs))
a.memUsedMin += r.MinMemory
a.memUsedMax += r.MaxMemory
- a.taskCounters.Add(tt)
+ a.taskCounters.Add(tt, schedID)

return a.utilization(wr) - startUtil
}

- func (a *ActiveResources) Free(tt sealtasks.SealTaskType, wr storiface.WorkerResources, r storiface.Resources) {
+ func (a *ActiveResources) Free(schedID uuid.UUID, tt sealtasks.SealTaskType, wr storiface.WorkerResources, r storiface.Resources) {
if r.GPUUtilization > 0 {
a.gpuUsed -= r.GPUUtilization
}
a.cpuUse -= r.Threads(wr.CPUs, len(wr.GPUs))
a.memUsedMin -= r.MinMemory
a.memUsedMax -= r.MaxMemory
- a.taskCounters.Free(tt)
+ a.taskCounters.Free(tt, schedID)

if a.cond != nil {
a.cond.Broadcast()
@@ -130,9 +141,10 @@ func (a *ActiveResources) Free(tt sealtasks.SealTaskType, wr storiface.WorkerRes

// CanHandleRequest evaluates if the worker has enough available resources to
// handle the request.
- func (a *ActiveResources) CanHandleRequest(tt sealtasks.SealTaskType, needRes storiface.Resources, wid storiface.WorkerID, caller string, info storiface.WorkerInfo) bool {
+ func (a *ActiveResources) CanHandleRequest(schedID uuid.UUID, tt sealtasks.SealTaskType, needRes storiface.Resources, wid storiface.WorkerID, caller string, info storiface.WorkerInfo) bool {
if needRes.MaxConcurrent > 0 {
- if a.taskCounters.Get(tt) >= needRes.MaxConcurrent {
+ tasks := a.taskCounters.Get(tt)
+ if len(tasks) >= needRes.MaxConcurrent && (schedID == uuid.UUID{} || !tasks[schedID]) {
log.Debugf("sched: not scheduling on worker %s for %s; at task limit tt=%s, curcount=%d", wid, caller, tt, a.taskCounters.Get(tt))
return false
}
@@ -226,7 +238,7 @@ func (a *ActiveResources) taskCount(tt *sealtasks.SealTaskType) int {
return a.taskCounters.Sum()
}

- return a.taskCounters.Get(*tt)
+ return len(a.taskCounters.Get(*tt))
}

func (wh *WorkerHandle) Utilization() float64 {
4 changes: 2 additions & 2 deletions storage/sealer/sched_test.go
@@ -698,7 +698,7 @@ func TestWindowCompact(t *testing.T) {
TaskType: task,
Sector: storiface.SectorRef{ProofType: spt},
})
- window.Allocated.Add(task.SealTask(spt), wh.Info.Resources, storiface.ResourceTable[task][spt])
+ window.Allocated.Add(uuid.UUID{}, task.SealTask(spt), wh.Info.Resources, storiface.ResourceTable[task][spt])
}

wh.activeWindows = append(wh.activeWindows, window)
@@ -717,7 +717,7 @@

for ti, task := range tasks {
require.Equal(t, task, wh.activeWindows[wi].Todo[ti].TaskType, "%d, %d", wi, ti)
- expectRes.Add(task.SealTask(spt), wh.Info.Resources, storiface.ResourceTable[task][spt])
+ expectRes.Add(uuid.UUID{}, task.SealTask(spt), wh.Info.Resources, storiface.ResourceTable[task][spt])
}

require.Equal(t, expectRes.cpuUse, wh.activeWindows[wi].Allocated.cpuUse, "%d", wi)
22 changes: 11 additions & 11 deletions storage/sealer/sched_worker.go
@@ -294,14 +294,14 @@ func (sw *schedWorker) workerCompactWindows() {

for ti, todo := range window.Todo {
needRes := worker.Info.Resources.ResourceSpec(todo.Sector.ProofType, todo.TaskType)
- if !lower.Allocated.CanHandleRequest(todo.SealTask(), needRes, sw.wid, "compactWindows", worker.Info) {
+ if !lower.Allocated.CanHandleRequest(todo.SchedId, todo.SealTask(), needRes, sw.wid, "compactWindows", worker.Info) {
continue
}

moved = append(moved, ti)
lower.Todo = append(lower.Todo, todo)
- lower.Allocated.Add(todo.SealTask(), worker.Info.Resources, needRes)
- window.Allocated.Free(todo.SealTask(), worker.Info.Resources, needRes)
+ lower.Allocated.Add(todo.SchedId, todo.SealTask(), worker.Info.Resources, needRes)
+ window.Allocated.Free(todo.SchedId, todo.SealTask(), worker.Info.Resources, needRes)
}

if len(moved) > 0 {
@@ -355,7 +355,7 @@
worker.lk.Lock()
for t, todo := range firstWindow.Todo {
needResPrep := worker.Info.Resources.PrepResourceSpec(todo.Sector.ProofType, todo.TaskType, todo.prepare.PrepType)
if worker.preparing.CanHandleRequest(todo.PrepSealTask(), needResPrep, sw.wid, "startPreparing", worker.Info) {
if worker.preparing.CanHandleRequest(todo.SchedId, todo.PrepSealTask(), needResPrep, sw.wid, "startPreparing", worker.Info) {
tidx = t
break
}
Expand Down Expand Up @@ -416,7 +416,7 @@ assignLoop:
}

needRes := worker.Info.Resources.ResourceSpec(todo.Sector.ProofType, todo.TaskType)
- if worker.active.CanHandleRequest(todo.SealTask(), needRes, sw.wid, "startPreparing", worker.Info) {
+ if worker.active.CanHandleRequest(todo.SchedId, todo.SealTask(), needRes, sw.wid, "startPreparing", worker.Info) {

Contributor:

Not related to this PR, but this should be:

Suggested change
if worker.active.CanHandleRequest(todo.SchedId, todo.SealTask(), needRes, sw.wid, "startPreparing", worker.Info) {
if worker.active.CanHandleRequest(todo.SchedId, todo.SealTask(), needRes, sw.wid, "startReady", worker.Info) {

tidx = t
break
}
@@ -457,7 +457,7 @@ func (sw *schedWorker) startProcessingTask(req *WorkerRequest) error {
needResPrep := w.Info.Resources.PrepResourceSpec(req.Sector.ProofType, req.TaskType, req.prepare.PrepType)

w.lk.Lock()
- w.preparing.Add(req.PrepSealTask(), w.Info.Resources, needResPrep)
+ w.preparing.Add(req.SchedId, req.PrepSealTask(), w.Info.Resources, needResPrep)
w.lk.Unlock()

go func() {
@@ -468,7 +468,7 @@ func (sw *schedWorker) startProcessingTask(req *WorkerRequest) error {
w.lk.Lock()

if err != nil {
- w.preparing.Free(req.PrepSealTask(), w.Info.Resources, needResPrep)
+ w.preparing.Free(req.SchedId, req.PrepSealTask(), w.Info.Resources, needResPrep)
w.lk.Unlock()

select {
@@ -497,8 +497,8 @@ func (sw *schedWorker) startProcessingTask(req *WorkerRequest) error {
}()

// wait (if needed) for resources in the 'active' window
- err = w.active.withResources(sw.wid, w.Info, req.SealTask(), needRes, &w.lk, func() error {
- w.preparing.Free(req.PrepSealTask(), w.Info.Resources, needResPrep)
+ err = w.active.withResources(req.SchedId, sw.wid, w.Info, req.SealTask(), needRes, &w.lk, func() error {
+ w.preparing.Free(req.SchedId, req.PrepSealTask(), w.Info.Resources, needResPrep)
w.lk.Unlock()
defer w.lk.Lock() // we MUST return locked from this function

@@ -539,7 +539,7 @@ func (sw *schedWorker) startProcessingReadyTask(req *WorkerRequest) error {

needRes := w.Info.Resources.ResourceSpec(req.Sector.ProofType, req.TaskType)

- w.active.Add(req.SealTask(), w.Info.Resources, needRes)
+ w.active.Add(req.SchedId, req.SealTask(), w.Info.Resources, needRes)

go func() {
// Do the work!
@@ -557,7 +557,7 @@ func (sw *schedWorker) startProcessingReadyTask(req *WorkerRequest) error {

w.lk.Lock()

- w.active.Free(req.SealTask(), w.Info.Resources, needRes)
+ w.active.Free(req.SchedId, req.SealTask(), w.Info.Resources, needRes)

select {
case sw.taskDone <- struct{}{}: