Commit c713ab9

[release-branch.go1.25] runtime: add runtime.Yield
Change-Id: Idbe3438f5f06cae82dc5dcc56c52347d20e3e20a
1 parent 9bec28d commit c713ab9

4 files changed (+277, -3 lines)

src/context/context.go

Lines changed: 2 additions & 1 deletion

@@ -62,7 +62,7 @@ import (
     "sync"
     "sync/atomic"
     "time"
-    _ "unsafe" // for go:linkname
+    _ "unsafe" // for go:linkname
 )

 // A Context carries a deadline, a cancellation signal, and other values across

@@ -372,6 +372,7 @@ type stopCtx struct {
 var goroutines atomic.Int32

 // &cancelCtxKey is the key that a cancelCtx returns itself for.
+//
 //go:linkname cancelCtxKey
 var cancelCtxKey int

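The context.go change is small: the single-argument //go:linkname directive marks cancelCtxKey as a symbol that other packages may reference by linkname, and any file using the directive must blank-import "unsafe" (the import retained above). As a rough illustration of the general mechanism only (not part of this commit, package name invented, and note that recent Go releases reject linkname pulls of symbols the owning package has not pushed), a classic pull-side use looks like:

package fastclock // hypothetical package, for illustration only

import _ "unsafe" // required in any file that uses go:linkname

// nanotime is resolved by the linker to the runtime's monotonic clock.
// A body-less declaration like this also needs an (even empty) .s file in the
// package so the compiler accepts the missing body.
//
//go:linkname nanotime runtime.nanotime
func nanotime() int64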
src/runtime/proc.go

Lines changed: 246 additions & 0 deletions

@@ -389,6 +389,215 @@ func Gosched() {
     mcall(gosched_m)
 }

+// Yield cooperatively yields if, and only if, the scheduler is "busy".
+//
+// This can be called by any work wishing to utilize strictly spare capacity
+// while minimizing the degree to which it delays other work from being promptly
+// scheduled.
+//
+// Yield is intended to have very low overhead, particularly in its no-op case
+// where there is idle capacity in the scheduler and the caller does not need to
+// yield. This should allow it to be called often, such as in the body of tight
+// loops, in any task wishing to yield promptly to any waiting work.
+//
+// When there is waiting work, the yielding goroutine may briefly be rescheduled
+// after it, or may, in some cases, be parked in a waiting 'yield' state until
+// the scheduler next has spare capacity to resume it. Yield does not guarantee
+// fairness or starvation-prevention: once a goroutine yields, it may remain
+// parked until the scheduler next has idle capacity. This means Yield can block
+// for unbounded durations in the presence of sustained over-saturation; callers
+// are responsible for deciding where to call Yield to avoid priority inversions.
+//
+// Yield will never park if the calling goroutine is locked to an OS thread.
+func Yield() {
+    // Common/fast case: do nothing if npidle is non-zero, meaning there is
+    // an idle P and so no reason to yield this one. Doing only this check here
+    // keeps Yield inlineable (~70 of the 80 inlining budget as of writing).
+    if sched.npidle.Load() == 0 {
+        maybeYield()
+    }
+}
+
+// maybeYield is called by Yield if npidle is zero, meaning there are no idle Ps
+// and thus there may be work to which the caller should yield. Such work could
+// be on the local runq of the caller's P, on the global runq, in the runq of
+// some other P, or even in the form of ready conns waiting to be noticed by a
+// netpoll which would then ready runnable goroutines.
+//
+// Keeping this function extremely cheap is essential: it must be cheap enough
+// that callers can call it in very tight loops, as very frequent calls ensure a
+// task wishing to yield when work is waiting will do so promptly. Checking the
+// runq of every P or calling netpoll are too expensive to do in every call, so,
+// given the intent is to bound how long work may wait, such checks only need to
+// be performed after some amount of time has elapsed (e.g. 0.25ms). To minimize
+// overhead when called at a higher frequency, this elapsed time is checked with
+// an exponential backoff.
+//
+// runqs are checked directly with non-atomic reads rather than runqempty: being
+// cheap is our top priority and a microsecond of staleness is fine as long as
+// the check does not get optimized out of a calling loop body (hence noinline).
+//
+//go:noinline
+func maybeYield() {
+    gp := getg()
+
+    // Don't park while locked to an OS thread.
+    if gp.lockedm != 0 {
+        return
+    }
+
+    // If the local P's runq ring buffer/next is non-empty, yield to the waiting G.
+    if p := gp.m.p.ptr(); p.runqhead != p.runqtail || p.runnext != 0 {
+        // If there is work in the local P's runq, we can yield by just going to the
+        // back of the local P's runq via goyield: this achieves the same goal of
+        // letting waiting work run instead of us, but without parking on the global
+        // yieldq and potentially switching Ps. While that's our preferred choice,
+        // we want to avoid thrashing back and forth between multiple Yield-calling
+        // goroutines: in such a case it is better to just park one of them on the
+        // global yieldq so the other stops seeing it in the P's runq and yielding
+        // to it.
+        //
+        // To detect and break such thrashing, we set yieldchecks to 1 before
+        // calling goyield. If the other goroutine yields right back and we return
+        // here still seeing yieldchecks==1, that indicates thrashing, so we park
+        // instead. 1 is a valid packed prev+count value, with prev=0/count=1, so if
+        // we later call Yield with no local runq and fall through to the
+        // maybe-do-expensive-checks code below, it will just be incremented as
+        // usual; when count=3 it will compare prev=0 to the clock and do a check.
+        if gp.yieldchecks == 1 {
+            yieldPark()
+            return
+        }
+        gp.yieldchecks = 1
+        // Go to the back of the local runq.
+        goyield()
+        return
+    }
+
+    // If the global runq is non-empty, park in the global yieldq right away: that
+    // is work someone needs to pick up and it might as well be our P. We could,
+    // potentially, directly claim it here and goyield or equivalent to try to
+    // remain on this P, but just parking and letting this P go to findRunnable
+    // avoids duplication of its logic and seems good enough.
+    if !sched.runq.empty() {
+        yieldPark()
+        return
+    }
+
+    // We didn't find anything via cheap O(1) checks of our runq or the global runq,
+    // but it is possible there are goroutines waiting in runqs of other Ps that
+    // are not being stolen by an idle P since the lack of idle Ps (npidle=0) is
+    // what got us here. Furthermore, given the lack of idle Ps, it is also
+    // possible that ready conns are waiting for a netpoll to notice them and
+    // ready their goroutines, i.e. work to which we should then yield.
+    //
+    // Searching all runqs, and even more so netpoll, is too expensive for every
+    // maybeYield call. Since our goal is to bound how long work could wait for
+    // yield-willing work to yield to it, what we instead can do is perform these
+    // more expensive checks when enough time has passed. We define "enough" as
+    // approximately 0.25ms: long enough to keep the overhead paid by yield-willing
+    // work low enough that it can continue to check frequently, even for a caller
+    // in a tight loop, while still below the typical latencies of e.g. network
+    // services and far below the default non-cooperative preemption interval.
+    //
+    // To determine if it is time to do expensive checks, we compare the current
+    // time to the time we last checked, quantizing both by discarding the lower
+    // 18 bits to arrive at that approx 0.25ms resolution. However, even just
+    // checking the current time is too expensive to do on every call in a hot
+    // enough loop. Thus, if we are being called much more frequently than these
+    // 0.25ms intervals, we can start checking the time less often instead of in
+    // every call, backing off exponentially. Specifically, we can maintain a
+    // calls-since-last-expensive-check counter and only check the clock when that
+    // counter is of the form 2^k-1 (i.e. 1, 3, 7, 15, ...). This approach should
+    // ensure a very frequent caller doesn't pay undue clock check overhead, while
+    // an infrequent caller still checks often enough. NB: We choose -1 here
+    // specifically so that the branch doing this check can also check for an
+    // imminent counter overflow rather than checking for that separately on every
+    // call.
+    //
+    // Both this call counter and the quantized timestamp are packed into a single
+    // uint32 (yieldchecks): the upper 21 bits store the low bits of the quantized
+    // timestamp and the lower 11 bits store the call counter. Given the counter
+    // resets to half its value when saturated (at 2^k-1), this results in
+    // plateauing at a rate of 1 clock check per ~1k calls if called in a very
+    // tight loop.
+    //
+    // Note: 21 bits gives us ~2M distinct 0.25ms quantized times before we wrap
+    // around once every ~9 minutes. Since we compare exact equality, one would
+    // need to not check the clock at all for ~9mins, then check it on the exact
+    // 0.25ms tick to not see it change. To not check it at all for 9mins would
+    // imply a dramatic reduction in Yield call frequency; given frequent calls
+    // are what make Yield effective, this is not a practical concern.
+    const yieldCountBits, yieldCountMask = 11, (1 << 11) - 1
+    const yieldEpochShift = 18 - yieldCountBits // net right shift, accounting for 11 bits being masked out.
+    gp.yieldchecks++
+    // When the call counter -- masked out of the low 11 bits -- is of the form
+    // 2^k-1, check the time.
+    if count := gp.yieldchecks & yieldCountMask; (count & (count + 1)) == 0 {
+        // prev is just the quantized last check time with the counter masked out.
+        prev := gp.yieldchecks &^ yieldCountMask
+        // Rather than shift down all 18 to quantize and then just shift back up 11
+        // to make room for the counter, we can just shift down by the difference of
+        // 7 and let the masking of the remaining 11 zero them out.
+        now := uint32(nanotime()>>yieldEpochShift) &^ yieldCountMask
+        if now != prev {
+            gp.yieldchecks = now
+            // Check runqs of all Ps; if we find anything in them, park this g to free
+            // up this P to go to findRunnable and try to steal.
+            for i := range allp {
+                // We don't need the extra accuracy (and cost) of runqempty here either;
+                // a racing steal or enqueue will get noticed when we next findRunnable
+                // or next check yield.
+                if allp[i].runqhead != allp[i].runqtail || allp[i].runnext != 0 {
+                    yieldPark()
+                    return
+                }
+            }
+
+            // Check netpoll; a ready conn is basically a should-be-runnable goroutine
+            // to which we would yield if it had been readied, but the lack of idle Ps
+            // may mean nobody is checking this as often right now and there may be
+            // ready conns waiting.
+            if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+                var found bool
+                systemstack(func() {
+                    if list, delta := netpoll(0); !list.empty() {
+                        injectglist(&list)
+                        netpollAdjustWaiters(delta)
+                        found = true
+                    }
+                })
+                if found {
+                    // Since there were no idle Ps to get here, we can assume injectglist
+                    // put runnable Gs on our local runq, to which we can just goyield.
+                    goyield()
+                }
+            }
+        } else if count == yieldCountMask {
+            // Counter has saturated; reset counter bits to half saturation.
+            gp.yieldchecks = prev | (yieldCountMask / 2)
+        }
+    }
+}
+
+// yieldPark parks the current goroutine in a waiting state with reason yield
+// and puts it in the yieldq queue for findRunnable to retrieve at a later time
+// when the scheduler determines it has spare capacity. A goroutine parked by
+// Yield is no longer considered runnable and is instead waiting. One could ask
+// how this is different from a "runnable" goroutine waiting to run in a runq --
+// both are ready to run whenever the scheduler elects to run them -- but the
+// yielded goroutine has specifically opted to *block* until the scheduler next
+// has strictly spare capacity, in contrast to runnable goroutines which expect
+// to be run as soon as possible, perhaps even at the expense of running other
+// goroutines at that time. Reflecting this "blocked until something changes" in
+// its status -- particularly as it could end up spending significant time
+// waiting here, on the same order as other waiting states like blocking on IO
+// or locks -- better reflects the reality of its state.
+func yieldPark() {
+    checkTimeouts()
+    gopark(yield_put, nil, waitReasonYield, traceBlockPreempted, 1)
+}
+
 // goschedguarded yields the processor like gosched, but also checks
 // for forbidden states and opts out of the yield in those cases.
 //
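An illustrative usage sketch, not part of the commit: assuming a toolchain that includes this change, a low-priority background task can call runtime.Yield inside its hot loop so that it only consumes strictly spare scheduler capacity. The churn helper and its workload below are invented for the example.

package main

import (
    "fmt"
    "runtime"
)

// churn stands in for low-priority background work. It calls runtime.Yield on
// each iteration: when the scheduler has an idle P the call is nearly free,
// and when the scheduler is busy the goroutine steps aside or parks until
// spare capacity returns.
func churn(items []int, done chan<- int) {
    sum := 0
    for _, v := range items {
        sum += v * v    // the actual "work"
        runtime.Yield() // cheap no-op unless the scheduler is busy
    }
    done <- sum
}

func main() {
    done := make(chan int)
    go churn(make([]int, 1<<20), done)
    fmt.Println(<-done)
}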
@@ -3546,6 +3755,23 @@ top:
         }
     }

+    // Nothing runnable, so check for yielded goroutines parked in yieldq.
+    if !sched.yieldq.empty() {
+        lock(&sched.lock)
+        bg := sched.yieldq.pop()
+        unlock(&sched.lock)
+        if bg != nil {
+            trace := traceAcquire()
+            casgstatus(bg, _Gwaiting, _Grunnable)
+            if trace.ok() {
+                // Match other ready paths for trace visibility.
+                trace.GoUnpark(bg, 0)
+                traceRelease(trace)
+            }
+            return bg, false, false
+        }
+    }
+
     // We have nothing to do.
     //
     // If we're in the GC mark phase, can safely scan and blacken objects,
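The ordering added above is the heart of the policy: ordinary runnable work always wins, and the yieldq is drained only when nothing else was found. A toy, self-contained model of that priority (not runtime code; the type and IDs are invented for illustration):

package schedmodel

// miniSched models the ordering findRunnable uses after this change: the run
// queue always has priority, and the yield queue is drained only when no other
// work was found.
type miniSched struct {
    runq   []int // runnable work, by ID
    yieldq []int // goroutines parked by Yield, by ID
}

// next returns the next ID to run, preferring runq and falling back to yieldq.
func (s *miniSched) next() (id int, ok bool) {
    if len(s.runq) > 0 {
        id, s.runq = s.runq[0], s.runq[1:]
        return id, true
    }
    if len(s.yieldq) > 0 { // strictly spare capacity: resume a yielded goroutine
        id, s.yieldq = s.yieldq[0], s.yieldq[1:]
        return id, true
    }
    return 0, false // nothing to do; a real M would go idle
}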
@@ -3616,6 +3842,12 @@ top:
         }
         return gp, false, false
     }
+
+    // Re-check the yieldq, this time while holding sched.lock.
+    if !sched.yieldq.empty() {
+        unlock(&sched.lock)
+        goto top
+    }
     if !mp.spinning && sched.needspinning.Load() == 1 {
         // See "Delicate dance" comment below.
         mp.becomeSpinning()
@@ -7416,6 +7648,20 @@ func (q *gQueue) popList() gList {
     return stack
 }

+// yield_put is the gopark unlock function for Yield. It enqueues the goroutine
+// onto the global yield queue. Returning true keeps the G parked until another
+// part of the scheduler makes it runnable again. The G remains in _Gwaiting
+// after this returns. Nothing else will find/ready this G in the interim since
+// it isn't on a runq until we put it on the yieldq for findRunnable to find.
+//
+//go:nosplit
+func yield_put(gp *g, _ unsafe.Pointer) bool {
+    lock(&sched.lock)
+    sched.yieldq.pushBack(gp)
+    unlock(&sched.lock)
+    return true
+}
+
 // A gList is a list of Gs linked through g.schedlink. A G can only be
 // on one gQueue or gList at a time.
 type gList struct {
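The subtle part of maybeYield is the yieldchecks packing: the low 11 bits count calls since the last clock check, the high 21 bits hold a timestamp quantized to roughly 0.25ms, and the clock is consulted only when the counter has the form 2^k-1. The standalone sketch below reproduces that arithmetic outside the runtime, using time.Now as a stand-in for the runtime's nanotime; the function and variable names are invented here.

package main

import (
    "fmt"
    "time"
)

const (
    countBits  = 11
    countMask  = (1 << countBits) - 1 // low 11 bits: calls since the last clock check
    epochShift = 18 - countBits       // upper bits: time quantized to 2^18ns (~0.26ms)
)

// shouldCheck mimics maybeYield's backoff. state packs a quantized timestamp
// (upper bits) and a call counter (lower bits). The clock is read only when
// the counter is of the form 2^k-1, and the caller is told to do its
// "expensive" checks only when the quantized time has advanced.
func shouldCheck(state *uint32) bool {
    *state++
    count := *state & countMask
    if count&(count+1) != 0 {
        return false // not 2^k-1: skip even the clock read
    }
    prev := *state &^ countMask
    now := uint32(time.Now().UnixNano()>>epochShift) &^ countMask
    if now != prev {
        *state = now // counter resets to 0; remember the new quantized time
        return true
    }
    if count == countMask {
        *state = prev | (countMask / 2) // saturated: decay to half to keep backing off
    }
    return false
}

func main() {
    var state uint32
    checks := 0
    for i := 0; i < 5_000_000; i++ {
        if shouldCheck(&state) {
            checks++
        }
    }
    fmt.Println("expensive checks:", checks) // far fewer than the number of calls
}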

src/runtime/proc_test.go

Lines changed: 19 additions & 0 deletions

@@ -103,6 +103,25 @@ func TestYieldLocked(t *testing.T) {
     <-c
 }

+func TestYield(t *testing.T) {
+    var wg sync.WaitGroup
+    start := make(chan struct{})
+    for i := 0; i < runtime.GOMAXPROCS(0)*2; i++ {
+        wg.Add(1)
+        go func() {
+            defer wg.Done()
+            <-start
+            for j := 0; j < 1000; j++ {
+                if i%2 == 0 || j == 999 {
+                    runtime.Yield()
+                }
+            }
+        }()
+    }
+    close(start)
+    wg.Wait()
+}
+
 func TestGoroutineParallelism(t *testing.T) {
     if runtime.NumCPU() == 1 {
         // Takes too long, too easy to deadlock, etc.
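The test above exercises Yield for correctness under contention. The overhead claim in Yield's documentation could be probed separately with a benchmark along these lines (hypothetical, not part of this commit; requires a toolchain that has runtime.Yield):

package yieldbench

import (
    "runtime"
    "testing"
)

// BenchmarkYieldNoop measures Yield's fast path. With idle Ps available the
// call should reduce to a single atomic load and a branch.
func BenchmarkYieldNoop(b *testing.B) {
    for i := 0; i < b.N; i++ {
        runtime.Yield()
    }
}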

src/runtime/runtime2.go

Lines changed: 10 additions & 2 deletions

@@ -485,12 +485,14 @@ type g struct {
     sleepWhen  int64         // when to sleep until
     selectDone atomic.Uint32 // are we participating in a select and did someone win the race?

+    yieldchecks uint32 // a packed approx time and count of maybeYield checks; see Yield().
+
     // goroutineProfiled indicates the status of this goroutine's stack for the
     // current in-progress goroutine profile
     goroutineProfiled goroutineProfileStateHolder

-    coroarg *coro // argument during coroutine transfers
-    bubble  *synctestBubble
+    coroarg *coro // argument during coroutine transfers
+    bubble  *synctestBubble
     lastsched    int64 // timestamp when the G last started running
     runningnanos int64 // wall time spent in the running state

@@ -797,6 +799,10 @@ type schedt struct {
     // Global runnable queue.
     runq gQueue

+    // Global background-yield queue: goroutines that voluntarily yielded
+    // while the scheduler was busy. Does NOT contribute to runqsize.
+    yieldq gQueue
+
     // disable controls selective disabling of the scheduler.
     //
     // Use schedEnableUser to control this.

@@ -1094,6 +1100,7 @@ const (
     waitReasonTraceProcStatus    // "trace proc status"
     waitReasonPageTraceFlush     // "page trace flush"
     waitReasonCoroutine          // "coroutine"
+    waitReasonYield              // "yield"
     waitReasonGCWeakToStrongWait // "GC weak to strong wait"
     waitReasonSynctestRun        // "synctest.Run"
     waitReasonSynctestWait       // "synctest.Wait"

@@ -1144,6 +1151,7 @@ var waitReasonStrings = [...]string{
     waitReasonTraceProcStatus:    "trace proc status",
     waitReasonPageTraceFlush:     "page trace flush",
     waitReasonCoroutine:          "coroutine",
+    waitReasonYield:              "yield",
    waitReasonGCWeakToStrongWait: "GC weak to strong wait",
     waitReasonSynctestRun:        "synctest.Run",
     waitReasonSynctestWait:       "synctest.Wait",
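A practical consequence of the new wait reason, stated here as an expectation rather than something shown in the diff: a goroutine parked by Yield should appear in goroutine dumps and execution traces with the "yield" label, e.g. a stack dump header of the form "goroutine N [yield]:". A quick way to look for it:

package main

import (
    "fmt"
    "runtime"
)

// dumpGoroutines prints every goroutine stack. Goroutines parked by Yield are
// expected (per the waitReasonYield string added above) to show a "yield"
// wait reason in their headers.
func dumpGoroutines() {
    buf := make([]byte, 1<<20)
    n := runtime.Stack(buf, true)
    fmt.Printf("%s\n", buf[:n])
}

func main() {
    dumpGoroutines()
}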
