@@ -389,6 +389,215 @@ func Gosched() {
 	mcall(gosched_m)
 }
 
+// Yield cooperatively yields if, and only if, the scheduler is "busy".
+//
+// This can be called by any work wishing to utilize strictly spare capacity
+// while minimizing the degree to which it delays other work from being
+// promptly scheduled.
+//
+// Yield is intended to have very low overhead, particularly in its no-op case
+// where there is idle capacity in the scheduler and the caller does not need
+// to yield. This should allow it to be called often, such as in the body of
+// tight loops, by any task wishing to yield promptly to any waiting work.
+//
+// When there is waiting work, the yielding goroutine may briefly be
+// rescheduled after it, or may, in some cases, be parked in a waiting 'yield'
+// state until the scheduler next has spare capacity to resume it. Yield does
+// not guarantee fairness or starvation-prevention: once a goroutine calls
+// Yield, it may remain parked until the scheduler next has idle capacity. This
+// means Yield can block for unbounded durations when the scheduler is
+// persistently over-saturated; callers are responsible for deciding where to
+// call Yield so as to avoid priority inversions.
+//
+// Yield will never park if the calling goroutine is locked to an OS thread.
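+//
+// As an illustrative (hypothetical) example, a caller draining a queue of
+// background work might call Yield once per item so that it only consumes
+// strictly spare capacity (backgroundWork and process are placeholders):
+//
+//	for _, item := range backgroundWork {
+//		process(item)
+//		runtime.Yield() // near no-op while idle Ps exist; otherwise defers to waiting work
+//	}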
+func Yield() {
+	// Common/fast case: do nothing if npidle is non-zero, meaning there is an
+	// idle P and thus no reason to yield this one. Doing only this check here
+	// keeps Yield inlineable (~70 of the 80-node inlining budget as of writing).
+	if sched.npidle.Load() == 0 {
+		maybeYield()
+	}
+}
+
+// maybeYield is called by Yield if npidle is zero, meaning there are no idle
+// Ps and thus there may be work to which the caller should yield. Such work
+// could be on the local runq of the caller's P, on the global runq, in the
+// runq of some other P, or even in the form of ready conns waiting for a
+// netpoll to notice them and make their goroutines runnable.
+//
+// Keeping this function extremely cheap is essential: it must be cheap enough
+// that callers can call it in very tight loops, as very frequent calls ensure
+// a task wishing to yield when work is waiting will do so promptly. Checking
+// the runq of every P, or calling netpoll, is too expensive to do on every
+// call; given that the intent is only to bound how long work may wait, such
+// checks need to be performed only after some amount of time has elapsed
+// (e.g. 0.25ms). To minimize overhead when called at a higher frequency, this
+// elapsed time is itself checked with an exponential backoff.
+//
+// Runqs are checked directly with non-atomic reads rather than runqempty:
+// being cheap is our top priority and a microsecond of staleness is fine, as
+// long as the check does not get optimized out of a calling loop body (hence
+// noinline).
+//
+//go:noinline
+func maybeYield() {
+	gp := getg()
+
+	// Don't park while locked to an OS thread.
+	if gp.lockedm != 0 {
+		return
+	}
+
+	// If the local P's runq ring buffer or runnext is non-empty, yield to the
+	// waiting G.
+	if p := gp.m.p.ptr(); p.runqhead != p.runqtail || p.runnext != 0 {
+		// If there is work in the local P's runq, we can yield by just going to
+		// the back of the local P's runq via goyield: this achieves the same goal
+		// of letting waiting work run instead of us, but without parking on the
+		// global yieldq and potentially switching Ps. While that's our preferred
+		// choice, we want to avoid thrashing back and forth between multiple
+		// Yield-calling goroutines: in such a case it is better to just park one
+		// of them on the global yieldq so the other stops seeing it in the P's
+		// runq and yielding to it.
+		//
+		// To detect and break such thrashing, we set yieldchecks to 1 before
+		// calling goyield. If the other goroutine yields right back and we return
+		// here still seeing yieldchecks==1, that indicates thrashing, so we park
+		// instead. 1 is also a valid packed prev+count value (prev=0, count=1):
+		// if we later call Yield with an empty local runq and fall through to the
+		// maybe-do-expensive-checks code below, it simply increments the counter
+		// as usual, and once count reaches 3 it compares prev=0 to the clock and
+		// does a check.
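+		//
+		// Concretely (an illustrative sketch of the intended sequence): G1 calls
+		// Yield, sees G2 on the local runq, sets yieldchecks=1 and goyields
+		// behind it; G2 runs, calls Yield, and goyields back behind G1; when G1
+		// runs again and calls Yield with yieldchecks still 1, it parks on the
+		// yieldq instead of bouncing again.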
+		if gp.yieldchecks == 1 {
+			yieldPark()
+			return
+		}
+		gp.yieldchecks = 1
+		// Go to the back of the local runq.
+		goyield()
+		return
+	}
+
+	// If the global runq is non-empty, park in the global yieldq right away:
+	// that is work someone needs to pick up, and it might as well be our P. We
+	// could, potentially, claim it directly here and goyield (or equivalent) to
+	// try to remain on this P, but just parking and letting this P go to
+	// findRunnable avoids duplicating its logic and seems good enough.
+	if !sched.runq.empty() {
+		yieldPark()
+		return
+	}
+
+	// We didn't find anything via cheap O(1) checks of our runq or the global
+	// runq, but it is possible there are goroutines waiting in the runqs of
+	// other Ps that are not being stolen by an idle P, since the lack of idle
+	// Ps (npidle=0) is what got us here. Furthermore, given the lack of idle
+	// Ps, it is also possible that ready conns are waiting for a netpoll to
+	// notice them and ready their goroutines, i.e. work to which we should
+	// then yield.
+	//
+	// Searching all runqs, and even more so netpoll, is too expensive for
+	// every maybeYield call. Since our goal is to bound how long work could
+	// wait for yield-willing work to yield to it, we can instead perform these
+	// more expensive checks only once enough time has passed. We define
+	// "enough" as approximately 0.25ms: long enough to keep the overhead paid
+	// by yield-willing work low enough that it can continue to check
+	// frequently, even for a caller in a tight loop, while still below the
+	// typical latencies of e.g. network services and far below the default
+	// non-cooperative preemption interval.
+	//
+	// To determine whether it is time to do the expensive checks, we compare
+	// the current time to the time we last checked, quantizing both by
+	// discarding the lower 18 bits to arrive at roughly 0.25ms resolution.
+	// However, even just reading the current time is too expensive to do on
+	// every call in a hot enough loop. Thus, if we are being called much more
+	// frequently than these 0.25ms intervals, we can check the time less often
+	// instead of on every call, backing off exponentially: we maintain a
+	// calls-since-last-expensive-check counter and only check the clock when
+	// that counter is of the form 2^k-1 (i.e. 1, 3, 7, 15, ...). This ensures
+	// a very frequent caller doesn't pay undue clock-check overhead, while an
+	// infrequent caller still checks often enough. NB: we choose the 2^k-1
+	// form specifically so that the branch doing this check can also detect an
+	// imminent counter overflow rather than checking for that separately on
+	// every call.
+	//
+	// Both the call counter and the quantized timestamp are packed into a
+	// single uint32 (yieldchecks): the upper 21 bits store the low bits of the
+	// quantized timestamp and the lower 11 bits store the call counter. Given
+	// that the counter resets to half its range when it saturates (at
+	// 2^11-1 = 2047), this plateaus at a rate of roughly one clock check per
+	// ~1k calls when called in a very tight loop.
+	//
+	// Note: 21 bits gives us ~2M distinct 0.25ms quantized times, wrapping
+	// around once every ~9 minutes. Since we compare exact equality, one would
+	// need to not check the clock at all for ~9 minutes and then check it on
+	// the exact same 0.25ms tick to miss the change. Not checking it at all
+	// for 9 minutes would imply a dramatic reduction in Yield call frequency;
+	// given that frequent calls are what make Yield effective, this is not a
+	// practical concern.
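+	//
+	// As an illustrative sketch of the packing (not additional behavior), the
+	// yieldchecks word is laid out as:
+	//
+	//	bits 31..11: quantized time (nanotime() >> 18, low 21 bits)
+	//	bits 10..0:  calls since the last expensive check
+	//
+	// and the quantize-then-shift below relies on the identity
+	// uint32(t>>18)<<11 == uint32(t>>7) &^ yieldCountMask.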
+	const yieldCountBits, yieldCountMask = 11, (1 << 11) - 1
+	const yieldEpochShift = 18 - yieldCountBits // net right shift, accounting for the 11 bits masked out below.
+	gp.yieldchecks++
+	// When the call counter -- masked out of the low 11 bits -- is of the form
+	// 2^k-1, check the time.
+	if count := gp.yieldchecks & yieldCountMask; count&(count+1) == 0 {
+		// prev is just the quantized last-check time with the counter masked out.
+		prev := gp.yieldchecks &^ yieldCountMask
+		// Rather than shifting down by all 18 bits to quantize and then back up
+		// by 11 to make room for the counter, we shift down by the difference (7)
+		// and let the mask zero out the remaining 11 bits.
+		now := uint32(nanotime()>>yieldEpochShift) &^ yieldCountMask
+		if now != prev {
+			gp.yieldchecks = now
+			// Check the runqs of all Ps; if we find anything in them, park this
+			// g to free up this P to go to findRunnable and try to steal.
+			for i := range allp {
+				// We don't need the extra accuracy (and cost) of runqempty here
+				// either; a racing steal or enqueue will get noticed when we next
+				// call findRunnable or next check whether to yield.
+				if allp[i].runqhead != allp[i].runqtail || allp[i].runnext != 0 {
+					yieldPark()
+					return
+				}
+			}
+
+			// Check netpoll; a ready conn is basically a should-be-runnable
+			// goroutine to which we would yield if it had been readied, but the
+			// lack of idle Ps may mean nobody is checking this as often right
+			// now, so there may be ready conns waiting.
+			if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+				var found bool
+				systemstack(func() {
+					if list, delta := netpoll(0); !list.empty() {
+						injectglist(&list)
+						netpollAdjustWaiters(delta)
+						found = true
+					}
+				})
+				if found {
+					// Since there were no idle Ps when we got here, we can assume
+					// injectglist put the runnable Gs on our local runq, so we can
+					// just goyield to them.
+					goyield()
+				}
+			}
+		} else if count == yieldCountMask {
+			// The counter has saturated; reset the counter bits to half saturation.
+			gp.yieldchecks = prev | (yieldCountMask / 2)
+		}
+	}
+}
+
+// yieldPark parks the current goroutine in a waiting state with reason yield
+// and puts it on the yieldq for findRunnable to retrieve at a later time, when
+// the scheduler determines it has spare capacity. A goroutine parked by Yield
+// is no longer considered runnable and is instead waiting. One could ask how
+// this differs from a "runnable" goroutine waiting to run in a runq -- both
+// are ready to run whenever the scheduler elects to run them -- but the
+// yielded goroutine has specifically opted to *block* until the scheduler next
+// has strictly spare capacity, in contrast to runnable goroutines, which
+// expect to be run as soon as possible, perhaps even at the expense of running
+// other goroutines at that time. Recording this "blocked until something
+// changes" condition in its status -- particularly as it could end up spending
+// significant time waiting here, on the same order as other waiting states
+// like blocking on IO or locks -- better reflects the reality of its state.
+func yieldPark() {
+	checkTimeouts()
+	gopark(yield_put, nil, waitReasonYield, traceBlockPreempted, 1)
+}
+
 // goschedguarded yields the processor like gosched, but also checks
 // for forbidden states and opts out of the yield in those cases.
 //
@@ -3546,6 +3755,23 @@ top:
 		}
 	}
 
+	// Nothing runnable, so check for yielded goroutines parked on the yieldq.
+	if !sched.yieldq.empty() {
+		lock(&sched.lock)
+		bg := sched.yieldq.pop()
+		unlock(&sched.lock)
+		if bg != nil {
+			trace := traceAcquire()
+			casgstatus(bg, _Gwaiting, _Grunnable)
+			if trace.ok() {
+				// Match other ready paths for trace visibility.
+				trace.GoUnpark(bg, 0)
+				traceRelease(trace)
+			}
+			return bg, false, false
+		}
+	}
+
 	// We have nothing to do.
 	//
 	// If we're in the GC mark phase, can safely scan and blacken objects,
@@ -3616,6 +3842,12 @@ top:
 		}
 		return gp, false, false
 	}
+
+	// Re-check the yieldq, this time while holding sched.lock.
+	if !sched.yieldq.empty() {
+		unlock(&sched.lock)
+		goto top
+	}
 	if !mp.spinning && sched.needspinning.Load() == 1 {
 		// See "Delicate dance" comment below.
 		mp.becomeSpinning()
@@ -7416,6 +7648,20 @@ func (q *gQueue) popList() gList {
 	return stack
 }
 
+// yield_put is the gopark unlock function for Yield. It enqueues the goroutine
+// onto the global yield queue. Returning true keeps the G parked until another
+// part of the scheduler makes it runnable again. The G remains in _Gwaiting
+// after this returns. Nothing else will find or ready this G in the interim:
+// it is not on any runq, and it only becomes visible to the scheduler once it
+// is on the yieldq, where findRunnable will eventually retrieve it.
+//
+//go:nosplit
+func yield_put(gp *g, _ unsafe.Pointer) bool {
+	lock(&sched.lock)
+	sched.yieldq.pushBack(gp)
+	unlock(&sched.lock)
+	return true
+}
+
 // A gList is a list of Gs linked through g.schedlink. A G can only be
 // on one gQueue or gList at a time.
 type gList struct {