Skip to content

Commit de2a409

Browse files
author
CKI KWF Bot
committed
Merge: sched: sync sched_ext code with upstream
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-10/-/merge_requests/1498 JIRA: https://issues.redhat.com/browse/RHEL-111810 Update the sched_ext code with upstream. This is up to circa v6.16. The sched_ext code is fairly contained. It changes quickly still so I want to have the latest code which will benefit people experimenting with it. Signed-off-by: Phil Auld <pauld@redhat.com> Approved-by: Herton R. Krzesinski <herton@redhat.com> Approved-by: Waiman Long <longman@redhat.com> Approved-by: Rafael Aquini <raquini@redhat.com> Approved-by: Felix Maurer <fmaurer@redhat.com> Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com> Merged-by: CKI GitLab Kmaint Pipeline Bot <26919896-cki-kmaint-pipeline-bot@users.noreply.gitlab.com>
2 parents 517121f + be632d8 commit de2a409

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+3830
-1795
lines changed

MAINTAINERS

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20707,8 +20707,7 @@ S: Maintained
2070720707
W: https://github.com/sched-ext/scx
2070820708
T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
2070920709
F: include/linux/sched/ext.h
20710-
F: kernel/sched/ext.h
20711-
F: kernel/sched/ext.c
20710+
F: kernel/sched/ext*
2071220711
F: tools/sched_ext/
2071320712
F: tools/testing/selftests/sched_ext
2071420713

include/linux/nodemask.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@
9494
#include <linux/bitmap.h>
9595
#include <linux/minmax.h>
9696
#include <linux/nodemask_types.h>
97-
#include <linux/numa.h>
9897
#include <linux/random.h>
9998

10099
extern nodemask_t _unused_nodemask_arg_;
@@ -191,6 +190,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s
191190
bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
192191
}
193192

193+
#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
194+
static __always_inline void __nodes_copy(nodemask_t *dstp,
195+
const nodemask_t *srcp, unsigned int nbits)
196+
{
197+
bitmap_copy(dstp->bits, srcp->bits, nbits);
198+
}
199+
194200
#define nodes_complement(dst, src) \
195201
__nodes_complement(&(dst), &(src), MAX_NUMNODES)
196202
static __always_inline void __nodes_complement(nodemask_t *dstp,

include/linux/nodemask_types.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,16 @@
33
#define __LINUX_NODEMASK_TYPES_H
44

55
#include <linux/bitops.h>
6-
#include <linux/numa.h>
6+
7+
#ifdef CONFIG_NODES_SHIFT
8+
#define NODES_SHIFT CONFIG_NODES_SHIFT
9+
#else
10+
#define NODES_SHIFT 0
11+
#endif
12+
13+
#define MAX_NUMNODES (1 << NODES_SHIFT)
14+
15+
#define NUMA_NO_NODE (-1)
716

817
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
918

include/linux/numa.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,8 @@
33
#define _LINUX_NUMA_H
44
#include <linux/init.h>
55
#include <linux/types.h>
6+
#include <linux/nodemask.h>
67

7-
#ifdef CONFIG_NODES_SHIFT
8-
#define NODES_SHIFT CONFIG_NODES_SHIFT
9-
#else
10-
#define NODES_SHIFT 0
11-
#endif
12-
13-
#define MAX_NUMNODES (1 << NODES_SHIFT)
14-
15-
#define NUMA_NO_NODE (-1)
168
#define NUMA_NO_MEMBLK (-1)
179

1810
static inline bool numa_valid_node(int nid)
@@ -39,6 +31,8 @@ void __init alloc_offline_node_data(int nid);
3931
/* Generic implementation available */
4032
int numa_nearest_node(int node, unsigned int state);
4133

34+
int nearest_node_nodemask(int node, nodemask_t *mask);
35+
4236
#ifndef memory_add_physaddr_to_nid
4337
int memory_add_physaddr_to_nid(u64 start);
4438
#endif
@@ -55,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state)
5549
return NUMA_NO_NODE;
5650
}
5751

52+
static inline int nearest_node_nodemask(int node, nodemask_t *mask)
53+
{
54+
return NUMA_NO_NODE;
55+
}
56+
5857
static inline int memory_add_physaddr_to_nid(u64 start)
5958
{
6059
return 0;

include/linux/sched/ext.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ struct sched_ext_entity {
146146
u32 weight;
147147
s32 sticky_cpu;
148148
s32 holding_cpu;
149+
s32 selected_cpu;
149150
u32 kf_mask; /* see scx_kf_mask above */
150151
struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
151152
atomic_long_t ops_state;
@@ -163,7 +164,7 @@ struct sched_ext_entity {
163164

164165
/*
165166
* Runtime budget in nsecs. This is usually set through
166-
* scx_bpf_dispatch() but can also be modified directly by the BPF
167+
* scx_bpf_dsq_insert() but can also be modified directly by the BPF
167168
* scheduler. Automatically decreased by SCX as the task executes. On
168169
* depletion, a scheduling event is triggered.
169170
*
@@ -175,10 +176,10 @@ struct sched_ext_entity {
175176

176177
/*
177178
* Used to order tasks when dispatching to the vtime-ordered priority
178-
* queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
179-
* but can also be modified directly by the BPF scheduler. Modifying it
180-
* while a task is queued on a dsq may mangle the ordering and is not
181-
* recommended.
179+
* queue of a dsq. This is usually set through
180+
* scx_bpf_dsq_insert_vtime() but can also be modified directly by the
181+
* BPF scheduler. Modifying it while a task is queued on a dsq may
182+
* mangle the ordering and is not recommended.
182183
*/
183184
u64 dsq_vtime;
184185

@@ -205,12 +206,22 @@ struct sched_ext_entity {
205206
void sched_ext_free(struct task_struct *p);
206207
void print_scx_info(const char *log_lvl, struct task_struct *p);
207208
void scx_softlockup(u32 dur_s);
209+
bool scx_rcu_cpu_stall(void);
208210

209211
#else /* !CONFIG_SCHED_CLASS_EXT */
210212

211213
static inline void sched_ext_free(struct task_struct *p) {}
212214
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
213215
static inline void scx_softlockup(u32 dur_s) {}
216+
static inline bool scx_rcu_cpu_stall(void) { return false; }
214217

215218
#endif /* CONFIG_SCHED_CLASS_EXT */
219+
220+
struct scx_task_group {
221+
#ifdef CONFIG_EXT_GROUP_SCHED
222+
u32 flags; /* SCX_TG_* */
223+
u32 weight;
224+
#endif
225+
};
226+
216227
#endif /* _LINUX_SCHED_EXT_H */

include/linux/topology.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
261261
}
262262
#endif /* CONFIG_NUMA */
263263

264+
/**
265+
* for_each_node_numadist() - iterate over nodes in increasing distance
266+
* order, starting from a given node
267+
* @node: the iteration variable and the starting node.
268+
* @unvisited: a nodemask to keep track of the unvisited nodes.
269+
*
270+
* This macro iterates over NUMA node IDs in increasing distance from the
271+
* starting @node and yields MAX_NUMNODES when all the nodes have been
272+
* visited.
273+
*
274+
* Note that by the time the loop completes, the @unvisited nodemask will
275+
* be fully cleared, unless the loop exits early.
276+
*
277+
* The difference between for_each_node() and for_each_node_numadist() is
278+
* that the former allows to iterate over nodes in numerical order, whereas
279+
* the latter iterates over nodes in increasing order of distance.
280+
*
281+
* This complexity of this iterator is O(N^2), where N represents the
282+
* number of nodes, as each iteration involves scanning all nodes to
283+
* find the one with the shortest distance.
284+
*
285+
* Requires rcu_lock to be held.
286+
*/
287+
#define for_each_node_numadist(node, unvisited) \
288+
for (int __start = (node), \
289+
(node) = nearest_node_nodemask((__start), &(unvisited)); \
290+
(node) < MAX_NUMNODES; \
291+
node_clear((node), (unvisited)), \
292+
(node) = nearest_node_nodemask((__start), &(unvisited)))
293+
264294
/**
265295
* for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
266296
* from a given node.

include/trace/events/sched_ext.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,25 @@ TRACE_EVENT(sched_ext_dump,
2626
)
2727
);
2828

29+
TRACE_EVENT(sched_ext_event,
30+
TP_PROTO(const char *name, __s64 delta),
31+
TP_ARGS(name, delta),
32+
33+
TP_STRUCT__entry(
34+
__string(name, name)
35+
__field( __s64, delta )
36+
),
37+
38+
TP_fast_assign(
39+
__assign_str(name);
40+
__entry->delta = delta;
41+
),
42+
43+
TP_printk("name %s delta %lld",
44+
__get_str(name), __entry->delta
45+
)
46+
);
47+
2948
#endif /* _TRACE_SCHED_EXT_H */
3049

3150
/* This part must be outside protection */

kernel/rcu/tree_stall.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,13 @@ static void panic_on_rcu_stall(void)
142142
{
143143
static int cpu_stall;
144144

145+
/*
146+
* Attempt to kick out the BPF scheduler if it's installed and defer
147+
* the panic to give the system a chance to recover.
148+
*/
149+
if (scx_rcu_cpu_stall())
150+
return;
151+
145152
if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
146153
return;
147154

kernel/sched/build_policy.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959

6060
#ifdef CONFIG_SCHED_CLASS_EXT
6161
# include "ext.c"
62+
# include "ext_idle.c"
6263
#endif
6364

6465
#include "syscalls.c"

kernel/sched/core.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3890,13 +3890,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
38903890

38913891
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
38923892
{
3893-
/*
3894-
* The BPF scheduler may depend on select_task_rq() being invoked during
3895-
* wakeups. In addition, @p may end up executing on a different CPU
3896-
* regardless of what happens in the wakeup path making the ttwu_queue
3897-
* optimization less meaningful. Skip if on SCX.
3898-
*/
3899-
if (task_on_scx(p))
3893+
/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
3894+
if (!scx_allow_ttwu_queue(p))
39003895
return false;
39013896

39023897
#ifdef CONFIG_SMP
@@ -8442,7 +8437,7 @@ void __init sched_init(void)
84428437
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
84438438
#endif /* CONFIG_FAIR_GROUP_SCHED */
84448439
#ifdef CONFIG_EXT_GROUP_SCHED
8445-
root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
8440+
scx_tg_init(&root_task_group);
84468441
#endif /* CONFIG_EXT_GROUP_SCHED */
84478442
#ifdef CONFIG_RT_GROUP_SCHED
84488443
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -8877,7 +8872,7 @@ struct task_group *sched_create_group(struct task_group *parent)
88778872
if (!alloc_rt_sched_group(tg, parent))
88788873
goto err;
88798874

8880-
scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
8875+
scx_tg_init(tg);
88818876
alloc_uclamp_sched_group(tg, parent);
88828877

88838878
return tg;
@@ -9286,7 +9281,7 @@ static unsigned long tg_weight(struct task_group *tg)
92869281
#ifdef CONFIG_FAIR_GROUP_SCHED
92879282
return scale_load_down(tg->shares);
92889283
#else
9289-
return sched_weight_from_cgroup(tg->scx_weight);
9284+
return sched_weight_from_cgroup(tg->scx.weight);
92909285
#endif
92919286
}
92929287

0 commit comments

Comments
 (0)