From ccca67268157830c550abd70ceb1df5bb4db27d0 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Fri, 13 Mar 2026 14:45:38 +0000 Subject: [PATCH 001/115] arm_mpam: Ensure in_reset_state is false after applying configuration The per-RIS flag, in_reset_state, indicates whether or not the MSC registers are in reset state, and allows avoiding resetting when they are already in reset state. However, when mpam_apply_config() updates the configuration it doesn't update the in_reset_state flag and so even after the configuration update in_reset_state can be true and mpam_reset_ris() will skip the actual register restoration on subsequent resets. Once resctrl has a MPAM backend it will use resctrl_arch_reset_all_ctrls() to reset the MSC configuration on unmount and, if the in_reset_state flag is bogusly true, fail to reset the MSC configuration. The resulting non-reset MSC configuration can lead to persistent performance restrictions even after resctrl is unmounted. Fix by clearing in_reset_state to false immediately after successful configuration application, ensuring that the next reset operation properly restores MSC register defaults. 
Fixes: 09b89d2a72f3 ("arm_mpam: Allow configuration to be applied and restored during cpu online") Signed-off-by: Zeng Heng Acked-by: Ben Horgan [Horgan: rewrite commit message to not be specific to resctrl unmount] Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse (cherry picked from commit f91e913355f49c878fc77f995fd71b7800352bd2) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0666be6b0e88d..3c7e69de753ef 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2694,6 +2694,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, srcu_read_lock_held(&mpam_srcu)) { arg.ris = ris; mpam_touch_msc(msc, __write_config, &arg); + ris->in_reset_state = false; } mutex_unlock(&msc->cfg_lock); } From ef8bca84b71ce604a60e8f2e7c2b4682672ab703 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:39 +0000 Subject: [PATCH 002/115] arm_mpam: Reset when feature configuration bit unset To indicate that the configuration, of the controls used by resctrl, in a RIS need resetting to driver defaults the reset flags in mpam_config are set. However, these flags are only ever set temporarily at RIS scope in mpam_reset_ris() and hence mpam_cpu_online() will never reset these controls to default. As the hardware reset is unknown this leads to unknown configuration when the control values haven't been configured away from the defaults. Use the policy that an unset feature configuration bit means reset. In this way the mpam_config in the component can encode that it should be in reset state and mpam_reprogram_msc() will reset controls as needed. 
Fixes: 09b89d2a72f3 ("arm_mpam: Allow configuration to be applied and restored during cpu online") Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: James Morse Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick [ morse: Removed unused reset flags from config structure ] Signed-off-by: James Morse (cherry picked from commit a1cb6577f575ba5ec2583caf4f791a86754dbf69) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 40 ++++++++++----------------- drivers/resctrl/mpam_internal.h | 4 ---- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 3c7e69de753ef..740d99dc847eb 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1364,17 +1364,15 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, __mpam_intpart_sel(ris->ris_idx, partid, msc); } - if (mpam_has_feature(mpam_feat_cpor_part, rprops) && - mpam_has_feature(mpam_feat_cpor_part, cfg)) { - if (cfg->reset_cpbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); - else + if (mpam_has_feature(mpam_feat_cpor_part, rprops)) { + if (mpam_has_feature(mpam_feat_cpor_part, cfg)) mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + else + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); } - if (mpam_has_feature(mpam_feat_mbw_part, rprops) && - mpam_has_feature(mpam_feat_mbw_part, cfg)) { - if (cfg->reset_mbw_pbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); - else + if (mpam_has_feature(mpam_feat_mbw_part, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_part, cfg)) mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); + else + mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); @@ -1384,16 +1382,14 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_has_feature(mpam_feat_mbw_min, cfg)) mpam_write_partsel_reg(msc, MBW_MIN, 0); - if (mpam_has_feature(mpam_feat_mbw_max, rprops) && - mpam_has_feature(mpam_feat_mbw_max, cfg)) { - if 
(cfg->reset_mbw_max) - mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); - else + if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + else + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); } - if (mpam_has_feature(mpam_feat_mbw_prop, rprops) && - mpam_has_feature(mpam_feat_mbw_prop, cfg)) + if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) mpam_write_partsel_reg(msc, MBW_PROP, 0); if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) @@ -1493,16 +1489,6 @@ static int mpam_save_mbwu_state(void *arg) return 0; } -static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) -{ - *reset_cfg = (struct mpam_config) { - .reset_cpbm = true, - .reset_mbw_pbm = true, - .reset_mbw_max = true, - }; - bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); -} - /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. @@ -1510,14 +1496,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) static int mpam_reset_ris(void *arg) { u16 partid, partid_max; - struct mpam_config reset_cfg; + struct mpam_config reset_cfg = {}; struct mpam_msc_ris *ris = arg; if (ris->in_reset_state) return 0; - mpam_init_reset_cfg(&reset_cfg); - spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e8971842b124f..7af762c98efc4 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -266,10 +266,6 @@ struct mpam_config { u32 mbw_pbm; u16 mbw_max; - bool reset_cpbm; - bool reset_mbw_pbm; - bool reset_mbw_max; - struct mpam_garbage garbage; }; From 362574f34511a6d8b1aa3c9c38cc9f1d1707e05f Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:40 +0000 Subject: [PATCH 003/115] arm64/sysreg: Add MPAMSM_EL1 register The MPAMSM_EL1 register determines the 
MPAM configuration for an SMCU. Add the register definition. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 29fa1be82b83f87e603ed4c21fe86c6e05fd0282) Signed-off-by: Fenghua Yu --- arch/arm64/tools/sysreg | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 9d1c211080571..1287cb1de6f3c 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -5172,6 +5172,14 @@ Field 31:16 PARTID_D Field 15:0 PARTID_I EndSysreg +Sysreg MPAMSM_EL1 3 0 10 5 3 +Res0 63:48 +Field 47:40 PMG_D +Res0 39:32 +Field 31:16 PARTID_D +Res0 15:0 +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS From b402200378342fc2af0afd80d6e3ea8f62e465a7 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:41 +0000 Subject: [PATCH 004/115] KVM: arm64: Preserve host MPAM configuration when changing traps When KVM enables or disables MPAM traps to EL2 it clears all other bits in MPAM2_EL2. Notably, it clears the partition ids (PARTIDs) and performance monitoring groups (PMGs). Avoid changing these bits in anticipation of adding support for MPAM in the kernel. Otherwise, on a VHE system with the host running at EL2 where MPAM2_EL2 and MPAM1_EL1 access the same register, any attempt to use MPAM to monitor or partition resources for kernel space would be foiled by running a KVM guest. Additionally, MPAM2_EL2.EnMPAMSM is always set to 0 which causes MPAMSM_EL1 to always trap. Keep EnMPAMSM set to 1 when not in a guest so that the kernel can use MPAMSM_EL1. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit eda1cd1f9d29b382a07d757cf8b29f9ee636355f) Signed-off-by: Fenghua Yu --- arch/arm64/kvm/hyp/include/hyp/switch.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 2597e8bda8672..0b50ddd530f3e 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -267,7 +267,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) { - u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + u64 clr = MPAM2_EL2_EnMPAMSM; + u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; if (!system_supports_mpam()) return; @@ -277,18 +278,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); } else { /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ - r |= MPAM2_EL2_TIDR; + set |= MPAM2_EL2_TIDR; } - write_sysreg_s(r, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); } static inline void __deactivate_traps_mpam(void) { + u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR; + u64 set = MPAM2_EL2_EnMPAMSM; + if (!system_supports_mpam()) return; - write_sysreg_s(0, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); if (system_supports_mpam_hcr()) write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); From 825d05c6bce9098ade6c12e9adf9053d928a13a5 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:42 +0000 Subject: [PATCH 005/115] KVM: arm64: Make MPAMSM_EL1 
accesses UNDEF The MPAMSM_EL1 register controls the MPAM labeling for an SMCU, Streaming Mode Compute Unit. As there is no MPAM support in KVM, make sure MPAMSM_EL1 accesses trigger an UNDEF. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Marc Zyngier Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 2e7c684bdb50cfaf98da80ebaab4a961fdcd1aa2) Signed-off-by: Fenghua Yu --- arch/arm64/kvm/sys_regs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1b4cacb6e918a..0edd655934a97 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3376,6 +3376,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MPAM1_EL1), undef_access }, { SYS_DESC(SYS_MPAM0_EL1), undef_access }, + { SYS_DESC(SYS_MPAMSM_EL1), undef_access }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, From 95aeaa4ec701f4f4058e7aa261c47ac24489ff80 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:43 +0000 Subject: [PATCH 006/115] arm64: mpam: Context switch the MPAM registers MPAM allows traffic in the SoC to be labeled by the OS, these labels are used to apply policy in caches and bandwidth regulators, and to monitor traffic in the SoC. The label is made up of a PARTID and PMG value. The x86 equivalent calls these CLOSID and RMID, but they don't map precisely. MPAM has two CPU system registers that is used to hold the PARTID and PMG values that traffic generated at each exception level will use. These can be set per-task by the resctrl file system. (resctrl is the defacto interface for controlling this stuff). Add a helper to switch this. 
struct task_struct's separate CLOSID and RMID fields are insufficient to implement resctrl using MPAM, as resctrl can change the PARTID (CLOSID) and PMG (sort of like the RMID) separately. On x86, the rmid is an independent number, so a race that writes a mismatched closid and rmid into hardware is benign. On arm64, the pmg bits extend the partid. (i.e. partid-5 has a pmg-0 that is not the same as partid-6's pmg-0). In this case, mismatching the values will 'dirty' a pmg value that resctrl believes is clean, and is not tracking with its 'limbo' code. To avoid this, the partid and pmg are always read and written as a pair. This requires a new u64 field. In struct task_struct there are two u32, rmid and closid for the x86 case, but as we can't use them here do something else. Add this new field, mpam_partid_pmg, to struct thread_info to avoid adding more architecture specific code to struct task_struct. Always use READ_ONCE()/WRITE_ONCE() when accessing this field. Resctrl allows a per-cpu 'default' value to be set, this overrides the values when scheduling a task in the default control-group, which has PARTID 0. The way 'code data prioritisation' gets emulated means the register value for the default group needs to be a variable. The current system register value is kept in a per-cpu variable to avoid writing to the system register if the value isn't going to change. Writes to this register may reset the hardware state for regulating bandwidth. Finally, there is no reason to context switch these registers unless there is a driver changing the values in struct task_struct. Hide the whole thing behind a static key. This also allows the driver to disable MPAM in response to errors reported by hardware. Move the existing static key to belong to the arch code, as in the future the MPAM driver may become a loadable module. All this should depend on whether there is an MPAM driver, hide it behind CONFIG_ARM64_MPAM. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick CC: Amit Singh Tomar Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 8e06d04ff1cf764066c62e5677bfb0b0c1d1fbbc) Signed-off-by: Fenghua Yu --- arch/arm64/Kconfig | 2 + arch/arm64/include/asm/mpam.h | 67 ++++++++++++++++++++++++++++ arch/arm64/include/asm/thread_info.h | 3 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/mpam.c | 13 ++++++ arch/arm64/kernel/process.c | 7 +++ drivers/resctrl/mpam_devices.c | 2 - drivers/resctrl/mpam_internal.h | 4 +- 8 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/mpam.h create mode 100644 arch/arm64/kernel/mpam.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d4b97eebf9965..f04c272757993 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2040,6 +2040,8 @@ config ARM64_MPAM MPAM is exposed to user-space via the resctrl pseudo filesystem. + This option enables the extra context switch code. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h new file mode 100644 index 0000000000000..0747e0526927d --- /dev/null +++ b/arch/arm64/include/asm/mpam.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __ASM__MPAM_H +#define __ASM__MPAM_H + +#include +#include +#include + +#include + +DECLARE_STATIC_KEY_FALSE(mpam_enabled); +DECLARE_PER_CPU(u64, arm64_mpam_default); +DECLARE_PER_CPU(u64, arm64_mpam_current); + +/* + * The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group. + * This is used by the context switch code to use the resctrl CPU property + * instead. 
The value is modified when CDP is enabled/disabled by mounting + * the resctrl filesystem. + */ +extern u64 arm64_mpam_global_default; + +/* + * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, + * which may race with reads in mpam_thread_switch(). Ensure only one of the old + * or new values are used. Particular care should be taken with the pmg field as + * mpam_thread_switch() may read a partid and pmg that don't match, causing this + * value to be stored with cache allocations, despite being considered 'free' by + * resctrl. + */ +#ifdef CONFIG_ARM64_MPAM +static inline u64 mpam_get_regval(struct task_struct *tsk) +{ + return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); +} + +static inline void mpam_thread_switch(struct task_struct *tsk) +{ + u64 oldregval; + int cpu = smp_processor_id(); + u64 regval = mpam_get_regval(tsk); + + if (!static_branch_likely(&mpam_enabled)) + return; + + if (regval == READ_ONCE(arm64_mpam_global_default)) + regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu)); + + oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + if (oldregval == regval) + return; + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); + + WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval); +} +#else +static inline void mpam_thread_switch(struct task_struct *tsk) {} +#endif /* CONFIG_ARM64_MPAM */ + +#endif /* __ASM__MPAM_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 7942478e40658..5d7fe3e153c85 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,9 @@ struct thread_info { #ifdef CONFIG_SHADOW_CALL_STACK void *scs_base; void *scs_sp; +#endif +#ifdef CONFIG_ARM64_MPAM + u64 mpam_partid_pmg; #endif u32 cpu; }; diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 
fe627100d1990..74b76bb704523 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MPAM) += mpam.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c new file mode 100644 index 0000000000000..9866d2ca0faa9 --- /dev/null +++ b/arch/arm64/kernel/mpam.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Arm Ltd. */ + +#include + +#include +#include + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); +DEFINE_PER_CPU(u64, arm64_mpam_default); +DEFINE_PER_CPU(u64, arm64_mpam_current); + +u64 arm64_mpam_global_default; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 489554931231e..47698955fa1e4 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -738,6 +739,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + /* + * MPAM thread switch happens after the DSB to ensure prev's accesses + * use prev's MPAM settings. + */ + mpam_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 740d99dc847eb..ae0562a7ce218 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,8 +29,6 @@ #include "mpam_internal.h" -DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ - /* * mpam_list_lock protects the SRCU lists when writing. 
Once the * mpam_enabled key is enabled these lists are read-only, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 7af762c98efc4..a13fb9880cede 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -16,12 +16,12 @@ #include #include +#include + #define MPAM_MSC_MAX_NUM_RIS 16 struct platform_device; -DECLARE_STATIC_KEY_FALSE(mpam_enabled); - #ifdef CONFIG_MPAM_KUNIT_TEST #define PACKED_FOR_KUNIT __packed #else From 6d0c883bbff2c68a1943572df2bba51fbd38668a Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:44 +0000 Subject: [PATCH 007/115] arm64: mpam: Re-initialise MPAM regs when CPU comes online Now that the MPAM system registers are expected to have values that change, reprogram them based on the previous value when a CPU is brought online. Previously MPAM's 'default PARTID' of 0 was always used for MPAM in kernel-space as this is the PARTID that hardware guarantees to reset. Because there are a limited number of PARTID, this value is exposed to user-space, meaning resctrl changes to the resctrl default group would also affect kernel threads. Instead, use the task's PARTID value for kernel work on behalf of user-space too. The default of 0 is kept for both user-space and kernel-space when MPAM is not enabled. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 87b78a5d70e83d4dbe31e1afda2be736a3330b31) Signed-off-by: Fenghua Yu --- arch/arm64/kernel/cpufeature.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 32c2dbcc0c641..18d7555ea98bc 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include @@ -2501,13 +2502,17 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { - /* - * Access by the kernel (at EL1) should use the reserved PARTID - * which is configured unrestricted. This avoids priority-inversion - * where latency sensitive tasks have to wait for a task that has - * been throttled to release the lock. - */ - write_sysreg_s(0, SYS_MPAM1_EL1); + int cpu = smp_processor_id(); + u64 regval = 0; + + if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled)) + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); } static bool From b427b9fa82c7ca15967b0fea6543469d1c886853 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:45 +0000 Subject: [PATCH 008/115] arm64: mpam: Drop the CONFIG_EXPERT restriction In anticipation of MPAM being useful remove the CONFIG_EXPERT restriction. 
This was done to prevent the driver being enabled before the user-space interface was wired up. Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Ben Horgan [ morse: Added second paragraph ] Signed-off-by: James Morse (cherry picked from commit c544f00a473239835d22e7109b403314d8b85974) Signed-off-by: Fenghua Yu --- arch/arm64/Kconfig | 2 +- drivers/resctrl/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f04c272757993..00d79552a3c11 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2017,7 +2017,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" - select ARM64_MPAM_DRIVER if EXPERT # does nothing yet + select ARM64_MPAM_DRIVER select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) is an diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index c808e04703946..c34e059c6e41f 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,6 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" - depends on ARM64 && ARM64_MPAM && EXPERT + depends on ARM64 && ARM64_MPAM help Memory System Resource Partitioning and Monitoring (MPAM) driver for System IP, e.g. caches and memory controllers. From e10ca7a8e03717e370644f203af742ee0e831ad7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:46 +0000 Subject: [PATCH 009/115] arm64: mpam: Advertise the CPUs MPAM limits to the driver Requesters need to populate the MPAM fields for any traffic they send on the interconnect. For the CPUs these values are taken from the corresponding MPAMy_ELx register. Each requester may have a limit on the largest PARTID or PMG value that can be used. 
The MPAM driver has to determine the system-wide minimum supported PARTID and PMG values. To do this, the driver needs to be told what each requestor's limit is. CPUs are special, but this infrastructure is also needed for the SMMU and GIC ITS. Call the helper to tell the MPAM driver what the CPUs can do. The return value can be ignored by the arch code as it runs well before the MPAM driver starts probing. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan [ morse: requestor->requester as argued by ispell ] Signed-off-by: James Morse (cherry picked from commit 831a7f16728c5ceef04ab99a699c3d9e519dc4b8) Signed-off-by: Fenghua Yu --- arch/arm64/kernel/mpam.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 9866d2ca0faa9..e6feff2324acb 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -3,6 +3,7 @@ #include +#include #include #include @@ -11,3 +12,14 @@ DEFINE_PER_CPU(u64, arm64_mpam_default); DEFINE_PER_CPU(u64, arm64_mpam_current); u64 arm64_mpam_global_default; + +static int __init arm64_mpam_register_cpus(void) +{ + u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); + u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + + return mpam_register_requestor(partid_max, pmg_max); +} +/* Must occur before mpam_msc_driver_init() from subsys_initcall() */ +arch_initcall(arm64_mpam_register_cpus) From 3379accac5f238c59edf9fa43505eb566040865a Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:47 +0000 Subject: [PATCH 010/115] arm64: mpam: Add cpu_pm notifier to restore MPAM sysregs The MPAM system registers will be lost if 
the CPU is reset during PSCI's CPU_SUSPEND. Add a PM notifier to restore them. mpam_thread_switch(current) can't be used as this won't make any changes if the in-memory copy says the register already has the correct value. In reality the system register is UNKNOWN out of reset. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 735dad999905dfd246be1994bb8d203063aeb0d6) Signed-off-by: Fenghua Yu --- arch/arm64/kernel/mpam.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index e6feff2324acb..48ec0ffd59997 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,12 +14,44 @@ DEFINE_PER_CPU(u64, arm64_mpam_current); u64 arm64_mpam_global_default; +static int mpam_pm_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + u64 regval; + int cpu = smp_processor_id(); + + switch (cmd) { + case CPU_PM_EXIT: + /* + * Don't use mpam_thread_switch() as the system register + * value has changed under our feet. 
+ */ + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + isb(); + + write_sysreg_s(regval, SYS_MPAM0_EL1); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block mpam_pm_nb = { + .notifier_call = mpam_pm_notifier, +}; + static int __init arm64_mpam_register_cpus(void) { u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + if (!system_supports_mpam()) + return 0; + + cpu_pm_register_notifier(&mpam_pm_nb); return mpam_register_requestor(partid_max, pmg_max); } /* Must occur before mpam_msc_driver_init() from subsys_initcall() */ From 6a0490ffc03c9c7db3e0ab456ba26e9895ad03de Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:48 +0000 Subject: [PATCH 011/115] arm64: mpam: Initialise and context switch the MPAMSM_EL1 register The MPAMSM_EL1 sets the MPAM labels, PMG and PARTID, for loads and stores generated by a shared SMCU. Disable the traps so the kernel can use it and set it to the same configuration as the per-EL cpu MPAM configuration. If an SMCU is not shared with other cpus then it is implementation defined whether the configuration from MPAMSM_EL1 is used or that from the appropriate MPAMy_ELx. As we set the same, PMG_D and PARTID_D, configuration for MPAM0_EL1, MPAM1_EL1 and MPAMSM_EL1 the resulting configuration is the same regardless. The range of valid configurations for the PARTID and PMG in MPAMSM_EL1 is not currently specified in Arm Architectural Reference Manual but the architect has confirmed that it is intended to be the same as that for the cpu configuration in the MPAMy_ELx registers. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Reviewed-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 37fe0f984d9ca60e8d95fc9a85d37f4300159625) Signed-off-by: Fenghua Yu --- arch/arm64/include/asm/el2_setup.h | 3 ++- arch/arm64/include/asm/mpam.h | 2 ++ arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/mpam.c | 4 ++++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 85f4c1615472d..4d15071a4f3fc 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -513,7 +513,8 @@ check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 .Linit_mpam_\@: - msr_s SYS_MPAM2_EL2, xzr // use the default partition + mov x0, #MPAM2_EL2_EnMPAMSM_MASK + msr_s SYS_MPAM2_EL2, x0 // use the default partition, // and disable lower traps mrs_s x0, SYS_MPAMIDR_EL1 tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 0747e0526927d..6bccbfdccb87e 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -53,6 +53,8 @@ static inline void mpam_thread_switch(struct task_struct *tsk) return; write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); isb(); /* Synchronising the EL0 write is left until the ERET to EL0 */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 18d7555ea98bc..f57c2ff98326b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2509,6 +2509,8 
@@ cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (cpus_have_cap(ARM64_SME)) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); isb(); /* Synchronising the EL0 write is left until the ERET to EL0 */ diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c index 48ec0ffd59997..3a490de4fa125 100644 --- a/arch/arm64/kernel/mpam.c +++ b/arch/arm64/kernel/mpam.c @@ -28,6 +28,10 @@ static int mpam_pm_notifier(struct notifier_block *self, */ regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) { + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), + SYS_MPAMSM_EL1); + } isb(); write_sysreg_s(regval, SYS_MPAM0_EL1); From 7d4f7f9a7c6485986732bca9d98ad72c78255271 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:49 +0000 Subject: [PATCH 012/115] arm64: mpam: Add helpers to change a task or cpu's MPAM PARTID/PMG values Care must be taken when modifying the PARTID and PMG of a task in any per-task structure as writing these values may race with the task being scheduled in, and reading the modified values. Add helpers to set the task properties, and the CPU default value. These use WRITE_ONCE() that pairs with the READ_ONCE() in mpam_get_regval() to avoid causing torn values. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Cc: Dave Martin Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 2cf9ca3fae38b7894e7f1435cec92f9a679b42f9) Signed-off-by: Fenghua Yu --- arch/arm64/include/asm/mpam.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 6bccbfdccb87e..05aa71200f61a 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,7 @@ #ifndef __ASM__MPAM_H #define __ASM__MPAM_H +#include #include #include #include @@ -22,6 +23,23 @@ DECLARE_PER_CPU(u64, arm64_mpam_current); */ extern u64 arm64_mpam_global_default; +#ifdef CONFIG_ARM64_MPAM +static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i) +{ + return FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) | + FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i) | + FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d) | + FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i); +} + +static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val); +} + /* * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, * which may race with reads in mpam_thread_switch(). Ensure only one of the old @@ -30,12 +48,20 @@ extern u64 arm64_mpam_global_default; * value to be stored with cache allocations, despite being considered 'free' by * resctrl. 
*/ -#ifdef CONFIG_ARM64_MPAM static inline u64 mpam_get_regval(struct task_struct *tsk) { return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); } +static inline void mpam_set_task_partid_pmg(struct task_struct *tsk, + u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval); +} + static inline void mpam_thread_switch(struct task_struct *tsk) { u64 oldregval; From 244511f0bb33b791c2dffcd1cd248643ef3dfd31 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:51 +0000 Subject: [PATCH 013/115] arm_mpam: resctrl: Add boilerplate cpuhp and domain allocation resctrl has its own data structures to describe its resources. We can't use these directly as we play tricks with the 'MBA' resource, picking the MPAM controls or monitors that best apply. We may export the same component as both L3 and MBA. Add mpam_resctrl_res[] as the array of class->resctrl mappings we are exporting, and add the cpuhp hooks that allocated and free the resctrl domain structures. Only the mpam control feature are considered here and monitor support will be added later. While we're here, plumb in a few other obvious things. CONFIG_ARM_CPU_RESCTRL is used to allow this code to be built even though it can't yet be linked against resctrl. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 09e61daf8e96b9bdb04dd112bdecf9382fd3f919) Signed-off-by: Fenghua Yu --- drivers/resctrl/Makefile | 1 + drivers/resctrl/mpam_devices.c | 12 ++ drivers/resctrl/mpam_internal.h | 21 +++ drivers/resctrl/mpam_resctrl.c | 324 ++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 3 + 5 files changed, 361 insertions(+) create mode 100644 drivers/resctrl/mpam_resctrl.c diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 898199dcf80d5..40beaf999582c 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o +mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index ae0562a7ce218..e35acf8c25d93 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1614,6 +1614,9 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } + if (mpam_is_enabled()) + return mpam_resctrl_online_cpu(cpu); + return 0; } @@ -1657,6 +1660,9 @@ static int mpam_cpu_offline(unsigned int cpu) { struct mpam_msc *msc; + if (mpam_is_enabled()) + mpam_resctrl_offline_cpu(cpu); + guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, srcu_read_lock_held(&mpam_srcu)) { @@ -2502,6 +2508,12 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + if (!err) { + err = mpam_resctrl_setup(); + if (err) + pr_err("Failed to initialise resctrl: %d\n", err); + } + if (err) { mpam_disable_reason = "Failed to enable."; 
schedule_work(&mpam_broken_work); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index a13fb9880cede..43c8e0f5f7ac5 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -333,6 +334,16 @@ struct mpam_msc_ris { struct mpam_garbage garbage; }; +struct mpam_resctrl_dom { + struct mpam_component *ctrl_comp; + struct rdt_ctrl_domain resctrl_ctrl_dom; +}; + +struct mpam_resctrl_res { + struct mpam_class *class; + struct rdt_resource resctrl_res; +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -387,6 +398,16 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +#ifdef CONFIG_RESCTRL_FS +int mpam_resctrl_setup(void); +int mpam_resctrl_online_cpu(unsigned int cpu); +void mpam_resctrl_offline_cpu(unsigned int cpu); +#else +static inline int mpam_resctrl_setup(void) { return 0; } +static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +#endif /* CONFIG_RESCTRL_FS */ + /* * MPAM MSCs have the following register layout. See: * Arm Memory System Resource Partitioning and Monitoring (MPAM) System diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c new file mode 100644 index 0000000000000..9a30709704142 --- /dev/null +++ b/drivers/resctrl/mpam_resctrl.c @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. 
+ +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_internal.h" + +/* + * The classes we've picked to map to resctrl resources, wrapped + * in with their resctrl structure. + * Class pointer may be NULL. + */ +static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; + +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + +/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ +static DEFINE_MUTEX(domain_list_lock); + +bool resctrl_arch_alloc_capable(void) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + for_each_mpam_resctrl_control(res, rid) { + if (res->resctrl_res.alloc_capable) + return true; + } + + return false; +} + +/* + * MSC may raise an error interrupt if it sees an out or range partid/pmg, + * and go on to truncate the value. Regardless of what the hardware supports, + * only the system wide safe value is safe to use. + */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) +{ + return mpam_partid_max + 1; +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &mpam_resctrl_controls[l].resctrl_res; +} + +static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) +{ + /* TODO: initialise the resctrl resources */ + + return 0; +} + +static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + if (class->type == MPAM_CLASS_CACHE) + return comp->comp_id; + + /* TODO: repaint domain ids to match the L3 domain ids */ + /* Otherwise, expose the ID used by the firmware table code. 
*/ + return comp->comp_id; +} + +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + enum resctrl_res_level rid, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + INIT_LIST_HEAD(&hdr->list); + hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); + hdr->rid = rid; + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +static void mpam_resctrl_online_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +/** + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. + * @hdr: The domain's header. + * + * Removes @cpu from the header mask. If this was the last CPU in the domain, + * the domain header is removed from its parent list and true is returned, + * indicating the parent structure can be freed. + * If there are other CPUs in the domain, returns false. + */ +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_held(&domain_list_lock); + + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (cpumask_empty(&hdr->cpu_mask)) { + list_del_rcu(&hdr->list); + synchronize_rcu(); + return true; + } + + return false; +} + +static void mpam_resctrl_domain_insert(struct list_head *list, + struct rdt_domain_hdr *new) +{ + struct rdt_domain_hdr *err; + struct list_head *pos = NULL; + + lockdep_assert_held(&domain_list_lock); + + err = resctrl_find_domain(list, new->id, &pos); + if (WARN_ON_ONCE(err)) + return; + + list_add_tail_rcu(&new->list, pos); +} + +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +{ + int err; + struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + struct mpam_class *class = res->class; + struct mpam_component *comp_iter, *ctrl_comp; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_held(&domain_list_lock); + + ctrl_comp = NULL; + 
guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + + /* class has no component for this CPU */ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!dom) + return ERR_PTR(-ENOMEM); + + if (r->alloc_capable) { + dom->ctrl_comp = ctrl_comp; + + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); + ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; + err = resctrl_online_ctrl_domain(r, ctrl_d); + if (err) + goto free_domain; + + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); + } else { + pr_debug("Skipped control domain online - no controls\n"); + } + return dom; + +free_domain: + kfree(dom); + dom = ERR_PTR(err); + + return dom; +} + +static struct mpam_resctrl_dom * +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) + return dom; + } + + return NULL; +} + +int mpam_resctrl_online_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy_resource; + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res); + if (IS_ERR(dom)) + return PTR_ERR(dom); + } else { + if (r->alloc_capable) { + struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; + + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + } + } + + 
resctrl_online_cpu(cpu); + + return 0; +} + +void mpam_resctrl_offline_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + resctrl_offline_cpu(cpu); + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy resource + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (WARN_ON_ONCE(!dom)) + continue; + + if (r->alloc_capable) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } else { + ctrl_dom_empty = true; + } + + if (ctrl_dom_empty) + kfree(dom); + } +} + +int mpam_resctrl_setup(void) +{ + int err = 0; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + cpus_read_lock(); + for_each_mpam_resctrl_control(res, rid) { + INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + res->resctrl_res.rid = rid; + } + + /* TODO: pick MPAM classes to map to resctrl resources */ + + /* Initialise the resctrl structures from the classes */ + for_each_mpam_resctrl_control(res, rid) { + if (!res->class) + continue; // dummy resource + + err = mpam_resctrl_control_init(res); + if (err) { + pr_debug("Failed to initialise rid %u\n", rid); + break; + } + } + cpus_read_unlock(); + + if (err) { + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; + } + + if (!resctrl_arch_alloc_capable()) { + pr_debug("No alloc(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable()); + return -EOPNOTSUPP; + } + + /* TODO: call resctrl_init() */ + + return 0; +} diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7f00c5285a326..2c7d1413a401f 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -49,6 +49,9 @@ static inline int 
mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, } #endif +bool resctrl_arch_alloc_capable(void); +bool resctrl_arch_mon_capable(void); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From c8bf60c4c8ee7dd37204817d381e7a01b9ddf70b Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:52 +0000 Subject: [PATCH 014/115] arm_mpam: resctrl: Pick the caches we will use as resctrl resources Systems with MPAM support may have a variety of control types at any point of their system layout. We can only expose certain types of control, and only if they exist at particular locations. Start with the well-known caches. These have to be depth 2 or 3 and support MPAM's cache portion bitmap controls, with a number of portions fewer than resctrl's limit. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 52a4edb16121d07734e4e392767d26d286f08c35) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 91 +++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9a30709704142..65bb670dc3fb1 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -65,9 +65,95 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static bool cache_has_usable_cpor(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_cpor_part, cprops)) + return false; + + /* resctrl uses u32 for all bitmap configurations */ + return 
class->props.cpbm_wd <= 32; +} + +/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ +static void mpam_resctrl_pick_caches(void) +{ + struct mpam_class *class; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + if (class->type != MPAM_CLASS_CACHE) { + pr_debug("class %u is not a cache\n", class->level); + continue; + } + + if (class->level != 2 && class->level != 3) { + pr_debug("class %u is not L2 or L3\n", class->level); + continue; + } + + if (!cache_has_usable_cpor(class)) { + pr_debug("class %u cache misses CPOR\n", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level, + cpumask_pr_args(&class->affinity), + cpumask_pr_args(cpu_possible_mask)); + continue; + } + + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { - /* TODO: initialise the resctrl resources */ + struct mpam_class *class = res->class; + struct rdt_resource *r = &res->resctrl_res; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + r->schema_fmt = RESCTRL_SCHEMA_BITMAP; + r->cache.arch_has_sparse_bitmasks = true; + + r->cache.cbm_len = class->props.cpbm_wd; + /* mpam_devices will reject empty bitmaps */ + r->cache.min_cbm_bits = 1; + + if (r->rid == RDT_RESOURCE_L2) { + r->name = "L2"; + r->ctrl_scope = RESCTRL_L2_CACHE; + r->cdp_capable = true; + } else { + r->name = "L3"; + r->ctrl_scope = RESCTRL_L3_CACHE; + r->cdp_capable = true; + } + + /* + * Which bits are shared with other ...things... Unknown + * devices use partid-0 which uses all the bitmap fields. 
Until + * we have configured the SMMU and GIC not to do this 'all the + * bits' is the correct answer here. + */ + r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->alloc_capable = true; + break; + default: + return -EINVAL; + } return 0; } @@ -292,7 +378,8 @@ int mpam_resctrl_setup(void) res->resctrl_res.rid = rid; } - /* TODO: pick MPAM classes to map to resctrl resources */ + /* Find some classes to use for controls */ + mpam_resctrl_pick_caches(); /* Initialise the resctrl structures from the classes */ for_each_mpam_resctrl_control(res, rid) { From 81c270c0b08819e3e28435a4bb1ecd49f6741e95 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:53 +0000 Subject: [PATCH 015/115] arm_mpam: resctrl: Implement resctrl_arch_reset_all_ctrls() We already have a helper for resetting an mpam class and component. Hook it up to resctrl_arch_reset_all_ctrls() and the domain offline path. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Shaopeng Tan Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 370d166d878d0c0aa06568d67387a1151a200501) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 2 +- drivers/resctrl/mpam_internal.h | 3 +++ drivers/resctrl/mpam_resctrl.c | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e35acf8c25d93..90751729e49be 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2553,7 +2553,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp) } } -static void mpam_reset_class_locked(struct mpam_class *class) +void mpam_reset_class_locked(struct mpam_class *class) { struct mpam_component *comp; diff --git 
a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 43c8e0f5f7ac5..f063a741aaba2 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -388,6 +388,9 @@ extern u8 mpam_pmg_max; void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); +/* Reset all the RIS in a class under cpus_read_lock() */ +void mpam_reset_class_locked(struct mpam_class *class); + int mpam_apply_config(struct mpam_component *comp, u16 partid, struct mpam_config *cfg); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 65bb670dc3fb1..b2217d11561d8 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -170,6 +170,19 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) +{ + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + mpam_reset_class_locked(res->class); +} + static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, enum resctrl_res_level rid, struct rdt_domain_hdr *hdr) From b9dc7e3b30b8026ccc786d783341c62d0a14b6a3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:54 +0000 Subject: [PATCH 016/115] arm_mpam: resctrl: Add resctrl_arch_get_config() Implement resctrl_arch_get_config() by testing the live configuration for a CPOR bitmap. For any other configuration type return the default. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 02cc661687886563a0e08ecee51c5ef7d1737237) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 43 ++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index b2217d11561d8..3af57b6f2c1b5 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -170,6 +170,49 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + u32 partid; + struct mpam_config *cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + enum mpam_device_features configured_by; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return resctrl_get_default_ctrl(r); + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, type); + cfg = &dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + configured_by = mpam_feat_cpor_part; + break; + default: + return resctrl_get_default_ctrl(r); + } + + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || + !mpam_has_feature(configured_by, cfg)) + return resctrl_get_default_ctrl(r); + + switch (configured_by) { + case mpam_feat_cpor_part: + return cfg->cpbm; + default: + return resctrl_get_default_ctrl(r); + } +} + void resctrl_arch_reset_all_ctrls(struct 
rdt_resource *r) { struct mpam_resctrl_res *res; From 39a8356bc7c7e55137f14e11daee8106bd803794 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:55 +0000 Subject: [PATCH 017/115] arm_mpam: resctrl: Implement helpers to update configuration resctrl has two helpers for updating the configuration. resctrl_arch_update_one() updates a single value, and is used by the software-controller to apply feedback to the bandwidth controls, it has to be called on one of the CPUs in the resctrl:domain. resctrl_arch_update_domains() copies multiple staged configurations, it can be called from anywhere. Both helpers should update any changes to the underlying hardware. Implement resctrl_arch_update_domains() to use resctrl_arch_update_one(). Neither need to be called on a specific CPU as the mpam driver will send IPIs as needed. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 9cd2b522be2cc64fab179d75537d2e8df38d26a6) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 3af57b6f2c1b5..ea60777934ffd 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -213,6 +213,76 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, } } +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + u32 partid; + struct mpam_config cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + + lockdep_assert_cpus_held(); + 
lockdep_assert_irqs_enabled(); + + /* + * No need to check the CPU as mpam_apply_config() doesn't care, and + * resctrl_arch_update_domains() relies on this. + */ + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + partid = resctrl_get_config_index(closid, t); + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { + pr_debug("Not alloc capable or computed PARTID out of range\n"); + return -EINVAL; + } + + /* + * Copy the current config to avoid clearing other resources when the + * same component is exposed multiple times through resctrl. + */ + cfg = dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + cfg.cpbm = cfg_val; + mpam_set_feature(mpam_feat_cpor_part, &cfg); + break; + default: + return -EINVAL; + } + + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + int err; + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { + for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { + struct resctrl_staged_config *cfg = &d->staged_config[t]; + + if (!cfg->have_new_ctrl) + continue; + + err = resctrl_arch_update_one(r, d, closid, t, + cfg->new_ctrl); + if (err) + return err; + } + } + + return 0; +} + void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct mpam_resctrl_res *res; From 465853344c658488162880a6047be479b4082573 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:56 +0000 Subject: [PATCH 018/115] arm_mpam: resctrl: Add plumbing against arm64 task and cpu hooks arm64 provides helpers for changing a task's and a cpu's mpam partid/pmg values. These are used to back a number of resctrl_arch_ functions. Connect them up. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 9d2e1a99fae58ce992f147bdf83b5d9089f70b27) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 58 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 2 files changed, 63 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index ea60777934ffd..9cde5b7e644cc 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,8 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); +static bool cdp_enabled; + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -57,6 +60,61 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +void resctrl_arch_sched_in(struct task_struct *tsk) +{ + lockdep_assert_preemption_disabled(); + + mpam_thread_switch(tsk); +} + +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid); + } else { + /* + * When CDP is enabled, resctrl halves the closid range and we + * use odd/even partid for one closid. 
+ */ + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid); + } +} + +void resctrl_arch_sync_cpu_closid_rmid(void *info) +{ + struct resctrl_cpu_defaults *r = info; + + lockdep_assert_preemption_disabled(); + + if (r) { + resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(), + r->closid, r->rmid); + } + + resctrl_arch_sched_in(current); +} + +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid); + } else { + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid); + } +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 2c7d1413a401f..5a78299ec464b 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -52,6 +52,11 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); +void resctrl_arch_sched_in(struct task_struct *tsk); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. 
From 7c3b4648da125bfc2ab1d4e1a2c1dc8003df1684 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:45:57 +0000 Subject: [PATCH 019/115] arm_mpam: resctrl: Add CDP emulation Intel RDT's CDP feature allows the cache to use a different control value depending on whether the access was for instruction fetch or a data access. MPAM's equivalent feature is the other way up: the CPU assigns a different partid label to traffic depending on whether it was instruction fetch or a data access, which causes the cache to use a different control value based solely on the partid. MPAM can emulate CDP, with the side effect that the alternative partid is seen by all MSC; it can't be enabled per-MSC. Add the resctrl hooks to turn this on or off. Add the helpers that match a closid against a task, which need to be aware that the value written to hardware is not the same as the one resctrl is using. Update the 'arm64_mpam_global_default' variable the arch code uses during context switch to know when the per-cpu value should be used instead. Also, update these per-cpu values and sync the resulting mpam partid/pmg configuration to hardware. resctrl can enable CDP for L2 caches, L3 caches or both. When it is enabled by one and not the other, MPAM globally enables CDP but hides the effect on the other cache resource. This hiding is possible as CPOR is the only supported cache control and that uses a resource bitmap; two partids with the same bitmap act as one. Awkwardly, the MB controls don't implement CDP and CDP can't be hidden as the memory bandwidth control is a maximum per partid which can't be modelled with more partids. If the total maximum is used for both the data and instruction partids then the maximum may be exceeded and if it is split in two then the one using more bandwidth will hit a lower limit. Hence, hide the MB controls completely if CDP is enabled for any resource. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Cc: Dave Martin Cc: Amit Singh Tomar Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 6789fb99282c0a8e8e84701b7edf456f4a9e71e2) Signed-off-by: Fenghua Yu --- arch/arm64/include/asm/mpam.h | 1 + drivers/resctrl/mpam_internal.h | 1 + drivers/resctrl/mpam_resctrl.c | 122 ++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 2 + 4 files changed, 126 insertions(+) diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 05aa71200f61a..70d396e7b6da8 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,7 @@ #ifndef __ASM__MPAM_H #define __ASM__MPAM_H +#include #include #include #include diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index f063a741aaba2..2751eeaba302d 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -342,6 +342,7 @@ struct mpam_resctrl_dom { struct mpam_resctrl_res { struct mpam_class *class; struct rdt_resource resctrl_res; + bool cdp_enabled; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9cde5b7e644cc..2111542f485e1 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -35,6 +35,10 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); +/* + * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. + * This applies globally to all traffic the CPU generates. 
+ */ static bool cdp_enabled; bool resctrl_arch_alloc_capable(void) @@ -50,6 +54,74 @@ bool resctrl_arch_alloc_capable(void) return false; } +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) +{ + return mpam_resctrl_controls[rid].cdp_enabled; +} + +/** + * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks. + * + * At boot, all existing tasks use partid zero for D and I. + * To enable/disable CDP emulation, all these tasks need relabelling. + */ +static void resctrl_reset_task_closids(void) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + } + read_unlock(&tasklist_lock); +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) +{ + u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + int cpu; + + /* + * resctrl_arch_set_cdp_enabled() is only called with enable set to + * false on error and unmount. + */ + cdp_enabled = enable; + mpam_resctrl_controls[rid].cdp_enabled = enable; + + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. 
*/ + if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; + + if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled && + mpam_resctrl_controls[RDT_RESOURCE_MBA].class) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true; + + if (enable) { + if (mpam_partid_max < 1) + return -EINVAL; + + partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA); + partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE); + } + + mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0); + WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current)); + + resctrl_reset_task_closids(); + + for_each_possible_cpu(cpu) + mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0); + on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1); + + return 0; +} + +static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid) +{ + return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid); +} + /* * MSC may raise an error interrupt if it sees an out or range partid/pmg, * and go on to truncate the value. 
Regardless of what the hardware supports, @@ -115,6 +187,30 @@ void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) } } +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return tsk_closid == closid; +} + +/* The task's pmg is not unique, the partid must be considered too */ +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return (tsk_closid == closid) && (tsk_rmid == rmid); +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) @@ -247,6 +343,14 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); cprops = &res->class->props; + /* + * When CDP is enabled, but the resource doesn't support it, + * the control is cloned across both partids. 
+ * Pick one at random to read: + */ + if (mpam_resctrl_hide_cdp(r->rid)) + type = CDP_DATA; + partid = resctrl_get_config_index(closid, type); cfg = &dom->ctrl_comp->cfg[partid]; @@ -274,6 +378,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { + int err; u32 partid; struct mpam_config cfg; struct mpam_props *cprops; @@ -291,6 +396,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); cprops = &res->class->props; + if (mpam_resctrl_hide_cdp(r->rid)) + t = CDP_DATA; + partid = resctrl_get_config_index(closid, t); if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { pr_debug("Not alloc capable or computed PARTID out of range\n"); @@ -313,6 +421,20 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, return -EINVAL; } + /* + * When CDP is enabled, but the resource doesn't support it, we need to + * apply the same configuration to the other partid. 
+ */ + if (mpam_resctrl_hide_cdp(r->rid)) { + partid = resctrl_get_config_index(closid, CDP_CODE); + err = mpam_apply_config(dom->ctrl_comp, partid, &cfg); + if (err) + return err; + + partid = resctrl_get_config_index(closid, CDP_DATA); + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + } + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); } diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 5a78299ec464b..d329b1dc148ba 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -56,6 +56,8 @@ void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); /** * mpam_register_requestor() - Register a requestor with the MPAM driver From baecc1eb0abe6cedaf40d0b69694c7c06927d28b Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:45:58 +0000 Subject: [PATCH 020/115] arm_mpam: resctrl: Hide CDP emulation behind CONFIG_EXPERT When CDP is not enabled, the 'rmid_entry's in the limbo list, rmid_busy_llc, map directly to a (PARTID,PMG) pair and when CDP is enabled the mapping is to two different pairs. As the limbo list is reused between mounts and CDP disabled on unmount this can lead to stale mapping and the limbo handler will then make monitor reads with potentially out of range PARTID. This may then cause an MPAM error interrupt and the driver will disable MPAM. No problems are expected if you just mount the resctrl file system once with CDP enabled and never unmount it. Hide CDP emulation behind CONFIG_EXPERT to protect the unwary. 
Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Zeng Heng Reviewed-by: James Morse Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse (cherry picked from commit 01a0021f6c39557037bfc41ede7230a0696677ff) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 2111542f485e1..2331e6ddb814b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -82,6 +82,18 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; int cpu; + if (!IS_ENABLED(CONFIG_EXPERT) && enable) { + /* + * If the resctrl fs is mounted more than once, sequentially, + * then CDP can lead to the use of out of range PARTIDs. + */ + pr_warn("CDP not supported\n"); + return -EOPNOTSUPP; + } + + if (enable) + pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n"); + /* * resctrl_arch_set_cdp_enabled() is only called with enable set to * false on error and unmount. From 33377dda2fe906de69c4e6398e91ec6512bec184 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 13 Mar 2026 14:45:59 +0000 Subject: [PATCH 021/115] arm_mpam: resctrl: Convert to/from MPAMs fixed-point formats MPAM uses a fixed-point formats for some hardware controls. Resctrl provides the bandwidth controls as a percentage. Add helpers to convert between these. Ensure bwa_wd is at most 16 to make it clear higher values have no meaning. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Dave Martin Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 80d147d293130ee3c8a395cbbea1813e26ab9a1b) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 7 +++++ drivers/resctrl/mpam_resctrl.c | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 90751729e49be..506deba05b40c 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -713,6 +713,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_mbw_part, props); props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + + /* + * The BWA_WD field can represent 0-63, but the control fields it + * describes have a maximum of 16 bits. + */ + props->bwa_wd = min(props->bwa_wd, 16); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) mpam_set_feature(mpam_feat_mbw_max, props); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 2331e6ddb814b..240a06df2f079 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -242,6 +243,56 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +/* + * Each fixed-point hardware value architecturally represents a range + * of values: the full range 0% - 100% is split contiguously into + * (1 << cprops->bwa_wd) equal bands. + * + * Although the bwa_bwd fields have 6 bits the maximum valid value is 16 + * as it reports the width of fields that are at most 16 bits. 
When + * fewer than 16 bits are valid the least significant bits are + * ignored. The implied binary point is kept between bits 15 and 16 and + * so the valid bits are leftmost. + * + * See ARM IHI0099B.a "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format" for more information. + * + * Find the nearest percentage value to the upper bound of the selected band: + */ +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + u32 val = mbw_max; + + val >>= 16 - cprops->bwa_wd; + val += 1; + val *= MAX_MBA_BW; + val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + + return val; +} + +/* + * Find the band whose upper bound is closest to the specified percentage. + * + * A round-to-nearest policy is followed here as a balanced compromise + * between unexpected under-commit of the resource (where the total of + * a set of resource allocations after conversion is less than the + * expected total, due to rounding of the individual converted + * percentages) and over-commit (where the total of the converted + * allocations is greater than expected). + */ +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + u32 val = pc; + + val <<= cprops->bwa_wd; + val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); + val = max(val, 1) - 1; + val <<= 16 - cprops->bwa_wd; + + return val; +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ static void mpam_resctrl_pick_caches(void) { From 7b58c9757e136f58146bcf715daaabb1b2c9b881 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:00 +0000 Subject: [PATCH 022/115] arm_mpam: resctrl: Add rmid index helpers Because MPAM's pmg aren't identical to RDT's rmid, resctrl handles some data structures by index. This allows x86 to map indexes to RMID, and MPAM to map them to partid-and-pmg. Add the helpers to do this. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Suggested-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 3e9b35823aabcb85cc039960256426e50f1fd601) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 16 ++++++++++++++++ include/linux/arm_mpam.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 240a06df2f079..370830ab11197 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -145,6 +145,22 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) return mpam_partid_max + 1; } +u32 resctrl_arch_system_num_rmid_idx(void) +{ + return (mpam_pmg_max + 1) * (mpam_partid_max + 1); +} + +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid) +{ + return closid * (mpam_pmg_max + 1) + rmid; +} + +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *closid = idx / (mpam_pmg_max + 1); + *rmid = idx % (mpam_pmg_max + 1); +} + void resctrl_arch_sched_in(struct task_struct *tsk) { lockdep_assert_preemption_disabled(); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index d329b1dc148ba..7d23c90f077dc 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -58,6 +58,9 @@ void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); void resctrl_arch_sched_in(struct task_struct *tsk); bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); +u32 resctrl_arch_system_num_rmid_idx(void); /** * mpam_register_requestor() - Register a requestor with 
the MPAM driver From e2da3bf9903f3b6154985f45908fc23dd3395832 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:01 +0000 Subject: [PATCH 023/115] arm_mpam: resctrl: Wait for cacheinfo to be ready In order to calculate the rmid realloc threshold the size of the cache needs to be known. Cache domains will also be named after the cache id. So that this information can be extracted from cacheinfo we need to wait for it to be ready. The cacheinfo information is populated in device_initcall() so we wait for that. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 1c1e2968a860c5af9fca67f1c0e88aab83ace0b3) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 370830ab11197..bf91cff05daf7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -42,6 +43,13 @@ static DEFINE_MUTEX(domain_list_lock); */ static bool cdp_enabled; +/* + * We use cacheinfo to discover the size of the caches and their id. cacheinfo + * populates this from a device_initcall(). mpam_resctrl_setup() must wait. 
+ */ +static bool cacheinfo_ready; +static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -757,6 +765,8 @@ int mpam_resctrl_setup(void) struct mpam_resctrl_res *res; enum resctrl_res_level rid; + wait_event(wait_cacheinfo_ready, cacheinfo_ready); + cpus_read_lock(); for_each_mpam_resctrl_control(res, rid) { INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); @@ -794,3 +804,12 @@ int mpam_resctrl_setup(void) return 0; } + +static int __init __cacheinfo_ready(void) +{ + cacheinfo_ready = true; + wake_up(&wait_cacheinfo_ready); + + return 0; +} +device_initcall_sync(__cacheinfo_ready); From 57d50d780a4a3c47aede57047b1d7e5a62f4bfc9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:02 +0000 Subject: [PATCH 024/115] arm_mpam: resctrl: Add support for 'MB' resource resctrl supports 'MB', as a percentage throttling of traffic from the L3. This is the control that mba_sc uses, so ideally the class chosen should be as close as possible to the counters used for mbm_total. If there is a single L3, it's the last cache, and the topology of the memory matches then the traffic at the memory controller will be equivalent to that at egress of the L3. If these conditions are met allow the memory class to back MB. MB's percentage control should be backed either with the fixed point fraction MBW_MAX or bandwidth portion bitmaps. The bandwidth portion bitmaps is not used as its tricky to pick which bits to use to avoid contention, and may be possible to expose this as something other than a percentage in the future. 
Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Dave Martin Signed-off-by: Dave Martin Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 36528c7681b8093f5f9270d2af7c4326d771f181) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 281 ++++++++++++++++++++++++++++++++- 1 file changed, 280 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index bf91cff05daf7..60d111f7abfd5 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -267,6 +267,33 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +static bool mba_class_use_mbw_max(struct mpam_props *cprops) +{ + return (mpam_has_feature(mpam_feat_mbw_max, cprops) && + cprops->bwa_wd); +} + +static bool class_has_usable_mba(struct mpam_props *cprops) +{ + return mba_class_use_mbw_max(cprops); +} + +/* + * Calculate the worst-case percentage change from each implemented step + * in the control. + */ +static u32 get_mba_granularity(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) + return 0; + + /* + * bwa_wd is the number of bits implemented in the 0.xxx + * fixed point fraction. 1 bit is 50%, 2 is 25% etc. 
+ */ + return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); +} + /* * Each fixed-point hardware value architecturally represents a range * of values: the full range 0% - 100% is split contiguously into @@ -317,6 +344,160 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return val; } +static u32 get_mba_min(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) { + WARN_ON_ONCE(1); + return 0; + } + + return mbw_max_to_percent(0, cprops); +} + +/* Find the L3 cache that has affinity with this CPU */ +static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) +{ + u32 cache_id = get_cpu_cacheinfo_id(cpu, 3); + + lockdep_assert_cpus_held(); + + return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask); +} + +/* + * topology_matches_l3() - Is the provided class the same shape as L3 + * @victim: The class we'd like to pretend is L3. + * + * resctrl expects all the world's a Xeon, and all counters are on the + * L3. We allow some mapping counters on other classes. This requires + * that the CPU->domain mapping is the same kind of shape. + * + * Using cacheinfo directly would make this work even if resctrl can't + * use the L3 - but cacheinfo can't tell us anything about offline CPUs. + * Using the L3 resctrl domain list also depends on CPUs being online. + * Using the mpam_class we picked for L3 so we can use its domain list + * assumes that there are MPAM controls on the L3. + * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id() + * helper which can tell us about offline CPUs ... but getting the cache_id + * to start with relies on at least one CPU per L3 cache being online at + * boot. + * + * Walk the victim component list and compare the affinity mask with the + * corresponding L3. The topology matches if each victim:component's affinity + * mask is the same as the CPU's corresponding L3's. These lists/masks are + * computed from firmware tables so don't change at runtime. 
+ */ +static bool topology_matches_l3(struct mpam_class *victim) +{ + int cpu, err; + struct mpam_component *victim_iter; + + lockdep_assert_cpus_held(); + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) + return false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(victim_iter, &victim->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_empty(&victim_iter->affinity)) { + pr_debug("class %u has CPU-less component %u - can't match L3!\n", + victim->level, victim_iter->comp_id); + return false; + } + + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return false; + + cpumask_clear(tmp_cpumask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3's equivalent component to class %u component %u\n", + victim->level, victim_iter->comp_id); + return false; + } + + /* Any differing bits in the affinity mask? */ + if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) { + pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n" + "L3:%*pbl != victim:%*pbl\n", + victim->level, victim_iter->comp_id, + cpumask_pr_args(tmp_cpumask), + cpumask_pr_args(&victim_iter->affinity)); + + return false; + } + } + + return true; +} + +/* + * Test if the traffic for a class matches that at egress from the L3. For + * MSC at memory controllers this is only possible if there is a single L3 + * as otherwise the counters at the memory can include bandwidth from the + * non-local L3. 
+ */ +static bool traffic_matches_l3(struct mpam_class *class) +{ + int err, cpu; + + lockdep_assert_cpus_held(); + + if (class->type == MPAM_CLASS_CACHE && class->level == 3) + return true; + + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a different cache from L3\n", class->level); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", class->level); + return false; + } + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { + pr_debug("cpumask allocation failed\n"); + return false; + } + + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); + return false; + } + + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { + pr_debug("There is more than one L3\n"); + return false; + } + + /* Be strict; the traffic might stop in the intermediate cache. */ + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { + pr_debug("L3 isn't the last level of cache\n"); + return false; + } + + if (num_possible_nodes() > 1) { + pr_debug("There is more than one numa node\n"); + return false; + } + +#ifdef CONFIG_HMEM_REPORTING + if (node_devices[cpu_to_node(cpu)]->cache_dev) { + pr_debug("There is a memory side cache\n"); + return false; + } +#endif + + return true; +} + /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { @@ -358,9 +539,68 @@ static void mpam_resctrl_pick_caches(void) } } +static void mpam_resctrl_pick_mba(void) +{ + struct mpam_class *class, *candidate_class = NULL; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_props *cprops = &class->props; + + if (class->level != 3 && class->type == MPAM_CLASS_CACHE) { + pr_debug("class %u is a cache but not the L3\n", class->level); + continue; + } + + if (!class_has_usable_mba(cprops)) { + pr_debug("class %u has no bandwidth control\n", + class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs\n", class->level); + continue; + } + + if (!topology_matches_l3(class)) { + pr_debug("class %u topology doesn't match L3\n", + class->level); + continue; + } + + if (!traffic_matches_l3(class)) { + pr_debug("class %u traffic doesn't match L3 egress\n", + class->level); + continue; + } + + /* + * Pick a resource to be MBA that as close as possible to + * the L3. mbm_total counts the bandwidth leaving the L3 + * cache and MBA should correspond as closely as possible + * for proper operation of mba_sc. 
+ */ + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; + } + + if (candidate_class) { + pr_debug("selected class %u to back MBA\n", + candidate_class->level); + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + res->class = candidate_class; + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; + struct mpam_props *cprops = &class->props; struct rdt_resource *r = &res->resctrl_res; switch (r->rid) { @@ -392,6 +632,19 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->cache.shareable_bits = resctrl_get_default_ctrl(r); r->alloc_capable = true; break; + case RDT_RESOURCE_MBA: + r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->ctrl_scope = RESCTRL_L3_CACHE; + + r->membw.delay_linear = true; + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = get_mba_min(cprops); + r->membw.max_bw = MAX_MBA_BW; + r->membw.bw_gran = get_mba_granularity(cprops); + + r->name = "MB"; + r->alloc_capable = true; + break; default: return -EINVAL; } @@ -406,7 +659,17 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; - /* TODO: repaint domain ids to match the L3 domain ids */ + if (topology_matches_l3(class)) { + /* Use the corresponding L3 component ID as the domain ID */ + int id = get_cpu_cacheinfo_id(cpu, 3); + + /* Implies topology_matches_l3() made a mistake */ + if (WARN_ON_ONCE(id == -1)) + return comp->comp_id; + + return id; + } + /* Otherwise, expose the ID used by the firmware table code. 
*/ return comp->comp_id; } @@ -446,6 +709,12 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3: configured_by = mpam_feat_cpor_part; break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + configured_by = mpam_feat_mbw_max; + break; + } + fallthrough; default: return resctrl_get_default_ctrl(r); } @@ -457,6 +726,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, switch (configured_by) { case mpam_feat_cpor_part: return cfg->cpbm; + case mpam_feat_mbw_max: + return mbw_max_to_percent(cfg->mbw_max, cprops); default: return resctrl_get_default_ctrl(r); } @@ -504,6 +775,13 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, cfg.cpbm = cfg_val; mpam_set_feature(mpam_feat_cpor_part, &cfg); break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } + fallthrough; default: return -EINVAL; } @@ -775,6 +1053,7 @@ int mpam_resctrl_setup(void) /* Find some classes to use for controls */ mpam_resctrl_pick_caches(); + mpam_resctrl_pick_mba(); /* Initialise the resctrl structures from the classes */ for_each_mpam_resctrl_control(res, rid) { From aa243ac0f2d4d4830d53f1d0b524433205a6f065 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 13 Mar 2026 14:46:03 +0000 Subject: [PATCH 025/115] arm_mpam: resctrl: Add kunit test for control format conversions resctrl specifies the format of the control schemes, and these don't match the hardware. Some of the conversions are a bit hairy - add some kunit tests. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Dave Martin [morse: squashed enough of Dave's fixes in here that it's his patch now!] Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 5dc8f73eaa5dfccb229b9a25c797720e6379f8e0) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 4 + drivers/resctrl/test_mpam_resctrl.c | 315 ++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 drivers/resctrl/test_mpam_resctrl.c diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 60d111f7abfd5..f8d4666fbaa85 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1092,3 +1092,7 @@ static int __init __cacheinfo_ready(void) return 0; } device_initcall_sync(__cacheinfo_ready); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_resctrl.c" +#endif diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c new file mode 100644 index 0000000000000..b93d6ad87e43f --- /dev/null +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. 
+/* This file is intended to be included into mpam_resctrl.c */ + +#include +#include +#include +#include +#include + +struct percent_value_case { + u8 pc; + u8 width; + u16 value; +}; + +/* + * Mysterious inscriptions taken from the union of ARM DDI 0598D.b, + * "Arm Architecture Reference Manual Supplement - Memory System + * Resource Partitioning and Monitoring (MPAM), for A-profile + * architecture", Section 9.8, "About the fixed-point fractional + * format" (exact percentage entries only) and ARM IHI0099B.a + * "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format": + */ +static const struct percent_value_case percent_value_cases[] = { + /* Architectural cases: */ + { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e }, + { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff }, + { 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 }, + { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 }, + { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 }, + { 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff }, + { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d }, + { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb }, + { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 }, + { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff }, + { 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb }, + { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 }, + { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 }, + { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff }, +}; + +static void test_percent_value_desc(const struct percent_value_case *param, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "pc=%d, width=%d, value=0x%.*x\n", + param->pc, param->width, + DIV_ROUND_UP(param->width, 4), param->value); +} + +KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases, + test_percent_value_desc); + +struct percent_value_test_info { + u32 pc; /* result of value-to-percent conversion */ + u32 value; /* result of percent-to-value 
conversion */ + u32 max_value; /* maximum raw value allowed by test params */ + unsigned int shift; /* promotes raw testcase value to 16 bits */ +}; + +/* + * Convert a reference percentage to a fixed-point MAX value and + * vice-versa, based on param (not test->param_value!) + */ +static void __prepare_percent_value_test(struct kunit *test, + struct percent_value_test_info *res, + const struct percent_value_case *param) +{ + struct mpam_props fake_props = { }; + + /* Reject bogus test parameters that would break the tests: */ + KUNIT_ASSERT_GE(test, param->width, 1); + KUNIT_ASSERT_LE(test, param->width, 16); + KUNIT_ASSERT_LT(test, param->value, 1 << param->width); + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = param->width; + + res->shift = 16 - param->width; + res->max_value = GENMASK_U32(param->width - 1, 0); + res->value = percent_to_mbw_max(param->pc, &fake_props); + res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props); +} + +static void test_get_mba_granularity(struct kunit *test) +{ + int ret; + struct mpam_props fake_props = { }; + + /* Use MBW_MAX */ + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + + fake_props.bwa_wd = 0; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props)); + + fake_props.bwa_wd = 1; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* Architectural maximum: */ + fake_props.bwa_wd = 16; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* No usable control... 
*/ + fake_props.bwa_wd = 0; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); + + fake_props.bwa_wd = 1; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */ + + fake_props.bwa_wd = 2; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */ + + fake_props.bwa_wd = 3; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */ + + fake_props.bwa_wd = 6; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */ + + fake_props.bwa_wd = 7; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */ + + /* Granularity saturates at 1% */ + fake_props.bwa_wd = 16; /* architectural maximum */ + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ +} + +static void test_mbw_max_to_percent(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + /* + * Since the reference values in percent_value_cases[] all + * correspond to exact percentages, round-to-nearest will + * always give the exact percentage back when the MPAM max + * value has precision of 0.5% or finer. (Always true for the + * reference data, since they all specify 8 bits or more of + * precision. 
+ * + * So, keep it simple and demand an exact match: + */ + __prepare_percent_value_test(test, &res, param); + KUNIT_EXPECT_EQ(test, res.pc, param->pc); +} + +static void test_percent_to_mbw_max(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + __prepare_percent_value_test(test, &res, param); + + KUNIT_EXPECT_GE(test, res.value, param->value << res.shift); + KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift); + KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift); + + /* No flexibility allowed for 0% and 100%! */ + + if (param->pc == 0) + KUNIT_EXPECT_EQ(test, res.value, 0); + + if (param->pc == 100) + KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift); +} + +static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t param = (uintptr_t)prev; + + if (param > 15) + return NULL; + + param++; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param); + + return (void *)param; +} + +static unsigned int test_get_bwa_wd(struct kunit *test) +{ + uintptr_t param = (uintptr_t)test->param_value; + + KUNIT_ASSERT_GE(test, param, 1); + KUNIT_ASSERT_LE(test, param, 16); + + return param; +} + +static void test_mbw_max_to_percent_limits(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + u32 max_value; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + max_value = GENMASK(15, 16 - fake_props.bwa_wd); + + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props), + MAX_MBA_BW); + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), + get_mba_min(&fake_props)); + + /* + * Rounding policy dependent 0% sanity-check: + * With round-to-nearest, the minimum mbw_max value really + * should map to 0% if there are at least 200 steps. + * (100 steps may be enough for some other rounding policies.) 
+ */ + if (fake_props.bwa_wd >= 8) + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0); + + if (fake_props.bwa_wd < 8 && + mbw_max_to_percent(0, &fake_props) == 0) + kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?", + fake_props.bwa_wd); +} + +/* + * Check that converting a percentage to mbw_max and back again (or, as + * appropriate, vice-versa) always restores the original value: + */ +static void test_percent_max_roundtrip_stability(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + unsigned int shift; + u32 pc, max, pc2, max2; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + shift = 16 - fake_props.bwa_wd; + + /* + * Converting a valid value from the coarser scale to the finer + * scale and back again must yield the original value: + */ + if (fake_props.bwa_wd >= 7) { + /* More than 100 steps: only test exact pc values: */ + for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) { + max = percent_to_mbw_max(pc, &fake_props); + pc2 = mbw_max_to_percent(max, &fake_props); + KUNIT_EXPECT_EQ(test, pc2, pc); + } + } else { + /* Fewer than 100 steps: only test exact mbw_max values: */ + for (max = 0; max < 1 << 16; max += 1 << shift) { + pc = mbw_max_to_percent(max, &fake_props); + max2 = percent_to_mbw_max(pc, &fake_props); + KUNIT_EXPECT_EQ(test, max2, max); + } + } +} + +static void test_percent_to_max_rounding(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + unsigned int num_rounded_up = 0, total = 0; + struct percent_value_test_info res; + + for (param = percent_value_cases, total = 0; + param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)]; + param++, total++) { + __prepare_percent_value_test(test, &res, param); + if (res.value > param->value << res.shift) + num_rounded_up++; + } + + /* + * The MPAM driver applies a round-to-nearest policy, whereas a + * round-down policy seems to have been applied in the + * 
reference table from which the test vectors were selected. + * + * For a large and well-distributed suite of test vectors, + * about half should be rounded up and half down compared with + * the reference table. The actual test vectors are few in + * number and probably not very well distributed however, so + * tolerate a round-up rate of between 1/4 and 3/4 before + * crying foul: + */ + + kunit_info(test, "Round-up rate: %u%% (%u/%u)\n", + DIV_ROUND_CLOSEST(num_rounded_up * 100, total), + num_rounded_up, total); + + KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total); + KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); +} + +static struct kunit_case mpam_resctrl_test_cases[] = { + KUNIT_CASE(test_get_mba_granularity), + KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_percent_to_max_rounding), + KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, + test_all_bwa_wd_gen_params), + {} +}; + +static struct kunit_suite mpam_resctrl_test_suite = { + .name = "mpam_resctrl_test_suite", + .test_cases = mpam_resctrl_test_cases, +}; + +kunit_test_suites(&mpam_resctrl_test_suite); From 525443bd5c05299c634896bb759558291c2e3217 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:04 +0000 Subject: [PATCH 026/115] arm_mpam: resctrl: Add monitor initialisation and domain boilerplate Add the boilerplate that tells resctrl about the mpam monitors that are available. resctrl expects all (non-telemetry) monitors to be on the L3 and so advertise them there and invent an L3 resctrl resource if required. The L3 cache itself has to exist as the cache ids are used as the domain ids. Bring the resctrl monitor domains online and offline based on the cpus they contain. Support for specific monitor types is left to later. 
Tested-by: Punit Agrawal Reviewed-by: Zeng Heng Reviewed-by: Jonathan Cameron Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse (cherry picked from commit 264c285999fce128fc52743bce582468b26e9f65) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_internal.h | 15 +++ drivers/resctrl/mpam_resctrl.c | 231 ++++++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 11 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 2751eeaba302d..301cf5c151bd9 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -336,7 +336,16 @@ struct mpam_msc_ris { struct mpam_resctrl_dom { struct mpam_component *ctrl_comp; + + /* + * There is no single mon_comp because different events may be backed + * by different class/components. mon_comp is indexed by the event + * number. + */ + struct mpam_component *mon_comp[QOS_NUM_EVENTS]; + struct rdt_ctrl_domain resctrl_ctrl_dom; + struct rdt_l3_mon_domain resctrl_mon_dom; }; struct mpam_resctrl_res { @@ -345,6 +354,12 @@ struct mpam_resctrl_res { bool cdp_enabled; }; +struct mpam_resctrl_mon { + struct mpam_class *class; + + /* per-class data that resctrl needs will live here */ +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index f8d4666fbaa85..e03d0f400993c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -34,6 +34,23 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; rid < RDT_NUM_RESOURCES; \ rid++, res = &mpam_resctrl_controls[rid]) +/* + * The classes we've picked to map to resctrl events. + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This + * array lets us find the actual class backing the event counters. e.g. 
+ * the only memory bandwidth counters may be on the memory controller, but to + * make use of them, we pretend they are on L3. Restrict the events considered + * to those supported by MPAM. + * Class pointer may be NULL. + */ +#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID +static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; + +#define for_each_mpam_resctrl_mon(mon, eventid) \ + for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ + eventid <= MPAM_MAX_EVENT; \ + eventid++, mon = &mpam_resctrl_counters[eventid]) + /* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ static DEFINE_MUTEX(domain_list_lock); @@ -63,6 +80,15 @@ bool resctrl_arch_alloc_capable(void) return false; } +bool resctrl_arch_mon_capable(void) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + /* All monitors are presented as being on the L3 cache */ + return l3->mon_capable; +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { return mpam_resctrl_controls[rid].cdp_enabled; @@ -89,6 +115,8 @@ static void resctrl_reset_task_closids(void) int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) { u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; int cpu; if (!IS_ENABLED(CONFIG_EXPERT) && enable) { @@ -110,6 +138,11 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) cdp_enabled = enable; mpam_resctrl_controls[rid].cdp_enabled = enable; + if (enable) + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2; + else + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. 
*/ if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; @@ -674,6 +707,56 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + /* + * There also needs to be an L3 cache present. + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1) + return 0; + + /* + * If there are no MPAM resources on L3, force it into existence. + * topology_matches_l3() already ensures this looks like the L3. + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init(). + */ + if (!res->class) { + pr_warn_once("Faking L3 MSC to enable counters.\n"); + res->class = mpam_resctrl_counters[type].class; + } + + /* + * Called multiple times!, once per event type that has a + * monitoring class. + * Setting name is necessary on monitor only platforms. + */ + l3->name = "L3"; + l3->mon_scope = RESCTRL_L3_CACHE; + + /* + * num-rmid is the upper bound for the number of monitoring groups that + * can exist simultaneously, including the default monitoring group for + * each control group. Hence, advertise the whole rmid_idx space even + * though each control group has its own pmg/rmid space. Unfortunately, + * this does mean userspace needs to know the architecture to correctly + * interpret this value. 
+ */ + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + + if (resctrl_enable_mon_event(type, false, 0, NULL)) + l3->mon_capable = true; + + return 0; +} + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -901,11 +984,26 @@ static void mpam_resctrl_domain_insert(struct list_head *list, list_add_tail_rcu(&new->list, pos); } +static struct mpam_component *find_component(struct mpam_class *class, int cpu) +{ + struct mpam_component *comp; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp->affinity)) + return comp; + } + + return NULL; +} + static struct mpam_resctrl_dom * mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) { int err; struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; struct mpam_class *class = res->class; struct mpam_component *comp_iter, *ctrl_comp; @@ -945,8 +1043,56 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) } else { pr_debug("Skipped control domain online - no controls\n"); } + + if (r->mon_capable) { + struct mpam_component *any_mon_comp; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + /* + * Even if the monitor domain is backed by a different + * component, the L3 component IDs need to be used... only + * there may be no ctrl_comp for the L3. + * Search each event's class list for a component with + * overlapping CPUs and set up the dom->mon_comp array. 
+ */ + + for_each_mpam_resctrl_mon(mon, eventid) { + struct mpam_component *mon_comp; + + if (!mon->class) + continue; // dummy resource + + mon_comp = find_component(mon->class, cpu); + dom->mon_comp[eventid] = mon_comp; + if (mon_comp) + any_mon_comp = mon_comp; + } + if (!any_mon_comp) { + WARN_ON_ONCE(0); + err = -EFAULT; + goto offline_ctrl_domain; + } + + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); + mon_d->hdr.type = RESCTRL_MON_DOMAIN; + err = resctrl_online_mon_domain(r, &mon_d->hdr); + if (err) + goto offline_ctrl_domain; + + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); + } else { + pr_debug("Skipped monitor domain online - no monitors\n"); + } + return dom; +offline_ctrl_domain: + if (r->alloc_capable) { + mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + resctrl_offline_ctrl_domain(r, ctrl_d); + } free_domain: kfree(dom); dom = ERR_PTR(err); @@ -954,6 +1100,35 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) return dom; } +/* + * We know all the monitors are associated with the L3, even if there are no + * controls and therefore no control component. Find the cache-id for the CPU + * and use that to search for existing resctrl domains. + * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id + * for anything that is not a cache. 
+ */ +static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +{ + int cache_id; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + lockdep_assert_cpus_held(); + + if (!l3->class) + return NULL; + cache_id = get_cpu_cacheinfo_id(cpu, 3); + if (cache_id < 0) + return NULL; + + list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (dom->resctrl_mon_dom.hdr.id == cache_id) + return dom; + } + + return NULL; +} + static struct mpam_resctrl_dom * mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) { @@ -967,7 +1142,11 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return dom; } - return NULL; + if (r->rid != RDT_RESOURCE_L3) + return NULL; + + /* Search the mon domain list too - needed on monitor only platforms. */ + return mpam_resctrl_get_mon_domain_from_cpu(cpu); } int mpam_resctrl_online_cpu(unsigned int cpu) @@ -994,6 +1173,11 @@ int mpam_resctrl_online_cpu(unsigned int cpu) mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); } + if (r->mon_capable) { + struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; + + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } } } @@ -1012,8 +1196,9 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) guard(mutex)(&domain_list_lock); for_each_mpam_resctrl_control(res, rid) { struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - bool ctrl_dom_empty; + bool ctrl_dom_empty, mon_dom_empty; struct rdt_resource *r = &res->resctrl_res; if (!res->class) @@ -1032,7 +1217,16 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) ctrl_dom_empty = true; } - if (ctrl_dom_empty) + if (r->mon_capable) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } else { + mon_dom_empty = true; + } + + if 
(ctrl_dom_empty && mon_dom_empty) kfree(dom); } } @@ -1042,12 +1236,15 @@ int mpam_resctrl_setup(void) int err = 0; struct mpam_resctrl_res *res; enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; wait_event(wait_cacheinfo_ready, cacheinfo_ready); cpus_read_lock(); for_each_mpam_resctrl_control(res, rid) { INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); res->resctrl_res.rid = rid; } @@ -1063,25 +1260,37 @@ int mpam_resctrl_setup(void) err = mpam_resctrl_control_init(res); if (err) { pr_debug("Failed to initialise rid %u\n", rid); - break; + goto internal_error; } } - cpus_read_unlock(); - if (err) { - pr_debug("Internal error %d - resctrl not supported\n", err); - return err; + for_each_mpam_resctrl_mon(mon, eventid) { + if (!mon->class) + continue; // dummy resource + + err = mpam_resctrl_monitor_init(mon, eventid); + if (err) { + pr_debug("Failed to initialise event %u\n", eventid); + goto internal_error; + } } - if (!resctrl_arch_alloc_capable()) { - pr_debug("No alloc(%u) found - resctrl not supported\n", - resctrl_arch_alloc_capable()); + cpus_read_unlock(); + + if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); return -EOPNOTSUPP; } /* TODO: call resctrl_init() */ return 0; + +internal_error: + cpus_read_unlock(); + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; } static int __init __cacheinfo_ready(void) From 2ef3779ca8e8d15ad3360a7805be2d4499f91466 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:05 +0000 Subject: [PATCH 027/115] arm_mpam: resctrl: Add support for csu counters resctrl exposes a counter via a file named llc_occupancy. This isn't really a counter as its value goes up and down, this is a snapshot of the cache storage usage monitor. 
Add some picking code which will only find an L3. The resctrl counter file is called llc_occupancy but we don't check it is the last one as it is already identified as L3. Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Dave Martin Signed-off-by: Dave Martin Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 1458c4f053355f88cc5d190ca02243d2c60fa010) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 83 ++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e03d0f400993c..07bb20a01b383 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -311,6 +311,28 @@ static bool class_has_usable_mba(struct mpam_props *cprops) return mba_class_use_mbw_max(cprops); } +static bool cache_has_usable_csu(struct mpam_class *class) +{ + struct mpam_props *cprops; + + if (!class) + return false; + + cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return false; + + /* + * CSU counters settle on the value, so we can get away with + * having only one. + */ + if (!cprops->num_csu_mon) + return false; + + return true; +} + /* * Calculate the worst-case percentage change from each implemented step * in the control. 
@@ -630,6 +652,64 @@ static void mpam_resctrl_pick_mba(void) } } +static void counter_update_class(enum resctrl_event_id evt_id, + struct mpam_class *class) +{ + struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + + if (existing_class) { + if (class->level == 3) { + pr_debug("Existing class is L3 - L3 wins\n"); + return; + } + + if (existing_class->level < class->level) { + pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n", + existing_class->level, class->level); + return; + } + } + + mpam_resctrl_counters[evt_id].class = class; +} + +static void mpam_resctrl_pick_counters(void) +{ + struct mpam_class *class; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + /* The name of the resource is L3... */ + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a cache but not the L3", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u does not cover all CPUs", + class->level); + continue; + } + + if (cache_has_usable_csu(class)) { + pr_debug("class %u has usable CSU", + class->level); + + /* CSU counters only make sense on a cache. 
 */ + switch (class->type) { + case MPAM_CLASS_CACHE: + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); + break; + default: + break; + } + } + } +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; @@ -1264,6 +1344,9 @@ int mpam_resctrl_setup(void) } } + /* Find some classes to use for monitors */ + mpam_resctrl_pick_counters(); + for_each_mpam_resctrl_mon(mon, eventid) { if (!mon->class) continue; // dummy resource From 658399c34655d8671c0b648b4cbf158156daa97d Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:06 +0000 Subject: [PATCH 028/115] arm_mpam: resctrl: Allow resctrl to allocate monitors When resctrl wants to read a domain's 'QOS_L3_OCCUP', it needs to allocate a monitor on the corresponding resource. Monitors are allocated by class instead of component. Add helpers to allocate a CSU monitor. These helpers return an out of range value for MBM counters. Allocating a monitor context is expected to block until hardware resources become available. This only makes sense for QOS_L3_OCCUP as unallocated MBM counters are losing data.
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 2a3c79c61539779a09928893518c8286d7774b54) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_internal.h | 14 ++++++- drivers/resctrl/mpam_resctrl.c | 67 +++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 301cf5c151bd9..85b2b99263601 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -29,6 +29,14 @@ struct platform_device; #define PACKED_FOR_KUNIT #endif +/* + * This 'mon' values must not alias an actual monitor, so must be larger than + * U16_MAX, but not be confused with an errno value, so smaller than + * (u32)-SZ_4K. + * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor. + */ +#define USE_PRE_ALLOCATED (U16_MAX + 1) + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -216,7 +224,11 @@ enum mon_filter_options { }; struct mon_cfg { - u16 mon; + /* + * mon must be large enough to hold out of range values like + * USE_PRE_ALLOCATED + */ + u32 mon; u8 pmg; bool match_pmg; bool csu_exclude_clean; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 07bb20a01b383..9682ffb151846 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -22,6 +22,8 @@ #include "mpam_internal.h" +DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); + /* * The classes we've picked to map to resctrl resources, wrapped * in with their resctrl structure. 
@@ -289,6 +291,71 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) return &mpam_resctrl_controls[l].resctrl_res; } +static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return -EINVAL; + + switch (evtid) { + case QOS_L3_OCCUP_EVENT_ID: + /* With CDP, one monitor gets used for both code/data reads */ + return mpam_alloc_csu_mon(mon->class); + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + return USE_PRE_ALLOCATED; + default: + return -EOPNOTSUPP; + } +} + +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + DEFINE_WAIT(wait); + int *ret; + + ret = kmalloc_obj(*ret); + if (!ret) + return ERR_PTR(-ENOMEM); + + do { + prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, + TASK_INTERRUPTIBLE); + *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid); + if (*ret == -ENOSPC) + schedule(); + } while (*ret == -ENOSPC && !signal_pending(current)); + finish_wait(&resctrl_mon_ctx_waiters, &wait); + + return ret; +} + +static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, + u32 mon_idx) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->class) + return; + + if (evtid == QOS_L3_OCCUP_EVENT_ID) + mpam_free_csu_mon(mon->class, mon_idx); + + wake_up(&resctrl_mon_ctx_waiters); +} + +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, void *arch_mon_ctx) +{ + u32 mon_idx = *(u32 *)arch_mon_ctx; + + kfree(arch_mon_ctx); + + resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7d23c90f077dc..e1461e32af756 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include 
#include struct mpam_msc; @@ -62,6 +63,10 @@ u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); u32 resctrl_arch_system_num_rmid_idx(void); +struct rdt_resource; +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. From 864fa5794d2422ac2a847cdb8b268a77470b0084 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:07 +0000 Subject: [PATCH 029/115] arm_mpam: resctrl: Add resctrl_arch_rmid_read() resctrl uses resctrl_arch_rmid_read() to read counters. CDP emulation means the counter may need reading in three different ways. The helpers behind the resctrl_arch_ functions will be re-used for the ABMC equivalent functions. Add the rounding helper for checking monitor values while we're here. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Jesse Chick Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit fb56b29932ca276df268806ad52ed80f40f99a6e) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 82 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 5 +++ 2 files changed, 87 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9682ffb151846..9a15ddd340f73 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -356,6 +356,88 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); } +static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +{ + struct mon_cfg cfg; + + if (!mpam_is_enabled()) + return -EINVAL; + + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); + + if (irqs_disabled()) { + /* Check if we can access this domain without an IPI */ + return -EIO; + } + + cfg = (struct mon_cfg) { + .mon = mon_idx, + .match_pmg = true, + .partid = closid, + .pmg = rmid, + }; + + return mpam_msmon_read(mon_comp, &cfg, mon_type, val); +} + +static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, u32 closid, u32 rmid, u64 *val) +{ + if (cdp_enabled) { + u64 code_val = 0, data_val = 0; + int err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_CODE, closid, rmid, &code_val); + if (err) + return err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_DATA, closid, rmid, &data_val); + if (err) + return err; + + *val += code_val + data_val; 
+ return 0; + } + + return __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_NONE, closid, rmid, val); +} + +/* MBWU when not in ABMC mode (not supported), and CSU counters. */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, + u32 closid, u32 rmid, enum resctrl_event_id eventid, + void *arch_priv, u64 *val, void *arch_mon_ctx) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + u32 mon_idx = *(u32 *)arch_mon_ctx; + enum mpam_device_features mon_type; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + resctrl_arch_rmid_read_context_check(); + + if (eventid >= QOS_NUM_EVENTS || !mon->class) + return -EINVAL; + + l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); + mon_comp = l3_dom->mon_comp[eventid]; + + if (eventid != QOS_L3_OCCUP_EVENT_ID) + return -EINVAL; + + mon_type = mpam_feat_msmon_csu; + + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, + closid, rmid, val); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index e1461e32af756..86d5e326d2bd3 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,11 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. 
From 812ea7b36935afd0ff1c177ac4fff3ca78551341 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:08 +0000 Subject: [PATCH 030/115] arm_mpam: resctrl: Update the rmid reallocation limit resctrl's limbo code needs to be told when the data left in a cache is small enough for the partid+pmg value to be re-allocated. x86 uses the cache size divided by the number of rmid users the cache may have. Do the same, but for the smallest cache, and with the number of partid-and-pmg users. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 49b04e401825431529e866470d8d2dcd8e9ef058) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 9a15ddd340f73..f82fff3519df4 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -438,6 +438,42 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, closid, rmid, val); } +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. + */ +static int update_rmid_limits(struct mpam_class *class) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + struct mpam_props *cprops = &class->props; + struct cacheinfo *ci; + + lockdep_assert_cpus_held(); + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return 0; + + /* + * Assume cache levels are the same size for all CPUs... + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. 
+ */ + ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); + if (!ci || ci->size == 0) { + pr_debug("Could not read cache size for class %u\n", + class->level); + return -EINVAL; + } + + if (!resctrl_rmid_realloc_limit || + ci->size < resctrl_rmid_realloc_limit) { + resctrl_rmid_realloc_limit = ci->size; + resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + } + + return 0; +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -850,6 +886,9 @@ static void mpam_resctrl_pick_counters(void) /* CSU counters only make sense on a cache. */ switch (class->type) { case MPAM_CLASS_CACHE: + if (update_rmid_limits(class)) + break; + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); break; default: From 581305c7e9cb6d9a4001806df60b66fb4ee17113 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:09 +0000 Subject: [PATCH 031/115] arm_mpam: resctrl: Add empty definitions for assorted resctrl functions A few resctrl features and hooks need to be provided, but aren't needed or supported on MPAM platforms. resctrl has individual hooks to separately enable and disable the closid/partid and rmid/pmg context switching code. For MPAM this is all the same thing, as the value in struct task_struct is used to cache the value that should be written to hardware. arm64's context switching code is enabled once MPAM is usable, but doesn't touch the hardware unless the value has changed. For now event configuration is not supported, and can be turned off by returning 'false' from resctrl_arch_is_evt_configurable(). The new io_alloc feature is not supported either, always return false from the enable helper to indicate and fail the enable. Add this, and empty definitions for the other hooks. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit efc775eadce2c6e0921c21d9c29a7b6686022281) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 65 ++++++++++++++++++++++++++++++++++ include/linux/arm_mpam.h | 9 +++++ 2 files changed, 74 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index f82fff3519df4..777ecdc2d0f85 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -91,6 +91,71 @@ bool resctrl_arch_mon_capable(void) return l3->mon_capable; } +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return false; +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + return 
-EINVAL; +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return false; +} + +void resctrl_arch_pre_mount(void) +{ +} + bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) { return mpam_resctrl_controls[rid].cdp_enabled; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 86d5e326d2bd3..f92a36187a527 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -67,6 +67,15 @@ struct rdt_resource; void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); +/* + * The CPU configuration for MPAM is cheap to write, and is only written if it + * has changed. No need for fine grained enables. + */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) { return val; From 1a287f8bd40fd49aa2e4101e69be804a9a01d838 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 27 Mar 2026 16:30:53 +0100 Subject: [PATCH 032/115] ALSA: usb-audio: Replace hard-coded number with MAX_CHANNELS One place in mixer.c still used a hard-coded number 16 instead of MAX_CHANNELS. Replace with it, so that we can extend the max number of channels gracefully. 
Link: https://lore.kernel.org/F1B104A5-CD6A-4A26-AB46-14BF233C0579@getmailspring.com Tested-by: Phil Willoughby Link: https://patch.msgid.link/20260327153056.691575-1-tiwai@suse.de Signed-off-by: Takashi Iwai (cherry picked from commit ec9a788620be1c11535fe99e9b2779f9eef2b099) Signed-off-by: Fenghua Yu --- sound/usb/mixer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index ac8c71ba94834..09c42300fb48c 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -1707,7 +1707,7 @@ static void __build_feature_ctl(struct usb_mixer_interface *mixer, cval->master_readonly = readonly_mask; } else { int i, c = 0; - for (i = 0; i < 16; i++) + for (i = 0; i < MAX_CHANNELS; i++) if (ctl_mask & BIT(i)) c++; cval->channels = c; From 0bbfc0145fea64305e2d30333b4097899423788d Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:10 +0000 Subject: [PATCH 033/115] arm64: mpam: Select ARCH_HAS_CPU_RESCTRL Enough MPAM support is present to enable ARCH_HAS_CPU_RESCTRL. Let it rip^Wlink! ARCH_HAS_CPU_RESCTRL indicates resctrl can be enabled. It is enabled by the arch code simply because it has 'arch' in its name. This removes ARM_CPU_RESCTRL as a mimic of X86_CPU_RESCTRL. While here, move the ACPI dependency to the driver's Kconfig file. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Acked-by: Catalin Marinas Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 4aab135bda1661a795e4fe96418bf840833e1119) Signed-off-by: Fenghua Yu --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/resctrl.h | 2 ++ drivers/resctrl/Kconfig | 7 +++++++ drivers/resctrl/Makefile | 2 +- 4 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/include/asm/resctrl.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 00d79552a3c11..241659f285a86 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2018,7 +2018,7 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" select ARM64_MPAM_DRIVER - select ACPI_MPAM if ACPI + select ARCH_HAS_CPU_RESCTRL help Memory System Resource Partitioning and Monitoring (MPAM) is an optional extension to the Arm architecture that allows each diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h new file mode 100644 index 0000000000000..b506e95cf6e37 --- /dev/null +++ b/arch/arm64/include/asm/resctrl.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index c34e059c6e41f..672abea3b03cc 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,7 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" depends on ARM64 && ARM64_MPAM + select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) driver for System IP, e.g. caches and memory controllers. @@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST If unsure, say N. 
endif + +config ARM64_MPAM_RESCTRL_FS + bool + default y if ARM64_MPAM_DRIVER && RESCTRL_FS + select RESCTRL_RMID_DEPENDS_ON_CLOSID + select RESCTRL_ASSIGN_FIXED diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 40beaf999582c..4f6d0e81f9b8f 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o -mpam-$(CONFIG_ARM_CPU_RESCTRL) += mpam_resctrl.o +mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG From 9cb919d7b81e90ab39a157b5d437ad5b56fd7bf3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:11 +0000 Subject: [PATCH 034/115] arm_mpam: resctrl: Call resctrl_init() on platforms that can support resctrl Now that MPAM links against resctrl, call resctrl_init() to register the filesystem and setup resctrl's structures. Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit fb481ec08699e9daf08ab839a79ab37b1bcca94d) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 32 ++++++++++++++--- drivers/resctrl/mpam_internal.h | 4 +++ drivers/resctrl/mpam_resctrl.c | 63 ++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 5 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 506deba05b40c..2c65e4c46ed56 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -73,6 +73,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +/* + * Whether resctrl has been setup. 
Used by cpuhp in preference to + * mpam_is_enabled(). The disable call after an error interrupt makes + * mpam_is_enabled() false before the cpuhp callbacks are made. + * Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks). + */ +static bool mpam_resctrl_enabled; + /* * An MSC is a physical container for controls and monitors, each identified by * their RIS index. These share a base-address, interrupts and some MMIO @@ -1621,7 +1629,7 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) return mpam_resctrl_online_cpu(cpu); return 0; @@ -1667,7 +1675,7 @@ static int mpam_cpu_offline(unsigned int cpu) { struct mpam_msc *msc; - if (mpam_is_enabled()) + if (mpam_resctrl_enabled) mpam_resctrl_offline_cpu(cpu); guard(srcu)(&mpam_srcu); @@ -2528,6 +2536,7 @@ static void mpam_enable_once(void) } static_branch_enable(&mpam_enabled); + mpam_resctrl_enabled = true; mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -2587,24 +2596,39 @@ static void mpam_reset_class(struct mpam_class *class) void mpam_disable(struct work_struct *ignored) { int idx; + bool do_resctrl_exit; struct mpam_class *class; struct mpam_msc *msc, *tmp; + if (mpam_is_enabled()) + static_branch_disable(&mpam_enabled); + mutex_lock(&mpam_cpuhp_state_lock); if (mpam_cpuhp_state) { cpuhp_remove_state(mpam_cpuhp_state); mpam_cpuhp_state = 0; } + + /* + * Removing the cpuhp state called mpam_cpu_offline() and told resctrl + * all the CPUs are offline. 
+ */ + do_resctrl_exit = mpam_resctrl_enabled; + mpam_resctrl_enabled = false; mutex_unlock(&mpam_cpuhp_state_lock); - static_branch_disable(&mpam_enabled); + if (do_resctrl_exit) + mpam_resctrl_exit(); mpam_unregister_irqs(); idx = srcu_read_lock(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, - srcu_read_lock_held(&mpam_srcu)) + srcu_read_lock_held(&mpam_srcu)) { mpam_reset_class(class); + if (do_resctrl_exit) + mpam_resctrl_teardown_class(class); + } srcu_read_unlock(&mpam_srcu, idx); mutex_lock(&mpam_list_lock); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 85b2b99263601..68906c6ebfb01 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -431,12 +431,16 @@ int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); +void mpam_resctrl_exit(void); int mpam_resctrl_online_cpu(unsigned int cpu); void mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_teardown_class(struct mpam_class *class); #else static inline int mpam_resctrl_setup(void) { return 0; } +static inline void mpam_resctrl_exit(void) { } static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } #endif /* CONFIG_RESCTRL_FS */ /* diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 777ecdc2d0f85..a9938006d0e6e 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -69,6 +69,12 @@ static bool cdp_enabled; static bool cacheinfo_ready; static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); +/* + * If resctrl_init() succeeded, resctrl_exit() can be used to remove support + * for the filesystem in the event of an error. 
+ */ +static bool resctrl_enabled; + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -360,6 +366,9 @@ static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return -EINVAL; + if (!mon->class) return -EINVAL; @@ -402,6 +411,9 @@ static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, { struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + if (!mpam_is_enabled()) + return; + if (!mon->class) return; @@ -488,6 +500,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, resctrl_arch_rmid_read_context_check(); + if (!mpam_is_enabled()) + return -EINVAL; + if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; @@ -1162,6 +1177,9 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + /* * No need to check the CPU as mpam_apply_config() doesn't care, and * resctrl_arch_update_domains() relies on this. 
@@ -1227,6 +1245,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) lockdep_assert_cpus_held(); lockdep_assert_irqs_enabled(); + if (!mpam_is_enabled()) + return -EINVAL; + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { struct resctrl_staged_config *cfg = &d->staged_config[t]; @@ -1619,7 +1640,11 @@ int mpam_resctrl_setup(void) return -EOPNOTSUPP; } - /* TODO: call resctrl_init() */ + err = resctrl_init(); + if (err) + return err; + + WRITE_ONCE(resctrl_enabled, true); return 0; @@ -1629,6 +1654,42 @@ int mpam_resctrl_setup(void) return err; } +void mpam_resctrl_exit(void) +{ + if (!READ_ONCE(resctrl_enabled)) + return; + + WRITE_ONCE(resctrl_enabled, false); + resctrl_exit(); +} + +/* + * The driver is detaching an MSC from this class, if resctrl was using it, + * pull on resctrl_exit(). + */ +void mpam_resctrl_teardown_class(struct mpam_class *class) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + might_sleep(); + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == class) { + res->class = NULL; + break; + } + } + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == class) { + mon->class = NULL; + break; + } + } +} + static int __init __cacheinfo_ready(void) { cacheinfo_ready = true; From ece939c0fd52b360dc1c8b8036d962d8852d2c8a Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:12 +0000 Subject: [PATCH 035/115] arm_mpam: Add quirk framework The MPAM specification includes the MPAMF_IIDR, which serves to uniquely identify the MSC implementation through a combination of implementer details, product ID, variant, and revision. Certain hardware issues/errata can be resolved using software workarounds. Introduce a quirk framework to allow workarounds to be enabled based on the MPAMF_IIDR value. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Zeng Heng Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Co-developed-by: James Morse Signed-off-by: James Morse (cherry picked from commit fa7745218c9828ac4849ef62bccad684aec0f422) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 32 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 25 +++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 2c65e4c46ed56..324c105b28614 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -630,6 +630,30 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static const struct mpam_quirk mpam_quirks[] = { + { NULL } /* Sentinel */ +}; + +static void mpam_enable_quirks(struct mpam_msc *msc) +{ + const struct mpam_quirk *quirk; + + for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) { + int err = 0; + + if (quirk->iidr != (msc->iidr & quirk->iidr_mask)) + continue; + + if (quirk->init) + err = quirk->init(msc, quirk); + + if (err) + continue; + + mpam_set_quirk(quirk->workaround, msc); + } +} + /* * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour * of NRDY, software can use this bit for any purpose" - so hardware might not @@ -864,8 +888,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) /* Grab an IDR value to find out how many RIS there are */ mutex_lock(&msc->part_sel_lock); idr = mpam_msc_read_idr(msc); + msc->iidr = mpam_read_partsel_reg(msc, IIDR); mutex_unlock(&msc->part_sel_lock); + mpam_enable_quirks(msc); + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); /* Use these values so partid/pmg always starts with a valid value */ @@ -1974,6 +2001,7 @@ 
static bool mpam_has_cmax_wd_feature(struct mpam_props *props) * resulting safe value must be compatible with both. When merging values in * the tree, all the aliasing resources must be handled first. * On mismatch, parent is modified. + * Quirks on an MSC will apply to all MSC in that class. */ static void __props_mismatch(struct mpam_props *parent, struct mpam_props *child, bool alias) @@ -2093,6 +2121,7 @@ static void __props_mismatch(struct mpam_props *parent, * nobble the class feature, as we can't configure all the resources. * e.g. The L3 cache is composed of two resources with 13 and 17 portion * bitmaps respectively. + * Quirks on an MSC will apply to all MSC in that class. */ static void __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) @@ -2106,6 +2135,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", (long)cprops->features, (long)vprops->features); + /* Merge quirks */ + class->quirks |= vmsc->msc->quirks; + /* Take the safe value for any common features */ __props_mismatch(cprops, vprops, false); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 68906c6ebfb01..01858365cd9e7 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -85,6 +85,8 @@ struct mpam_msc { u8 pmg_max; unsigned long ris_idxs; u32 ris_max; + u32 iidr; + u16 quirks; /* * error_irq_lock is taken when registering/unregistering the error @@ -216,6 +218,28 @@ struct mpam_props { #define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features) #define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features) +/* Workaround bits for msc->quirks */ +enum mpam_device_quirks { + MPAM_QUIRK_LAST +}; + +#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks)) +#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk))) + +struct mpam_quirk { + int (*init)(struct mpam_msc *msc, const struct 
mpam_quirk *quirk); + + u32 iidr; + u32 iidr_mask; + + enum mpam_device_quirks workaround; +}; + +#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, @@ -259,6 +283,7 @@ struct mpam_class { struct mpam_props props; u32 nrdy_usec; + u16 quirks; u8 level; enum mpam_class_types type; From c7e5a30d16c87413958e02868aa004f850cb7cd3 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:13 +0000 Subject: [PATCH 036/115] arm_mpam: Add workaround for T241-MPAM-1 The MPAM bandwidth partitioning controls will not be correctly configured, and hardware will retain default configuration register values, meaning generally that bandwidth will remain unprovisioned. To address the issue, follow the below steps after updating the MBW_MIN and/or MBW_MAX registers. - Perform 64b reads from all 12 bridge MPAM shadow registers at offsets (0x360048 + slice*0x10000 + partid*8). These registers are read-only. - Continue iterating until all 12 shadow register values match in a loop. pr_warn_once if the values fail to match within the loop count 1000. - Perform 64b writes with the value 0x0 to the two spare registers at offsets 0x1b0000 and 0x1c0000. In the hardware, writes to the MPAMCFG_MBW_MAX MPAMCFG_MBW_MIN registers are transformed into broadcast writes to the 12 shadow registers. The final two writes to the spare registers cause a final rank of downstream micro-architectural MPAM registers to be updated from the shadow copies. The intervening loop to read the 12 shadow registers helps avoid a race condition where writes to the spare registers occur before all shadow registers have been updated. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Punit Agrawal Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 70e81fbedc6570b2397e07a645136af0a0eec907) Signed-off-by: Fenghua Yu --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 88 +++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 9 +++ 3 files changed, 99 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 4c300caad9011..a65620f98e3aa 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -247,6 +247,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 324c105b28614..ab83987dd6bc1 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,6 +29,16 @@ #include "mpam_internal.h" +/* Values for the T241 errata workaround */ +#define T241_CHIPS_MAX 4 +#define T241_CHIP_NSLICES 12 +#define T241_SPARE_REG0_OFF 0x1b0000 +#define T241_SPARE_REG1_OFF 0x1c0000 +#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys) +#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8) +#define SMCCC_SOC_ID_T241 
0x036b0241 +static void __iomem *t241_scratch_regs[T241_CHIPS_MAX]; + /* * mpam_list_lock protects the SRCU lists when writing. Once the * mpam_enabled key is enabled these lists are read-only, @@ -630,7 +640,45 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc, + const struct mpam_quirk *quirk) +{ + s32 soc_id = arm_smccc_get_soc_id_version(); + struct resource *r; + phys_addr_t phys; + + /* + * A mapping to a device other than the MSC is needed, check + * SOC_ID is NVIDIA T241 chip (036b:0241) + */ + if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241) + return -EINVAL; + + r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0); + if (!r) + return -EINVAL; + + /* Find the internal registers base addr from the CHIP ID */ + msc->t241_id = T241_CHIP_ID(r->start); + phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL; + + t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M); + if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id])) + return -EINVAL; + + pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n"); + + return 0; +} + static const struct mpam_quirk mpam_quirks[] = { + { + /* NVIDIA t241 erratum T241-MPAM-1 */ + .init = mpam_enable_quirk_nvidia_t241_1, + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_SCRUB_SHADOW_REGS, + }, { NULL } /* Sentinel */ }; @@ -1378,6 +1426,44 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) __mpam_write_reg(msc, reg, bm); } +static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) +{ + int sidx, i, lcount = 1000; + void __iomem *regs; + u64 val0, val; + + regs = t241_scratch_regs[ris->vmsc->msc->t241_id]; + + for (i = 0; i < lcount; i++) { + /* Read the shadow register at index 0 */ + val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid)); + + /* Check if all the shadow registers have the same value 
*/ + for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) { + val = readq_relaxed(regs + + T241_SHADOW_REG_OFF(sidx, partid)); + if (val != val0) + break; + } + if (sidx == T241_CHIP_NSLICES) + break; + } + + if (i == lcount) + pr_warn_once("t241: inconsistent values in shadow regs"); + + /* Write a value zero to spare registers to take effect of MBW conf */ + writeq_relaxed(0, regs + T241_SPARE_REG0_OFF); + writeq_relaxed(0, regs + T241_SPARE_REG1_OFF); +} + +static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc)) + mpam_apply_t241_erratum(ris, partid); +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1457,6 +1543,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, PRI, pri_val); } + mpam_quirk_post_config_change(ris, partid, cfg); + mutex_unlock(&msc->part_sel_lock); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 01858365cd9e7..d9eb342ba2220 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -130,6 +130,9 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + /* Values only used on some platforms for quirks */ + u32 t241_id; + struct mpam_garbage garbage; }; @@ -220,6 +223,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { + T241_SCRUB_SHADOW_REGS, MPAM_QUIRK_LAST }; @@ -240,6 +244,11 @@ struct mpam_quirk { FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) +#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) + /* The values for 
MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, From 8e795b74a210614f080f9e47a4445b3ca72de94f Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:14 +0000 Subject: [PATCH 037/115] arm_mpam: Add workaround for T241-MPAM-4 In the T241 implementation of memory-bandwidth partitioning, in the absence of contention for bandwidth, the minimum bandwidth setting can affect the amount of achieved bandwidth. Specifically, the achieved bandwidth in the absence of contention can settle to any value between the values of MPAMCFG_MBW_MIN and MPAMCFG_MBW_MAX. Also, if MPAMCFG_MBW_MIN is set zero (below 0.78125%), once a core enters a throttled state, it will never leave that state. The first issue is not a concern if the MPAM software allows to program MPAMCFG_MBW_MIN through the sysfs interface. This patch ensures program MBW_MIN=1 (0.78125%) whenever MPAMCFG_MBW_MIN=0 is programmed. In the scenario where the resctrl doesn't support the MBW_MIN interface via sysfs, to achieve bandwidth closer to MBW_MAX in the absence of contention, software should configure a relatively narrow gap between MBW_MIN and MBW_MAX. The recommendation is to use a 5% gap to mitigate the problem. Clear the feature MBW_MIN feature from the class to ensure we don't accidentally change behaviour when resctrl adds support for a MBW_MIN interface. 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit a7efe23ed6dd08259ad1b238e9c33bb511666fd4) Signed-off-by: Fenghua Yu --- Documentation/arch/arm64/silicon-errata.rst | 2 + drivers/resctrl/mpam_devices.c | 55 +++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index a65620f98e3aa..a4b246655e37e 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -249,6 +249,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index ab83987dd6bc1..7a8623b27f063 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -679,6 +679,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_SCRUB_SHADOW_REGS, }, + { + /* NVIDIA t241 erratum T241-MPAM-4 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_FORCE_MBW_MIN_TO_ONE, + }, { NULL } /* Sentinel */ }; @@ -1464,6 +1470,37 @@ static void 
mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, mpam_apply_t241_erratum(ris, partid); } +static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props) +{ + u16 max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + return min_hw_granule + 1; +} + +static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props, + struct mpam_config *cfg) +{ + u16 val = 0; + u16 max; + u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) { + max = cfg->mbw_max; + } else { + /* Resetting. Hence, use the ris specific default. */ + max = GENMASK(15, 16 - props->bwa_wd); + } + + if (max > delta) + val = max - delta; + + return val; +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1504,9 +1541,18 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); } - if (mpam_has_feature(mpam_feat_mbw_min, rprops) && - mpam_has_feature(mpam_feat_mbw_min, cfg)) - mpam_write_partsel_reg(msc, MBW_MIN, 0); + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { + u16 val = 0; + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) { + u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops); + + val = mpam_wa_t241_calc_min_from_max(rprops, cfg); + val = max(val, min); + } + + mpam_write_partsel_reg(msc, MBW_MIN, val); + } if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { if (mpam_has_feature(mpam_feat_mbw_max, cfg)) @@ -2290,6 +2336,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) __class_props_mismatch(class, vmsc); + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_clear_feature(mpam_feat_mbw_min, &class->props); } /* diff --git 
a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d9eb342ba2220..f1adbdad39696 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -224,6 +224,7 @@ struct mpam_props { /* Workaround bits for msc->quirks */ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, + T241_FORCE_MBW_MIN_TO_ONE, MPAM_QUIRK_LAST }; From 10792e86356950a4941c963859943d4955870c55 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Fri, 13 Mar 2026 14:46:15 +0000 Subject: [PATCH 038/115] arm_mpam: Add workaround for T241-MPAM-6 The registers MSMON_MBWU_L and MSMON_MBWU return the number of requests rather than the number of bytes transferred. Bandwidth resource monitoring is performed at the last level cache, where each request arrive in 64Byte granularity. The current implementation returns the number of transactions received at the last level cache but does not provide the value in bytes. Scaling by 64 gives an accurate byte count to match the MPAM specification for the MSMON_MBWU and MSMON_MBWU_L registers. This patch fixes the issue by reporting the actual number of bytes instead of the number of transactions from __ris_msmon_read(). 
Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Punit Agrawal Tested-by: Peter Newman Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Gavin Shan Signed-off-by: Shanker Donthineni Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit dc48eb1ff27cc3169c3c5cca5eb20645d04d9e22) Signed-off-by: Fenghua Yu --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ drivers/resctrl/mpam_devices.c | 26 +++++++++++++++++++-- drivers/resctrl/mpam_internal.h | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index a4b246655e37e..1aa3326bb3200 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -251,6 +251,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 7a8623b27f063..8b598f768c2c8 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -685,6 +685,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_FORCE_MBW_MIN_TO_ONE, }, + { + /* NVIDIA t241 erratum T241-MPAM-6 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_MBW_COUNTER_SCALE_64, + }, { NULL } /* Sentinel */ }; @@ -1146,7 +1152,7 @@ static void 
write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, } } -static u64 mpam_msmon_overflow_val(enum mpam_device_features type) +static u64 __mpam_msmon_overflow_val(enum mpam_device_features type) { /* TODO: implement scaling counters */ switch (type) { @@ -1161,6 +1167,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type) } } +static u64 mpam_msmon_overflow_val(enum mpam_device_features type, + struct mpam_msc *msc) +{ + u64 overflow_val = __mpam_msmon_overflow_val(type); + + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + type != mpam_feat_msmon_mbwu_63counter) + overflow_val *= 64; + + return overflow_val; +} + static void __ris_msmon_read(void *arg) { u64 now; @@ -1251,13 +1269,17 @@ static void __ris_msmon_read(void *arg) now = FIELD_GET(MSMON___VALUE, now); } + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + m->type != mpam_feat_msmon_mbwu_63counter) + now *= 64; + if (nrdy) break; mbwu_state = &ris->mbwu_state[ctx->mon]; if (overflow) - mbwu_state->correction += mpam_msmon_overflow_val(m->type); + mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc); /* * Include bandwidth consumed before the last hardware reset and diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index f1adbdad39696..8fea28c5fb852 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -225,6 +225,7 @@ struct mpam_props { enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, + T241_MBW_COUNTER_SCALE_64, MPAM_QUIRK_LAST }; From 3b4acd320d9b5ec723e40ffee8fcd5ad53916c34 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 13 Mar 2026 14:46:16 +0000 Subject: [PATCH 039/115] arm_mpam: Quirk CMN-650's CSU NRDY behaviour CMN-650 is afflicted with an erratum where the CSU NRDY bit never clears. This tells us the monitor never finishes scanning the cache. The erratum document says to wait the maximum time, then ignore the field. 
Add a flag to indicate whether this is the final attempt to read the counter, and when this quirk is applied, ignore the NRDY field. This means accesses to this counter will always retry, even if the counter was previously programmed to the same values. The counter value is not expected to be stable, it drifts up and down with each allocation and eviction. The CSU register provides the value for a point in time. Tested-by: Punit Agrawal Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Reviewed-by: Zeng Heng Reviewed-by: Gavin Shan Co-developed-by: Ben Horgan Signed-off-by: Ben Horgan Signed-off-by: James Morse (cherry picked from commit aeb8595a5f8ba4aac8b5c265a8bcc3f18b473cb5) Signed-off-by: Fenghua Yu --- Documentation/arch/arm64/silicon-errata.rst | 3 +++ drivers/resctrl/mpam_devices.c | 12 ++++++++++++ drivers/resctrl/mpam_internal.h | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 1aa3326bb3200..65ed6ea33751f 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -214,6 +214,9 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | ARM | SI L1 | #4311569 | ARM64_ERRATUM_4311569 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | CMN-650 | #3642720 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 | +----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 | diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 8b598f768c2c8..41b14344b16f2 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -691,6 +691,12 @@ static const struct mpam_quirk mpam_quirks[] = { .iidr_mask = MPAM_IIDR_MATCH_ONE, .workaround = T241_MBW_COUNTER_SCALE_64, }, + { + /* ARM CMN-650 CSU erratum 3642720 */ + .iidr = MPAM_IIDR_ARM_CMN_650, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = IGNORE_CSU_NRDY, + }, { NULL } /* Sentinel */ }; @@ -1003,6 +1009,7 @@ struct mon_read { enum mpam_device_features type; u64 *val; int err; + bool waited_timeout; }; static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) @@ -1249,6 +1256,10 @@ static void __ris_msmon_read(void *arg) if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); + + if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) + nrdy = false; + break; case mpam_feat_msmon_mbwu_31counter: case mpam_feat_msmon_mbwu_44counter: @@ -1386,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, .ctx = ctx, .type = type, .val = val, + .waited_timeout = true, }; *val = 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 8fea28c5fb852..1914aefdcba9e 100644 --- 
a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -226,6 +226,7 @@ enum mpam_device_quirks { T241_SCRUB_SHADOW_REGS, T241_FORCE_MBW_MIN_TO_ONE, T241_MBW_COUNTER_SCALE_64, + IGNORE_CSU_NRDY, MPAM_QUIRK_LAST }; @@ -251,6 +252,11 @@ struct mpam_quirk { FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) +#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, From 4664c58538aee21d0171bb211d805fd1b2c3e41e Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Fri, 13 Mar 2026 14:46:17 +0000 Subject: [PATCH 040/115] arm64: mpam: Add initial MPAM documentation MPAM (Memory Partitioning and Monitoring) is now exposed to user-space via resctrl. Add some documentation so the user knows what features to expect. 
Reviewed-by: Zeng Heng Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Acked-by: Catalin Marinas Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Tested-by: Gavin Shan Tested-by: Shaopeng Tan Tested-by: Jesse Chick Signed-off-by: James Morse (cherry picked from commit 4ce0a2ccc0358f3f746fa50815a599f861fd5d68) Signed-off-by: Fenghua Yu --- Documentation/arch/arm64/index.rst | 1 + Documentation/arch/arm64/mpam.rst | 72 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 Documentation/arch/arm64/mpam.rst diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index af52edc8c0ac6..98052b4ef4a1e 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -23,6 +23,7 @@ ARM64 Architecture memory memory-tagging-extension mops + mpam perf pointer-authentication ptdump diff --git a/Documentation/arch/arm64/mpam.rst b/Documentation/arch/arm64/mpam.rst new file mode 100644 index 0000000000000..570f51a8d4ebf --- /dev/null +++ b/Documentation/arch/arm64/mpam.rst @@ -0,0 +1,72 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +MPAM +==== + +What is MPAM +============ +MPAM (Memory Partitioning and Monitoring) is a feature in the CPUs and memory +system components such as the caches or memory controllers that allow memory +traffic to be labelled, partitioned and monitored. + +Traffic is labelled by the CPU, based on the control or monitor group the +current task is assigned to using resctrl. Partitioning policy can be set +using the schemata file in resctrl, and monitor values read via resctrl. +See Documentation/filesystems/resctrl.rst for more details. + +This allows tasks that share memory system resources, such as caches, to be +isolated from each other according to the partitioning policy (so called noisy +neighbours). 
+ +Supported Platforms +=================== +Use of this feature requires CPU support, support in the memory system +components, and a description from firmware of where the MPAM device controls +are in the MMIO address space. (e.g. the 'MPAM' ACPI table). + +The MMIO device that provides MPAM controls/monitors for a memory system +component is called a memory system component. (MSC). + +Because the user interface to MPAM is via resctrl, only MPAM features that are +compatible with resctrl can be exposed to user-space. + +MSC are considered as a group based on the topology. MSC that correspond with +the L3 cache are considered together, it is not possible to mix MSC between L2 +and L3 to 'cover' a resctrl schema. + +The supported features are: + +* Cache portion bitmap controls (CPOR) on the L2 or L3 caches. To expose + CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this + level that also supports the feature. Mismatched big/little platforms are + not supported as resctrl's controls would then also depend on task + placement. + +* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache. + resctrl uses the L3 cache-id to identify where the memory bandwidth + control is applied. For this reason the platform must have an L3 cache + with cache-id's supplied by firmware. (It doesn't need to support MPAM.) + + To be exported as the 'MB' schema, the topology of the group of MSC chosen + must match the topology of the L3 cache so that the cache-id's can be + repainted. For example: Platforms with Memory bandwidth maximum controls + on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these + nodes do not have a corresponding L3 cache. If the memory bandwidth + control is on the memory rather than the L3 then there must be a single + global L3 as otherwise it is unknown which L3 the traffic came from. There + must be no caches between the L3 and the memory so that the two ends of + the path have equivalent traffic. 
+ + When the MPAM driver finds multiple groups of MSC it can use for the 'MB' + schema, it prefers the group closest to the L3 cache. + +* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided + there is at least one CSU monitor on each MSC that makes up the L3 group. + Exposing CSU counters from other caches or devices is not supported. + +Reporting Bugs +============== +If you are not seeing the counters or controls you expect please share the +debug messages produced when enabling dynamic debug and booting with: +dyndbg="file mpam_resctrl.c +pl" From bcb41697d432eb2a7b5dba8b6ddff25ac1ea32d6 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 24 Mar 2026 20:11:58 -0400 Subject: [PATCH 041/115] fs/resctrl: Report invalid domain ID when parsing io_alloc_cbm The last_cmd_status file is intended to report details about the most recent resctrl filesystem operation, specifically to aid in diagnosing failures. However, when parsing io_alloc_cbm, if a user provides a domain ID that does not exist in the resource, the operation fails with -EINVAL without updating last_cmd_status. This results in inconsistent behaviour where the system call returns an error, but last_cmd_status misleadingly reports "ok", leaving the user unaware that the failure was caused by an invalid domain ID. Write an error message to last_cmd_status when the target domain ID cannot be found. 
Fixes: 28fa2cce7a83 ("fs/resctrl: Introduce interface to modify io_alloc capacity bitmasks") Suggested-by: Reinette Chatre Signed-off-by: Aaron Tomlin Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Babu Moger Link: https://patch.msgid.link/20260325001159.447075-2-atomlin@atomlin.com (cherry picked from commit d06b8e7c97c3290e61006e30b32beb9e715fab82) Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index cc4237c57cbe4..2ef53161ce119 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -992,6 +992,7 @@ static int resctrl_io_alloc_parse_line(char *line, struct rdt_resource *r, } } + rdt_last_cmd_printf("Invalid domain %lu\n", dom_id); return -EINVAL; } From 3fe1d937265a7d3f1940fd48a609548c9a537c48 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 24 Mar 2026 20:11:59 -0400 Subject: [PATCH 042/115] fs/resctrl: Add "*" shorthand to set io_alloc CBM for all domains Configuring the io_alloc_cbm interface requires an explicit domain ID for each cache domain. On systems with high core counts and numerous cache clusters, this requirement becomes cumbersome for automation and management tasks that aim to apply a uniform policy. Introduce a wildcard domain ID selector "*" for the io_alloc_cbm interface. This enables users to set the same Capacity Bitmask (CBM) across all cache domains in a single operation. 
Signed-off-by: Aaron Tomlin Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Babu Moger Link: https://patch.msgid.link/20260325001159.447075-3-atomlin@atomlin.com (cherry picked from commit d2bf45d067c728b0fe6e8f99a7386b8291e391e3) Signed-off-by: Fenghua Yu --- Documentation/filesystems/resctrl.rst | 8 ++++++++ fs/resctrl/ctrlmondata.c | 21 +++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index ba609f8d4de57..b003bed339fdd 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -215,6 +215,14 @@ related to allocation: # cat /sys/fs/resctrl/info/L3/io_alloc_cbm 0=00ff;1=000f + An ID of "*" configures all domains with the provided CBM. + + Example on a system that does not require a minimum number of consecutive bits in the mask:: + + # echo "*=0" > /sys/fs/resctrl/info/L3/io_alloc_cbm + # cat /sys/fs/resctrl/info/L3/io_alloc_cbm + 0=0;1=0 + When CDP is enabled "io_alloc_cbm" associated with the CDP_DATA and CDP_CODE resources may reflect the same values. 
For example, values read from and written to /sys/fs/resctrl/info/L3DATA/io_alloc_cbm may be reflected by diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 2ef53161ce119..9a7dfc48cb2e2 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -954,25 +954,34 @@ static int resctrl_io_alloc_parse_line(char *line, struct rdt_resource *r, struct resctrl_schema *s, u32 closid) { enum resctrl_conf_type peer_type; + unsigned long dom_id = ULONG_MAX; struct rdt_parse_data data; struct rdt_ctrl_domain *d; + bool update_all = false; char *dom = NULL, *id; - unsigned long dom_id; next: if (!line || line[0] == '\0') return 0; + if (update_all) { + rdt_last_cmd_puts("Configurations after global '*'\n"); + return -EINVAL; + } + dom = strsep(&line, ";"); id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) { + + if (dom && !strcmp(id, "*")) { + update_all = true; + } else if (!dom || kstrtoul(id, 10, &dom_id)) { rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); return -EINVAL; } dom = strim(dom); list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (d->hdr.id == dom_id) { + if (update_all || d->hdr.id == dom_id) { data.buf = dom; data.mode = RDT_MODE_SHAREABLE; data.closid = closid; @@ -988,10 +997,14 @@ static int resctrl_io_alloc_parse_line(char *line, struct rdt_resource *r, &d->staged_config[s->conf_type], sizeof(d->staged_config[0])); } - goto next; + if (!update_all) + goto next; } } + if (update_all) + goto next; + rdt_last_cmd_printf("Invalid domain %lu\n", dom_id); return -EINVAL; } From 068050b12092039b9eb6af5926890760b1cd1f2e Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 7 Apr 2026 09:01:58 -0700 Subject: [PATCH 043/115] MAINTAINERS: Update resctrl entry The x86 maintainers handle the resctrl filesystem and x86 architectural resctrl code. Even so, the x86 maintainers are not part of the resctrl section and not returned when scripts/get_maintainer.pl is run on resctrl filesystem code. 
With patches flowing via x86 maintainers resctrl should also ensure it follows the tip rules. Add the x86 maintainer alias, x86@kernel.org, to the resctrl section to ensure x86 maintainers are included in associated resctrl submissions. Add a reference to the tip tree handbook to make it clear which rules resctrl follows. Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/4c14dd82e81737c6413e10fe097475b1cc0886fc.1775576382.git.reinette.chatre@intel.com (cherry picked from commit c611752be9d73d12fca9b456a0b8f5c8409a2346) Signed-off-by: Fenghua Yu --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 85bbf2d242458..b9028c49b421e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22198,11 +22198,13 @@ F: tools/testing/selftests/net/rds/ RDT - RESOURCE ALLOCATION M: Tony Luck M: Reinette Chatre +M: x86@kernel.org R: Dave Martin R: James Morse R: Babu Moger L: linux-kernel@vger.kernel.org S: Supported +P: Documentation/process/maintainer-tip.rst F: Documentation/filesystems/resctrl.rst F: arch/x86/include/asm/resctrl.h F: arch/x86/kernel/cpu/resctrl/ From 573123304cb4734519a1c74fc084cd6bb0e08998 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 7 Apr 2026 09:01:59 -0700 Subject: [PATCH 044/115] fs/resctrl: Add missing return value descriptions Using the stricter "./tools/docs/kernel-doc -Wall -v" to verify proper formatting of documentation comments includes warnings related to return markup on functions that are omitted during the default verification checks. 
This stricter verification reports a couple of missing return descriptions in resctrl: Warning: .../fs/resctrl/rdtgroup.c:1536 No description found for return value of 'rdtgroup_cbm_to_size' Warning: .../fs/resctrl/rdtgroup.c:3131 No description found for return value of 'mon_get_kn_priv' Warning: .../fs/resctrl/rdtgroup.c:3523 No description found for return value of 'cbm_ensure_valid' Warning: .../fs/resctrl/monitor.c:238 No description found for return value of 'resctrl_find_cleanest_closid' Add the missing return descriptions. Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/1c50b9f7c73251c007133590986f127e1af57780.1775576382.git.reinette.chatre@intel.com (cherry picked from commit 79727019ce3da234d877ec0cb6a3985f001e2b2d) Signed-off-by: Fenghua Yu --- fs/resctrl/monitor.c | 2 ++ fs/resctrl/rdtgroup.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 49f3f6b846b27..9fd901c78dc66 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -234,6 +234,8 @@ static struct rmid_entry *resctrl_find_free_rmid(u32 closid) * * When the CLOSID and RMID are independent numbers, the first free CLOSID will * be returned. + * + * Return: Free CLOSID on success, < 0 on failure. */ int resctrl_find_cleanest_closid(void) { diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5da305bd36c96..5dfdaa6f9d8ff 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1519,6 +1519,8 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, * * @cbm is unsigned long, even if only 32 bits are used to make the * bitmap functions work correctly. + * + * Return: Size (in bytes) of cache portion represented by CBM, 0 on failure. */ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, unsigned long cbm) @@ -3102,6 +3104,8 @@ static void rmdir_all_sub(void) * @mevt: The type of event file being created. 
* @do_sum: Whether SNC summing monitors are being created. Only set * when @rid == RDT_RESOURCE_L3. + * + * Return: Pointer to mon_data private data of the event, NULL on failure. */ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, struct mon_evt *mevt, @@ -3496,6 +3500,8 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, * resource group is initialized. The user can follow this with a * modification to the CBM if the default does not satisfy the * requirements. + * + * Return: A CBM that is valid for resource @r. */ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) { From f882f42d275c0c4457f671dfa435bf269c42b7c6 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Mon, 13 Apr 2026 17:00:41 +0800 Subject: [PATCH 045/115] arm_mpam: resctrl: Fix MBA CDP alloc_capable handling on unmount The code to set MBA's alloc_capable to true appears to be trying to restore alloc_capable on unmount. This can never work because resctrl_arch_set_cdp_enabled() is never invoked with RDT_RESOURCE_MBA as the rid parameter. Consequently, mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled always remains false. The alloc_capable setting in resctrl_arch_set_cdp_enabled() is to re-enable MBA if the caller opts in to separate control values using CDP for this resource. This doesn't happen today. Add a comment to describe this. However a bug remains where MBA allocation is permanently disabled after the mount with CDP option. Remounting without CDP cannot restore the MBA partition capability. Add a check to re-enable MBA when CDP is disabled, which happens on unmount. 
Fixes: 6789fb99282c ("arm_mpam: resctrl: Add CDP emulation") Signed-off-by: Zeng Heng [ morse: Added comment for existing code, added hunk to fix this bug from Ben H ] Reviewed-by: James Morse Signed-off-by: James Morse (cherry picked from commit f758340da529ccb12531c3f83d5992e912f6c8d5) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index a9938006d0e6e..4205fb2ee312b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -220,10 +220,18 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; + /* + * If resctrl has attempted to enable CDP on MBA, re-enable MBA as two + * configurations will be provided so there is no aliasing problem. + */ if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled && mpam_resctrl_controls[RDT_RESOURCE_MBA].class) mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true; + /* On unmount when CDP is disabled, re-enable MBA */ + if (!cdp_enabled && mpam_resctrl_controls[RDT_RESOURCE_MBA].class) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true; + if (enable) { if (mpam_partid_max < 1) return -EINVAL; From 594306027b3e5b0ab150d883c76c3d155ddfb5f1 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Tue, 14 Apr 2026 14:27:56 +0100 Subject: [PATCH 046/115] arm_mpam: resctrl: Fix the check for no monitor components found Dan Carpenter reports that, in mpam_resctrl_alloc_domain(), any_mon_comp is used in an 'if' condition when it may be uninitialized. Initialize it to NULL so that the check behaves correctly when no monitor components are found. 
Reported-by: Dan Carpenter Fixes: 264c285999fc ("arm_mpam: resctrl: Add monitor initialisation and domain boilerplate") Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Signed-off-by: James Morse (cherry picked from commit 67c0a487efa542cca9477ea84915db2e091f98d0) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 4205fb2ee312b..1b0b37da12afc 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1407,7 +1407,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) } if (r->mon_capable) { - struct mpam_component *any_mon_comp; + struct mpam_component *any_mon_comp = NULL; struct mpam_resctrl_mon *mon; enum resctrl_event_id eventid; From 4a62598f71fdf4d6b412063549bf760767ac3a33 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Tue, 14 Apr 2026 14:27:58 +0100 Subject: [PATCH 047/115] arm_mpam: resctrl: Make resctrl_mon_ctx_waiters static resctrl_mon_ctx_waiters is not used outside of this file, so make it static. This fixes the sparse warning: drivers/resctrl/mpam_resctrl.c:25:1: warning: symbol 'resctrl_mon_ctx_waiters' was not declared. Should it be static? 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202603281842.c2K96tJA-lkp@intel.com/ Fixes: 2a3c79c61539 ("arm_mpam: resctrl: Allow resctrl to allocate monitors") Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Signed-off-by: James Morse (cherry picked from commit 4d5bbbafc170eb21474a37d844211fce6b0f3c51) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 1b0b37da12afc..226ff6f532fab 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -22,7 +22,7 @@ #include "mpam_internal.h" -DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); +static DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); /* * The classes we've picked to map to resctrl resources, wrapped From 75d4de9e87cf9da328c121cd6afb13da716a81f2 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 30 Apr 2026 01:19:15 +0000 Subject: [PATCH 048/115] NVIDIA: SAUCE: Update annotations to set CONFIG_RESCTRL_FS Enable resctrl by CONFIG_RESCTRL_FS=y Signed-off-by: Fenghua Yu --- debian.nvidia/config/annotations | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debian.nvidia/config/annotations b/debian.nvidia/config/annotations index a97ed74f91e3d..de590fc4d4b9c 100644 --- a/debian.nvidia/config/annotations +++ b/debian.nvidia/config/annotations @@ -213,6 +213,8 @@ CONFIG_VFIO_CONTAINER note<'LP: #2095028'> CONFIG_VFIO_IOMMU_TYPE1 policy<{'amd64': 'm', 'arm64': '-'}> CONFIG_VFIO_IOMMU_TYPE1 note<'LP: #2095028'> +CONFIG_RESCTRL_FS policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_RESCTRL_FS note<'LP: #2122432'> # ---- Annotations without notes ---- From c673322fc7253fe9702fe5db1d1eaeb4b3672716 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 7 Sep 2021 17:21:42 +0100 Subject: [PATCH 049/115] NVIDIA: VR: SAUCE: untested: arm_mpam: resctrl: pick classes for use as mbm counters resctrl has two types of counters,
NUMA-local and global. MPAM has only bandwidth counters, but the position of the MSC may mean it counts NUMA-local, or global traffic. But the topology information is not available. Apply a heuristic: if the L2 or L3 supports bandwidth monitors, these are probably NUMA-local. If the memory controller supports bandwidth monitors, they are probably global. This also allows us to assert that we don't have the same class backing two different resctrl events. Because the class or component backing the event may not be 'the L3', it is necessary for mpam_resctrl_get_domain_from_cpu() to search the monitor domains too. This matters the most for 'monitor only' systems, where 'the L3' control domains may be empty, and the ctrl_comp pointer NULL. Signed-off-by: James Morse (cherry picked from commit 40e0b0792745d65ea76f7b28f2642c590fe4dd9a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - mon_comp[] is defined in upstream. Remove its definition in this patch. - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 226ff6f532fab..a5463c59910b4 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -75,6 +75,14 @@ static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); */ static bool resctrl_enabled; +/* Whether this num_mbw_mon could result in a free_running system */ +static int __mpam_monitors_free_running(u16 num_mbwu_mon) +{ + if (num_mbwu_mon >= resctrl_arch_system_num_rmid_idx()) + return resctrl_arch_system_num_rmid_idx(); + return 0; +} + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -606,6 +614,24 @@ static bool cache_has_usable_csu(struct mpam_class *class) return true; } +static bool class_has_usable_mbwu(struct mpam_class *class) +{ + struct mpam_props *cprops =
&class->props; + + if (!mpam_has_feature(mpam_feat_msmon_mbwu, cprops)) + return false; + + /* + * resctrl expects the bandwidth counters to be free running, + * which means we need as many monitors as resctrl has + * control/monitor groups. + */ + if (!__mpam_monitors_free_running(cprops->num_mbwu_mon)) + return false; + + return true; +} + /* * Calculate the worst-case percentage change from each implemented step * in the control. @@ -949,6 +975,7 @@ static void counter_update_class(enum resctrl_event_id evt_id, static void mpam_resctrl_pick_counters(void) { struct mpam_class *class; + bool has_mbwu; lockdep_assert_cpus_held(); @@ -983,7 +1010,37 @@ static void mpam_resctrl_pick_counters(void) break; } } + + has_mbwu = class_has_usable_mbwu(class); + if (has_mbwu && topology_matches_l3(class)) { + pr_debug("class %u has usable MBWU, and matches L3 topology", class->level); + + /* + * MBWU counters may be 'local' or 'total' depending on + * where they are in the topology. Counters on caches + * are assumed to be local. If it's on the memory + * controller, its assumed to be global. + * TODO: check mbm_local matches NUMA boundaries... + */ + switch (class->type) { + case MPAM_CLASS_CACHE: + counter_update_class(QOS_L3_MBM_LOCAL_EVENT_ID, + class); + break; + case MPAM_CLASS_MEMORY: + counter_update_class(QOS_L3_MBM_TOTAL_EVENT_ID, + class); + break; + default: + break; + } + } } + + /* Allocation of MBWU monitors assumes that the class is unique... 
*/ + if (mpam_resctrl_counters[QOS_L3_MBM_LOCAL_EVENT_ID].class) + WARN_ON_ONCE(mpam_resctrl_counters[QOS_L3_MBM_LOCAL_EVENT_ID].class == + mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) From d653685bdf31eeaa35cf2a472e55b3e5c51e7c37 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Jul 2025 15:39:36 +0100 Subject: [PATCH 050/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Pre-allocate free running monitors When there are enough monitors, the resctrl mbm local and total files can be exposed. These need all the monitors that resctrl may use to be allocated up front. Add helpers to do this. If a different candidate class is discovered, the old array should be free'd and the allocated monitors returned to the driver. Signed-off-by: James Morse (cherry picked from commit 355bc5f578a4f17887f2574191c01fae5202abd7 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_internal.h | 8 ++- drivers/resctrl/mpam_resctrl.c | 92 +++++++++++++++++++++++++++++++-- 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1914aefdcba9e..963f7bf74ce6c 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -411,7 +411,13 @@ struct mpam_resctrl_res { struct mpam_resctrl_mon { struct mpam_class *class; - /* per-class data that resctrl needs will live here */ + /* + * Array of allocated MBWU monitors, indexed by (closid, rmid). + * When ABMC is not in use, this array directly maps (closid, rmid) + * to the allocated monitor. Otherwise this array is sparse, and + * un-assigned (closid, rmid) are -1. 
+ */ + int *mbwu_idx_to_mon; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index a5463c59910b4..82801feb5211b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -626,10 +626,12 @@ static bool class_has_usable_mbwu(struct mpam_class *class) * which means we need as many monitors as resctrl has * control/monitor groups. */ - if (!__mpam_monitors_free_running(cprops->num_mbwu_mon)) - return false; + if (__mpam_monitors_free_running(cprops->num_mbwu_mon)) { + pr_debug("monitors usable in free-running mode\n"); + return true; + } - return true; + return false; } /* @@ -951,10 +953,58 @@ static void mpam_resctrl_pick_mba(void) } } +static void __free_mbwu_mon(struct mpam_class *class, int *array, + u16 num_mbwu_mon) +{ + for (int i = 0; i < num_mbwu_mon; i++) { + if (array[i] < 0) + continue; + + mpam_free_mbwu_mon(class, array[i]); + array[i] = ~0; + } +} + +static int __alloc_mbwu_mon(struct mpam_class *class, int *array, + u16 num_mbwu_mon) +{ + for (int i = 0; i < num_mbwu_mon; i++) { + int mbwu_mon = mpam_alloc_mbwu_mon(class); + + if (mbwu_mon < 0) { + __free_mbwu_mon(class, array, num_mbwu_mon); + return mbwu_mon; + } + array[i] = mbwu_mon; + } + + return 0; +} + +static int *__alloc_mbwu_array(struct mpam_class *class, u16 num_mbwu_mon) +{ + int err; + size_t array_size = num_mbwu_mon * sizeof(int); + int *array __free(kfree) = kmalloc(array_size, GFP_KERNEL); + + if (!array) + return ERR_PTR(-ENOMEM); + + memset(array, -1, array_size); + + err = __alloc_mbwu_mon(class, array, num_mbwu_mon); + if (err) + return ERR_PTR(err); + return_ptr(array); +} + static void counter_update_class(enum resctrl_event_id evt_id, struct mpam_class *class) { - struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evt_id]; + struct mpam_class *existing_class = mon->class; + u16 
num_mbwu_mon = class->props.num_mbwu_mon; + int *existing_array = mon->mbwu_idx_to_mon; if (existing_class) { if (class->level == 3) { @@ -969,7 +1019,39 @@ static void counter_update_class(enum resctrl_event_id evt_id, } } - mpam_resctrl_counters[evt_id].class = class; + pr_debug("Updating event %u to use class %u\n", evt_id, class->level); + mon->class = class; + + if (evt_id == QOS_L3_OCCUP_EVENT_ID) + return; + + /* Might not need all the monitors */ + num_mbwu_mon = __mpam_monitors_free_running(num_mbwu_mon); + if (!num_mbwu_mon) { + pr_debug("Not pre-allocating free-running counters\n"); + return; + } + + /* + * This is the pre-allocated free-running monitors path. It always + * allocates one monitor per PARTID * PMG. + */ + WARN_ON_ONCE(num_mbwu_mon != resctrl_arch_system_num_rmid_idx()); + + mon->mbwu_idx_to_mon = __alloc_mbwu_array(class, num_mbwu_mon); + if (IS_ERR(mon->mbwu_idx_to_mon)) { + pr_debug("Failed to allocate MBWU array\n"); + mon->class = existing_class; + mon->mbwu_idx_to_mon = existing_array; + return; + } + + if (existing_array) { + pr_debug("Releasing previous class %u's monitors\n", + existing_class->level); + __free_mbwu_mon(existing_class, existing_array, num_mbwu_mon); + kfree(existing_array); + } } static void mpam_resctrl_pick_counters(void) From e520145bf91ca6f750d13c912d0d2b1e5eb5dcba Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 15 Oct 2025 12:33:20 +0100 Subject: [PATCH 051/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Pre-allocate assignable monitors When there are not enough monitors, MPAM is able to emulate ABMC by making a smaller number of monitors assignable. These monitors still need to be allocated from the driver, and mapped to whichever control/monitor group resctrl wants to use them with. Add a second array to hold the monitor values indexed by resctrl's cntr_id. When CDP is in use, two monitors are needed so the available number of counters halves. 
Platforms with one monitor will have zero monitors when CDP is in use. Signed-off-by: James Morse (cherry picked from commit d8a0ad3da1831147810bb58fc2459a6e36e26873 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_internal.h | 7 ++ drivers/resctrl/mpam_resctrl.c | 110 +++++++++++++++++++++++++++++--- 2 files changed, 107 insertions(+), 10 deletions(-) diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 963f7bf74ce6c..bee58b8347d3c 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -418,6 +418,13 @@ struct mpam_resctrl_mon { * un-assigned (closid, rmid) are -1. */ int *mbwu_idx_to_mon; + + /* + * Array of assigned MBWU monitors, indexed by idx argument. + * When ABMC is not in use, this array can be NULL. Otherwise + * it maps idx to the allocated monitor. + */ + int *assigned_counters; }; static inline int mpam_alloc_csu_mon(struct mpam_class *class) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 82801feb5211b..c88151e3454dd 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -75,6 +75,12 @@ static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); */ static bool resctrl_enabled; +/* + * L3 local/total may come from different classes - what is the number of MBWU + * 'on L3'? + */ +static unsigned int l3_num_allocated_mbwu = ~0; + /* Whether this num_mbw_mon could result in a free_running system */ static int __mpam_monitors_free_running(u16 num_mbwu_mon) { @@ -83,6 +89,15 @@ static int __mpam_monitors_free_running(u16 num_mbwu_mon) return 0; } +/* + * If l3_num_allocated_mbwu is forced below PARTID * PMG, then the counters + * are not free running, and ABMC's user-interface must be used to assign them.
+ */ +static bool mpam_resctrl_abmc_enabled(void) +{ + return l3_num_allocated_mbwu < resctrl_arch_system_num_rmid_idx(); +} + bool resctrl_arch_alloc_capable(void) { struct mpam_resctrl_res *res; @@ -146,16 +161,6 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, return -EOPNOTSUPP; } -bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) -{ - return false; -} - -int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) -{ - return -EINVAL; -} - int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) { return -EOPNOTSUPP; @@ -193,6 +198,21 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } +static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *l3) +{ + l3->mon.num_mbm_cntrs = l3_num_allocated_mbwu; + if (cdp_enabled) + l3->mon.num_mbm_cntrs /= 2; + + if (l3->mon.num_mbm_cntrs) { + l3->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); + l3->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); + } else { + l3->mon.mbm_cntr_assignable = false; + l3->mon.mbm_assign_on_mkdir = false; + } +} + int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) { u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; @@ -252,6 +272,7 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current)); resctrl_reset_task_closids(); + mpam_resctrl_monitor_sync_abmc_vals(l3); for_each_possible_cpu(cpu) mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0); @@ -631,6 +652,11 @@ static bool class_has_usable_mbwu(struct mpam_class *class) return true; } + if (cprops->num_mbwu_mon) { + pr_debug("monitors usable via ABMC assignment\n"); + return true; + } + return false; } @@ -978,6 +1004,8 @@ static int __alloc_mbwu_mon(struct mpam_class *class, int *array, array[i] = mbwu_mon; } + l3_num_allocated_mbwu = min(l3_num_allocated_mbwu, num_mbwu_mon); + return 0; 
} @@ -1125,6 +1153,23 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) + return false; + + return mpam_resctrl_abmc_enabled(); +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + lockdep_assert_cpus_held(); + + WARN_ON_ONCE(1); + + return 0; +} + static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) { struct mpam_class *class = res->class; @@ -1202,6 +1247,41 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) return comp->comp_id; } +/* + * This must run after all event counters have been picked so that any free + * running counters have already been allocated. + */ +static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + size_t array_size = resctrl_arch_system_num_rmid_idx() * sizeof(int); + int *rmid_array __free(kfree) = kmalloc(array_size, GFP_KERNEL); + struct rdt_resource *l3 = &res->resctrl_res; + struct mpam_class *class = mon->class; + u16 num_mbwu_mon; + + if (mon->mbwu_idx_to_mon) { + pr_debug("monitors free running\n"); + return 0; + } + + if (!rmid_array) { + pr_debug("Failed to allocate RMID array\n"); + return -ENOMEM; + } + memset(rmid_array, -1, array_size); + + num_mbwu_mon = class->props.num_mbwu_mon; + mon->assigned_counters = __alloc_mbwu_array(mon->class, num_mbwu_mon); + if (IS_ERR(mon->assigned_counters)) + return PTR_ERR(mon->assigned_counters); + mon->mbwu_idx_to_mon = no_free_ptr(rmid_array); + + mpam_resctrl_monitor_sync_abmc_vals(l3); + + return 0; +} + static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, enum resctrl_event_id type) { @@ -1249,6 +1329,16 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, if (resctrl_enable_mon_event(type, false, 0, 
NULL)) l3->mon_capable = true; + switch (type) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + mpam_resctrl_monitor_init_abmc(mon); + + return 0; + default: + return 0; + } + return 0; } From e06479d310d8a0a3e24cf8e11491a7024b83ef95 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Oct 2025 14:31:11 +0100 Subject: [PATCH 052/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add kunit test for ABMC/CDP interactions ABMC exposes a fun corner case where a platform with one monitor can use ABMC for assignable counters - but not when CDP is enabled. Add some tests. Signed-off-by: James Morse (cherry picked from commit a861a0f40d75549387301244a228b519c86c063b https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/test_mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/test_mpam_resctrl.c | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c index b93d6ad87e43f..4145f057bd31a 100644 --- a/drivers/resctrl/test_mpam_resctrl.c +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -296,6 +296,67 @@ static void test_percent_to_max_rounding(struct kunit *test) KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); } +static void test_num_assignable_counters(struct kunit *test) +{ + unsigned int orig_l3_num_allocated_mbwu = l3_num_allocated_mbwu; + u32 orig_mpam_partid_max = mpam_partid_max; + u32 orig_mpam_pmg_max = mpam_pmg_max; + bool orig_cdp_enabled = cdp_enabled; + struct rdt_resource fake_l3; + + /* Force there to be some PARTID/PMG */ + mpam_partid_max = 3; + mpam_pmg_max = 1; + + cdp_enabled = false; + + /* ABMC off, CDP off */ + l3_num_allocated_mbwu = resctrl_arch_system_num_rmid_idx(); + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, resctrl_arch_system_num_rmid_idx()); + KUNIT_EXPECT_FALSE(test, 
fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC on, CDP off */ + l3_num_allocated_mbwu = 4; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 4); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_assign_on_mkdir); + + cdp_enabled = true; + + /* ABMC off, CDP on */ + l3_num_allocated_mbwu = resctrl_arch_system_num_rmid_idx(); + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + + /* (value not consumed by resctrl) */ + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, resctrl_arch_system_num_rmid_idx() / 2); + + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC on, CDP on */ + l3_num_allocated_mbwu = 4; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 2); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_TRUE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* ABMC 'on', CDP on - but not enough counters */ + l3_num_allocated_mbwu = 1; + mpam_resctrl_monitor_sync_abmc_vals(&fake_l3); + KUNIT_EXPECT_EQ(test, fake_l3.mon.num_mbm_cntrs, 0); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_cntr_assignable); + KUNIT_EXPECT_FALSE(test, fake_l3.mon.mbm_assign_on_mkdir); + + /* Restore global variables that were messed with */ + l3_num_allocated_mbwu = orig_l3_num_allocated_mbwu; + mpam_partid_max = orig_mpam_partid_max; + mpam_pmg_max = orig_mpam_pmg_max; + cdp_enabled = orig_cdp_enabled; +} + static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_get_mba_granularity), KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), @@ -304,6 +365,7 @@ static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_percent_to_max_rounding), KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, test_all_bwa_wd_gen_params), + 
KUNIT_CASE(test_num_assignable_counters), {} }; From 668c1243ab335c3ef8d5ee56661ed692aaaa8a53 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 15 Oct 2025 14:33:11 +0100 Subject: [PATCH 053/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add resctrl_arch_config_cntr() for ABMC use ABMC has a helper resctrl_arch_config_cntr() for changing the mapping between 'cntr_id' and a CLOSID/RMID pair. Add the helper. For MPAM this is done by updating the mon->mbwu_idx_to_mon[] array, and as usual CDP means it needs doing in three different ways. Signed-off-by: James Morse (cherry picked from commit ce6ad9dcc0fd43bb2a7558fdae6c11e96cf2f066 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Remove empty definition of resctrl_arch_config_cntr() - Resolve struct rdt_l3_mon_domain parameter in resctrl_arch_config_cntr() ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 43 +++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index c88151e3454dd..c995725a735f7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -148,12 +148,6 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d { } -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - enum resctrl_event_id evtid, u32 rmid, u32 closid, - u32 cntr_id, bool assign) -{ -} - int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val) @@ -1153,6 +1147,43 @@ static void mpam_resctrl_pick_counters(void) mpam_resctrl_counters[QOS_L3_MBM_TOTAL_EVENT_ID].class); } +static void __config_cntr(struct mpam_resctrl_mon *mon, u32 cntr_id, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, + bool assign) +{ + u32 mbwu_idx, mon_idx = resctrl_get_config_index(cntr_id, cdp_type); + + closid = 
resctrl_get_config_index(closid, cdp_type); + mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + WARN_ON_ONCE(mon_idx > l3_num_allocated_mbwu); + + if (assign) + mon->mbwu_idx_to_mon[mbwu_idx] = mon->assigned_counters[mon_idx]; + else + mon->mbwu_idx_to_mon[mbwu_idx] = -1; +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mon->mbwu_idx_to_mon || !mon->assigned_counters) { + pr_debug("monitor arrays not allocated\n"); + return; + } + + if (cdp_enabled) { + __config_cntr(mon, cntr_id, CDP_CODE, closid, rmid, assign); + __config_cntr(mon, cntr_id, CDP_DATA, closid, rmid, assign); + } else { + __config_cntr(mon, cntr_id, CDP_NONE, closid, rmid, assign); + } + + resctrl_arch_reset_rmid(r, d, closid, rmid, evtid); +} + bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) From 77c933a088a87c050f259d175e2e13245f958a50 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 25 Jun 2021 16:36:58 +0100 Subject: [PATCH 054/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add resctrl_arch_rmid_read() and resctrl_arch_reset_rmid() resctrl uses resctrl_arch_rmid_read() to read counters. CDP emulation means the counter may need reading in three different ways. The same goes for reset. The helpers behind the resctrl_arch_ functions will be re-used for the ABMC equivalent functions. Add the rounding helper for checking monitor values while we're here. Signed-off-by: James Morse (cherry picked from commit d45ffcb70f8a2c055b1b449b0a0780773cc5ca55 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - resctrl_arch_round_mon_val() has been defined in upstream. 
No need to re-define it here; - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 182 ++++++++++++++++++++++----------- 1 file changed, 123 insertions(+), 59 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index c995725a735f7..99aa9281a611f 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -137,11 +137,6 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domai { } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id eventid) -{ -} - void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid) @@ -464,12 +459,49 @@ void resctrl_arch_mon_ctx_free(struct rdt_resource *r, resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); } -static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, - enum mpam_device_features mon_type, - int mon_idx, - enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. + */ +static int update_rmid_limits(struct mpam_class *class) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + struct mpam_props *cprops = &class->props; + struct cacheinfo *ci; + + lockdep_assert_cpus_held(); + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return 0; + + /* + * Assume cache levels are the same size for all CPUs... + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. 
+ */ + ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); + if (!ci || ci->size == 0) { + pr_debug("Could not read cache size for class %u\n", + class->level); + return -EINVAL; + } + + if (!resctrl_rmid_realloc_limit || + ci->size < resctrl_rmid_realloc_limit) { + resctrl_rmid_realloc_limit = ci->size; + resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + } + + return 0; +} + +static int +__read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) { - struct mon_cfg cfg; + struct mon_cfg cfg = { }; if (!mpam_is_enabled()) return -EINVAL; @@ -477,18 +509,29 @@ static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_c /* Shift closid to account for CDP */ closid = resctrl_get_config_index(closid, cdp_type); + if (mon_idx == USE_PRE_ALLOCATED) { + int mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + mon_idx = mon->mbwu_idx_to_mon[mbwu_idx]; + if (mon_idx == -1) { + if (mpam_resctrl_abmc_enabled()) { + /* Report Unassigned */ + return -ENOENT; + } + /* Report Unavailable */ + return -EINVAL; + } + } + + cfg.mon = mon_idx; + cfg.match_pmg = true; + cfg.partid = closid; + cfg.pmg = rmid; + if (irqs_disabled()) { /* Check if we can access this domain without an IPI */ return -EIO; } - cfg = (struct mon_cfg) { - .mon = mon_idx, - .match_pmg = true, - .partid = closid, - .pmg = rmid, - }; - return mpam_msmon_read(mon_comp, &cfg, mon_type, val); } @@ -497,29 +540,27 @@ static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component int mon_idx, u32 closid, u32 rmid, u64 *val) { if (cdp_enabled) { - u64 code_val = 0, data_val = 0; + u64 cdp_val = 0; int err; err = __read_mon(mon, mon_comp, mon_type, mon_idx, - CDP_CODE, closid, rmid, &code_val); + CDP_CODE, closid, rmid, &cdp_val); if (err) return err; err = __read_mon(mon, mon_comp, mon_type, mon_idx, - CDP_DATA, 
closid, rmid, &data_val); - if (err) - return err; - - *val += code_val + data_val; - return 0; + CDP_DATA, closid, rmid, &cdp_val); + if (!err) + *val += cdp_val; + return err; } return __read_mon(mon, mon_comp, mon_type, mon_idx, CDP_NONE, closid, rmid, val); } -/* MBWU when not in ABMC mode (not supported), and CSU counters. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, +/* MBWU when not in ABMC mode, and CSU counters. */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, void *arch_priv, u64 *val, void *arch_mon_ctx) { @@ -531,58 +572,81 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, resctrl_arch_rmid_read_context_check(); - if (!mpam_is_enabled()) - return -EINVAL; - if (eventid >= QOS_NUM_EVENTS || !mon->class) return -EINVAL; l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); mon_comp = l3_dom->mon_comp[eventid]; - if (eventid != QOS_L3_OCCUP_EVENT_ID) + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + mon_type = mpam_feat_msmon_csu; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + mon_type = mpam_feat_msmon_mbwu; + break; + default: return -EINVAL; - - mon_type = mpam_feat_msmon_csu; + } return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, closid, rmid, val); } -/* - * The rmid realloc threshold should be for the smallest cache exposed to - * resctrl. 
- */ -static int update_rmid_limits(struct mpam_class *class) +static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid) { - u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); - struct mpam_props *cprops = &class->props; - struct cacheinfo *ci; + struct mon_cfg cfg = { }; - lockdep_assert_cpus_held(); + if (!mpam_is_enabled()) + return; - if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) - return 0; + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); - /* - * Assume cache levels are the same size for all CPUs... - * The check just requires any online CPU and it can't go offline as we - * hold the cpu lock. - */ - ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); - if (!ci || ci->size == 0) { - pr_debug("Could not read cache size for class %u\n", - class->level); - return -EINVAL; + if (mon_idx == USE_PRE_ALLOCATED) { + int mbwu_idx = resctrl_arch_rmid_idx_encode(closid, rmid); + mon_idx = mon->mbwu_idx_to_mon[mbwu_idx]; } - if (!resctrl_rmid_realloc_limit || - ci->size < resctrl_rmid_realloc_limit) { - resctrl_rmid_realloc_limit = ci->size; - resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + if (mon_idx == -1) + return; + cfg.mon = mon_idx; + mpam_msmon_reset_mbwu(mon_comp, &cfg); +} + +static void reset_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + int mon_idx, u32 closid, u32 rmid) +{ + if (cdp_enabled) { + __reset_mon(mon, mon_comp, mon_idx, CDP_CODE, closid, rmid); + __reset_mon(mon, mon_comp, mon_idx, CDP_DATA, closid, rmid); + } else { + __reset_mon(mon, mon_comp, mon_idx, CDP_NONE, closid, rmid); } +} - return 0; +/* Called via IPI. Call with read_cpus_lock() held. 
*/ +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + if (!mpam_is_enabled()) + return; + + /* Only MBWU counters are relevant, and for supported event types. */ + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); } static bool cache_has_usable_cpor(struct mpam_class *class) From c0fe87972f3ef0a07b3a9595e14f5b6acec3950e Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 26 Aug 2025 16:05:07 +0100 Subject: [PATCH 055/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add resctrl_arch_cntr_read() & resctrl_arch_reset_cntr() When used in ABMC mode, resctrl uses a different set of helpers to read and reset the counters. Add these. 
Signed-off-by: James Morse (cherry picked from commit 81af700d29ca8d39ed835ad1cee1ab8095517a9d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Remove empty definitions of resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() - Resolve struct rdt_l3_mon_domain parameter in resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 56 ++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 99aa9281a611f..45d1d6121c49c 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -137,19 +137,6 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domai { } -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - u32 closid, u32 rmid, int cntr_id, - enum resctrl_event_id eventid) -{ -} - -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, - u32 unused, u32 rmid, int cntr_id, - enum resctrl_event_id eventid, u64 *val) -{ - return -EOPNOTSUPP; -} - int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) { return -EOPNOTSUPP; @@ -594,6 +581,28 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, closid, rmid, val); } +/* MBWU counters when in ABMC mode */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int mon_idx, + enum resctrl_event_id eventid, u64 *val) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + + if (!mpam_is_enabled()) + return -EINVAL; + + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return -EINVAL; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + return read_mon_cdp_safe(mon, mon_comp, 
mpam_feat_msmon_mbwu, mon_idx, + closid, rmid, val); +} + static void __reset_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, int mon_idx, enum resctrl_conf_type cdp_type, u32 closid, u32 rmid) @@ -649,6 +658,27 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); } +/* Reset an assigned counter */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + + if (!mpam_is_enabled()) + return; + + if (eventid == QOS_L3_OCCUP_EVENT_ID || !mon->class) + return; + + l3_dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom); + mon_comp = l3_dom->mon_comp[eventid]; + + reset_mon_cdp_safe(mon, mon_comp, USE_PRE_ALLOCATED, closid, rmid); +} + static bool cache_has_usable_cpor(struct mpam_class *class) { struct mpam_props *cprops = &class->props; From bc1ce80fc011ab3029395e63d75d6c4d67e79dc7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 10 Jun 2024 17:20:48 +0100 Subject: [PATCH 056/115] NVIDIA: VR: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() and closid_num_dirty_rmid[] On MPAM systems if an error occurs the arhictecture code will call resctrl_exit(). This calls dom_data_exit() which takes the rdrgroup_mutex and kfree()s closid_num_dirty_rmid[]. It is possible that another syscall tries to access that same array in the meantime, but is blocked on the mutex. Once dom_data_exit() completes, that syscall will see a NULL pointer. Pull the IS_ENABLED() Kconfig checks into a helper and additionally check that the array has been allocated. This will cause callers to fallback to the regular CLOSID allocation strategy. 
Signed-off-by: James Morse (cherry picked from commit b9be9ec43910a549fb4f5eaced3bffcebc6a180e https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/monitor.c`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/monitor.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 9fd901c78dc66..65a761fb4c1ec 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -113,6 +113,20 @@ static inline struct rmid_entry *__rmid_entry(u32 idx) return entry; } +static bool __has_closid_num_dirty_rmid_array(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return false; + + /* + * Avoid a race with dom_data_exit() freeing the array under + * rdtgroup_mutex. + */ + return closid_num_dirty_rmid; +} + static void limbo_release_entry(struct rmid_entry *entry) { lockdep_assert_held(&rdtgroup_mutex); @@ -120,7 +134,7 @@ static void limbo_release_entry(struct rmid_entry *entry) rmid_limbo_count--; list_add_tail(&entry->list, &rmid_free_lru); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (__has_closid_num_dirty_rmid_array()) closid_num_dirty_rmid[entry->closid]--; } @@ -244,7 +258,7 @@ int resctrl_find_cleanest_closid(void) lockdep_assert_held(&rdtgroup_mutex); - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (!__has_closid_num_dirty_rmid_array()) return -EIO; for (i = 0; i < closids_supported(); i++) { @@ -317,7 +331,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) } rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + if (__has_closid_num_dirty_rmid_array()) closid_num_dirty_rmid[entry->closid]++; } From 465b8ee3710b9faadef11876f99e9bb9160fc3fc Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 10 Jun 2024 17:41:58 +0100 Subject: [PATCH 057/115] NVIDIA: VR: SAUCE: fs/resctrl: Avoid a race with dom_data_exit() 
and rmid_ptrs[] On MPAM systems if an error occurs the architecture code will call resctrl_exit(). This calls dom_data_exit() which takes the rdtgroup_mutex and kfree()s rmid_ptrs[]. It is possible that another syscall tries to access that same array in the meantime, but is blocked on the mutex. Once dom_data_exit() completes, that syscall will see a NULL pointer. Make __rmid_entry() return NULL in this case. Neither __check_limbo() nor free_rmid() return an error, and can silently stop their work if this occurs. dom_data_init() has only just allocated the array and still holds the lock, so __rmid_entry() should never return NULL here. Signed-off-by: James Morse (cherry picked from commit c1ac3a4e7a0d09175fb84eb7be2b7b23e8c09f09 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/monitor.c`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/monitor.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 65a761fb4c1ec..e338b8d484054 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -98,12 +98,17 @@ unsigned int resctrl_rmid_realloc_limit; * * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code * must accept an attempt to read every index. + * + * Returns NULL if the rmid_ptrs[] array is not allocated. 
*/ static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; u32 closid, rmid; + if (!rmid_ptrs) + return NULL; + entry = &rmid_ptrs[idx]; resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); @@ -175,6 +180,8 @@ void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) break; entry = __rmid_entry(idx); + if (!entry) + break; if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, QOS_L3_OCCUP_EVENT_ID, arch_priv, &val, arch_mon_ctx)) { @@ -353,6 +360,8 @@ void free_rmid(u32 closid, u32 rmid) return; entry = __rmid_entry(idx); + if (!entry) + return; if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); @@ -959,6 +968,7 @@ int setup_rmid_lru_list(void) idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); entry = __rmid_entry(idx); + WARN_ON_ONCE(!entry); list_del(&entry->list); return 0; From 7ce35e0ad620b5a524cfd6c36adc8dbc81dfe105 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 22 Dec 2022 17:01:52 +0000 Subject: [PATCH 058/115] NVIDIA: VR: SAUCE: debugfs: Add helpers for creating cpumask entries in debugfs debugfs has handy helpers to make a bool, integer or string available through debugfs. Add helpers to do the same for cpumasks. These are read only. 
CC: Ben Horgan Signed-off-by: James Morse (cherry picked from commit 25c2e6fafcfd8044ea148672d3e6b4b29be0d756 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- fs/debugfs/file.c | 64 +++++++++++++++++++++++++++++++++++++++++ include/linux/debugfs.h | 6 ++++ 2 files changed, 70 insertions(+) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 3376ab6a519d1..5c01b8056317a 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1140,6 +1140,70 @@ void debugfs_create_str(const char *name, umode_t mode, &fops_str_ro, &fops_str_wo); } +static ssize_t debugfs_read_file_cpumask(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = F_DENTRY(file); + struct cpumask *cpumask; + char *kernel_buf; + ssize_t ret; + int len; + + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + + /* How long is a piece of string? */ + kernel_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kernel_buf) { + debugfs_file_put(dentry); + return -ENOMEM; + } + + cpumask = (struct cpumask *)file->private_data; + len = scnprintf(kernel_buf, PAGE_SIZE, + "%*pb\n", cpumask_pr_args(cpumask)); + debugfs_file_put(dentry); + if (len + 1 >= PAGE_SIZE) { + kfree(kernel_buf); + return -EIO; + } + + ret = simple_read_from_buffer(user_buf, count, ppos, kernel_buf, len); + kfree(kernel_buf); + + return ret; +} + +static const struct file_operations fops_cpumask_ro = { + .read = debugfs_read_file_cpumask, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_cpumask - create a read-only debugfs file that is used to read a cpumask + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. 
+ * @value: a pointer to the variable that the file should read from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. + */ +void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value) +{ + /* Only read-only is supported */ + WARN_ON_ONCE(mode & S_IWUGO); + + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_cpumask_ro, + &fops_cpumask_ro, &fops_cpumask_ro); +} + static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 4177c47382826..591d4b7267d8b 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -202,6 +202,8 @@ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value); void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value); +void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value); struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, @@ -407,6 +409,10 @@ static inline void debugfs_create_str(const char *name, umode_t mode, char **value) { } +static inline void debugfs_create_cpumask(const char *name, umode_t mode, + struct dentry *parent, struct cpumask *value) +{ } + static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) From 9acc219bab129368285b7a310bf68d4f12aa1053 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 1 Sep 2021 15:13:12 +0100 Subject: [PATCH 059/115] NVIDIA: VR: SAUCE: arm_mpam: Add debugfs entries to show the MSC/RIS the driver discovered Not all of MPAM is visible through the resctrl user-space interface. 
To make it easy to debug why certain devices were not exposed through resctrl, allow the properties of the devices to be read through debugfs. This adds an mpam directory to debugfs, and exposes the devices as well as the hierarchy that was built. Signed-off-by: James Morse (cherry picked from commit e8f0f2147103bec25b367a273abfb7b6805df914 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 136 +++++++++++++++++++++++++++++--- drivers/resctrl/mpam_internal.h | 9 +++ 2 files changed, 136 insertions(+), 9 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 41b14344b16f2..76daa280d2888 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -83,6 +83,8 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +static struct dentry *mpam_debugfs; + /* * Whether resctrl has been setup. Used by cpuhp in preference to * mpam_is_enabled(). 
The disable call after an error interrupt makes @@ -333,6 +335,8 @@ static void mpam_class_destroy(struct mpam_class *class) { lockdep_assert_held(&mpam_list_lock); + debugfs_remove_recursive(class->debugfs); + class->debugfs = NULL; list_del_rcu(&class->classes_list); add_to_garbage(class); } @@ -385,6 +389,8 @@ static void mpam_component_destroy(struct mpam_component *comp) __destroy_component_cfg(comp); + debugfs_remove_recursive(comp->debugfs); + comp->debugfs = NULL; list_del_rcu(&comp->class_list); add_to_garbage(comp); @@ -435,6 +441,8 @@ static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc) lockdep_assert_held(&mpam_list_lock); + debugfs_remove_recursive(vmsc->debugfs); + vmsc->debugfs = NULL; list_del_rcu(&vmsc->comp_list); add_to_garbage(vmsc); @@ -594,6 +602,8 @@ static void mpam_ris_destroy(struct mpam_msc_ris *ris) cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity); cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity); clear_bit(ris->ris_idx, &msc->ris_idxs); + debugfs_remove_recursive(ris->debugfs); + ris->debugfs = NULL; list_del_rcu(&ris->msc_list); list_del_rcu(&ris->vmsc_list); add_to_garbage(ris); @@ -770,32 +780,32 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) /* Cache Capacity Partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CCAP_PART, ris->idr)) { - u32 ccap_features = mpam_read_partsel_reg(msc, CCAP_IDR); + ris->ccap_idr = mpam_read_partsel_reg(msc, CCAP_IDR); - props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ccap_features); + props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ris->ccap_idr); if (props->cmax_wd && - FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_softlim, props); if (props->cmax_wd && - !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ccap_features)) + !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cmax, props); if (props->cmax_wd && - 
FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cmin, props); - props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ccap_features); + props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ris->ccap_idr); if (props->cassoc_wd && - FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ccap_features)) + FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ris->ccap_idr)) mpam_set_feature(mpam_feat_cmax_cassoc, props); } /* Cache Portion partitioning */ if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { - u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); + ris->cpor_idr = mpam_read_partsel_reg(msc, CPOR_IDR); - props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, cpor_features); + props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, ris->cpor_idr); if (props->cpbm_wd) mpam_set_feature(mpam_feat_cpor_part, props); } @@ -1999,6 +2009,9 @@ static void mpam_msc_destroy(struct mpam_msc *msc) list_del_rcu(&msc->all_msc_list); platform_set_drvdata(pdev, NULL); + debugfs_remove_recursive(msc->debugfs); + msc->debugfs = NULL; + add_to_garbage(msc); } @@ -2017,6 +2030,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) { int err; u32 tmp; + char name[20]; struct mpam_msc *msc; struct resource *msc_res; struct device *dev = &pdev->dev; @@ -2084,6 +2098,10 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) list_add_rcu(&msc->all_msc_list, &mpam_all_msc); platform_set_drvdata(pdev, msc); + snprintf(name, sizeof(name), "msc.%u", msc->id); + msc->debugfs = debugfs_create_dir(name, mpam_debugfs); + debugfs_create_x32("max_nrdy_usec", 0400, msc->debugfs, &msc->nrdy_usec); + return msc; } @@ -2689,6 +2707,102 @@ static int mpam_allocate_config(void) return 0; } +static void mpam_debugfs_setup_ris(struct mpam_msc_ris *ris) +{ + char name[40]; + struct dentry *d; + struct mpam_props *rprops = &ris->props; + + snprintf(name, sizeof(name), "ris.%u", ris->ris_idx); + d = 
debugfs_create_dir(name, ris->vmsc->msc->debugfs); + debugfs_create_x64("mpamf_idr", 0400, d, &ris->idr); + debugfs_create_x32("mpamf_cpor_idr", 0400, d, &ris->cpor_idr); + debugfs_create_x32("mpamf_ccap_idr", 0400, d, &ris->ccap_idr); + debugfs_create_ulong("features", 0400, d, &rprops->features[0]); + debugfs_create_x16("cpbm_wd", 0400, d, &rprops->cpbm_wd); + debugfs_create_x16("mbw_pbm_bits", 0400, d, &rprops->mbw_pbm_bits); + debugfs_create_x16("num_csu_mon", 0400, d, &rprops->num_csu_mon); + debugfs_create_x16("num_mbwu_mon", 0400, d, &rprops->num_mbwu_mon); + debugfs_create_cpumask("affinity", 0400, d, &ris->affinity); + ris->debugfs = d; +} + +static void mpam_debugfs_setup_vmsc(struct mpam_component *comp, + struct mpam_vmsc *vmsc) +{ + u8 ris_idx; + char name[40]; + char path[40]; + struct dentry *d; + struct mpam_msc_ris *ris; + int msc_id = vmsc->msc->id; + + snprintf(name, sizeof(name), "vmsc.%u", msc_id); + d = debugfs_create_dir(name, comp->debugfs); + debugfs_create_ulong("features", 0400, d, &vmsc->props.features[0]); + vmsc->debugfs = d; + + list_for_each_entry_rcu(ris, &vmsc->ris, vmsc_list) { + ris_idx = ris->ris_idx; + + snprintf(name, sizeof(name), "msc.%u_ris.%u", msc_id, + ris_idx); + snprintf(path, sizeof(path), "../../../msc.%u/ris.%u", + msc_id, ris_idx); + debugfs_create_symlink(name, d, path); + } +} + +static void mpam_debugfs_setup_comp(struct mpam_class *class, + struct mpam_component *comp) +{ + char name[40]; + struct dentry *d; + struct mpam_vmsc *vmsc; + + snprintf(name, sizeof(name), "comp.%u", comp->comp_id); + d = debugfs_create_dir(name, class->debugfs); + comp->debugfs = d; + + list_for_each_entry_rcu(vmsc, &comp->vmsc, comp_list) + mpam_debugfs_setup_vmsc(comp, vmsc); +} + +static void mpam_debugfs_setup(void) +{ + char name[40]; + struct dentry *d; + struct mpam_msc *msc; + struct mpam_class *class; + struct mpam_msc_ris *ris; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + 
list_for_each_entry(msc, &mpam_all_msc, all_msc_list) { + d = msc->debugfs; + debugfs_create_u32("fw_id", 0400, d, &msc->pdev->id); + debugfs_create_x32("iface", 0400, d, &msc->iface); + debugfs_create_x32("mpamf_iidr", 0400, d, &msc->iidr); + list_for_each_entry(ris, &msc->ris, msc_list) + mpam_debugfs_setup_ris(ris); + } + + list_for_each_entry_rcu(class, &mpam_classes, classes_list) { + snprintf(name, sizeof(name), "class.%u", class->level); + d = debugfs_create_dir(name, mpam_debugfs); + debugfs_create_ulong("features", 0400, d, &class->props.features[0]); + debugfs_create_x32("nrdy_usec", 0400, d, &class->nrdy_usec); + debugfs_create_x16("quirks", 0400, d, &class->quirks); + debugfs_create_x8("level", 0400, d, &class->level); + debugfs_create_cpumask("affinity", 0400, d, &class->affinity); + class->debugfs = d; + + list_for_each_entry_rcu(comp, &class->components, class_list) + mpam_debugfs_setup_comp(class, comp); + } +} + static void mpam_enable_once(void) { int err; @@ -2722,6 +2836,8 @@ static void mpam_enable_once(void) pr_err("Failed to allocate configuration arrays.\n"); break; } + + mpam_debugfs_setup(); } while (0); mutex_unlock(&mpam_list_lock); cpus_read_unlock(); @@ -2943,6 +3059,8 @@ static int __init mpam_msc_driver_init(void) return -EINVAL; } + mpam_debugfs = debugfs_create_dir("mpam", NULL); + return platform_driver_register(&mpam_msc_driver); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index bee58b8347d3c..ff860859a91e7 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -130,6 +131,8 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + struct dentry *debugfs; + /* Values only used on some platforms for quirks */ u32 t241_id; @@ -310,6 +313,7 @@ struct mpam_class { struct ida ida_csu_mon; struct ida ida_mbwu_mon; + struct dentry *debugfs; struct mpam_garbage garbage; }; 
@@ -344,6 +348,7 @@ struct mpam_component { /* parent: */ struct mpam_class *class; + struct dentry *debugfs; struct mpam_garbage garbage; }; @@ -362,12 +367,15 @@ struct mpam_vmsc { /* parent: */ struct mpam_component *comp; + struct dentry *debugfs; struct mpam_garbage garbage; }; struct mpam_msc_ris { u8 ris_idx; u64 idr; + u32 cpor_idr; + u32 ccap_idr; struct mpam_props props; bool in_reset_state; @@ -385,6 +393,7 @@ struct mpam_msc_ris { /* msmon mbwu configuration is preserved over reset */ struct msmon_mbwu_state *mbwu_state; + struct dentry *debugfs; struct mpam_garbage garbage; }; From c4e649fea61c95180c10c7fcb9797bcd1ba68d91 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 29 Jul 2024 17:05:31 +0100 Subject: [PATCH 060/115] NVIDIA: VR: SAUCE: arm_mpam: Add force-disable debugfs trigger MPAM has an error interrupt that can be triggered by an MSC when corrupt or out of range values are seen. The hardware only needs to raise an error interrupt if the error was detected, it is also permissible for the hardware to just use the corrupt or out of range value. All the reasons to raise an error indicate a software bug. When the error interrupt is triggered, the MPAM driver attempts to reset all the CPUs back to PARTID-0 and reset PARTID-0 to be unrestricted. This is done to ensure important tasks aren't accidentally given the performance of unimportant tasks. This teardown path in the driver is hard to trigger. Add a debugfs file to poke this manually. It is expected you have to reboot to make MPAM work again after this. 
Signed-off-by: James Morse (cherry picked from commit 2c4e1fed02be2c50642680f9d99a1c3424e5b7b6 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 76daa280d2888..0f45b26377f46 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2803,6 +2803,33 @@ static void mpam_debugfs_setup(void) } } +static int mpam_force_disable_show(struct seq_file *s, void *data) +{ + seq_puts(s, "Write 1 to this file to trigger an MPAM error.\n"); + return 0; +} + +static ssize_t mpam_force_disable_write(struct file *file, + const char __user *userbuf, size_t count, + loff_t *ppos) +{ + u32 user_val; + int err; + + err = kstrtou32_from_user(userbuf, count, 10, &user_val); + if (err) + return err; + + if (user_val == 1) { + mpam_disable_reason = "debugfs trigger"; + mpam_disable(NULL); + } + + return count; +} + +DEFINE_SHOW_STORE_ATTRIBUTE(mpam_force_disable); + static void mpam_enable_once(void) { int err; @@ -2842,6 +2869,9 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + debugfs_create_file("force_disable", 0600, mpam_debugfs, NULL, + &mpam_force_disable_fops); + if (!err) { err = mpam_resctrl_setup(); if (err) From 906c88e57a686376f90c1d24620e332ed5c8d76c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Jul 2025 12:02:57 +0100 Subject: [PATCH 061/115] NVIDIA: VR: SAUCE: arm_mpam: Expose the number of NRDY retries in debugfs It's really popular to tie NRDY high, and then act surprised when the OS never reads the counters, because they aren't ready. The spec obliges hardware to clear this bit automatically before the firmware advertised timeout. To make it easier to find errant hardware, count the number of retries and expose that number in debugfs. 
Signed-off-by: James Morse (cherry picked from commit 4fa427c7f312e037a8080dffc62663664b976905 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 5 ++++- drivers/resctrl/mpam_internal.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0f45b26377f46..54b254453221d 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1313,8 +1313,10 @@ static void __ris_msmon_read(void *arg) } mpam_mon_sel_unlock(msc); - if (nrdy) + if (nrdy) { + msc->nrdy_retry_count++; m->err = -EBUSY; + } if (m->err) return; @@ -2784,6 +2786,7 @@ static void mpam_debugfs_setup(void) debugfs_create_u32("fw_id", 0400, d, &msc->pdev->id); debugfs_create_x32("iface", 0400, d, &msc->iface); debugfs_create_x32("mpamf_iidr", 0400, d, &msc->iidr); + debugfs_create_x64("nrdy_retry_count", 0400, d, &msc->nrdy_retry_count); list_for_each_entry(ris, &msc->ris, msc_list) mpam_debugfs_setup_ris(ris); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index ff860859a91e7..e27e96d48ce62 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -68,6 +68,7 @@ struct mpam_msc { /* Not modified after mpam_is_enabled() becomes true */ enum mpam_msc_iface iface; u32 nrdy_usec; + u64 nrdy_retry_count; cpumask_t accessibility; bool has_extd_esr; From 39be102ddcc3543d82aaac703b8ce4750751b9b2 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:43:56 +0100 Subject: [PATCH 062/115] NVIDIA: VR: SAUCE: arm_mpam: Add resctrl_arch_round_bw() Add the required hook to pre-round a userspace memory bandwidth allocation percentage value to a value acceptable to the driver backend. 
For MPAM, no rounding is needed because the driver has all the information necessary for rounding the value when resctrl_arch_update_one() is called. So, just "round" the value to itself here. Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit 935611d607afe707a00b0311fdbb500b8acdd654 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/arm_mpam.h`; ] Signed-off-by: Fenghua Yu --- include/linux/arm_mpam.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index f92a36187a527..4ccf32fe07fd5 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include #include #include @@ -76,6 +77,19 @@ static inline void resctrl_arch_disable_mon(void) { } static inline void resctrl_arch_enable_alloc(void) { } static inline void resctrl_arch_disable_alloc(void) { } +struct resctrl_schema; + +struct rdt_resource; +static inline u32 resctrl_arch_round_bw(u32 val, + const struct rdt_resource *r __always_unused) +{ + /* + * Do nothing: for MPAM, resctrl_arch_update_one() has the necessary + * context to round the incoming value correctly. + */ + return val; +} + static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) { return val; From 52bb8908d046f6de5d5a80bc8a068295f86585fc Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Aug 2025 15:43:55 +0100 Subject: [PATCH 063/115] NVIDIA: VR: SAUCE: fs/resctrl,x86/resctrl: Factor mba rounding to be per-arch The control value parser for the MB resource currently coerces the memory bandwidth percentage value from userspace to be an exact multiple of the bw_gran parameter. 
On MPAM systems, this results in somewhat worse-than-worst-case rounding, since bw_gran is in general only an approximation to the actual hardware granularity, and the hardware bandwidth allocation control value is not natively a percentage. Allow the arch to provide its own conversion that is appropriate for the hardware, and move the existing conversion to x86. This will avoid accumulated error from rounding the value twice on MPAM systems. Clarify the documentation, but avoid overly exact promises. Clamping to bw_min and bw_max still feels generic: leave it in the core code, for now. No functional change. Signed-off-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit cabdc680e1dde14521ab2a61ff32b525b3ba334e https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- Documentation/filesystems/resctrl.rst | 7 +++---- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 6 ++++++ fs/resctrl/ctrlmondata.c | 2 +- include/linux/resctrl.h | 2 ++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b003bed339fdd..e9ff59c2e57e8 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -236,12 +236,11 @@ with respect to allocation: user can request. "bandwidth_gran": - The granularity in which the memory bandwidth + The approximate granularity in which the memory bandwidth percentage is allocated. The allocated b/w percentage is rounded off to the next - control step available on the hardware. The - available bandwidth control steps are: - min_bandwidth + N * bandwidth_gran. + control step available on the hardware. The available + steps are at least as small as this value. 
"delay_linear": Indicates if the delay scale is linear or diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index b20e705606b8f..d539e56c2b1f0 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,9 +16,15 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include "internal.h" +u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r) +{ + return roundup(val, (unsigned long)r->membw.bw_gran); +} + int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 9a7dfc48cb2e2..0c02451c687b2 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -71,7 +71,7 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) return false; } - *data = roundup(bw, (unsigned long)r->membw.bw_gran); + *data = resctrl_arch_round_bw(bw, r); return true; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 006e57fd7ca58..b236c4e9cb619 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -500,6 +500,8 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); */ int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); +u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. From 17a769b16b03dd6a3e2de3242f226d21c098b521 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 29 Sep 2025 14:29:42 +0100 Subject: [PATCH 064/115] NVIDIA: VR: SAUCE: arm_mpam: Split the locking around the mon_sel registers The MSC MON_SEL register needs to be accessed from hardirq for the overflow interrupt, and when taking an IPI to access these registers on platforms where MSC are not accesible from every CPU. 
This makes an irqsave spinlock the obvious lock to protect these registers. On systems with SCMI mailboxes it must be able to sleep, meaning a mutex must be used. The SCMI platforms can't support an overflow interrupt. Clearly these two can't exist for one MSC at the same time. Split the existing helper into a raw spinlock and a mutex, named inner and outer. The outer lock must be taken in a pre-emptible context before the inner lock can be taken. On systems with SCMI mailboxes where the MON_SEL accesses must sleep - the inner lock will fail to be taken if the caller is unable to sleep. This will allow callers to fail without having to explicitly check the interface type of each MSC. Signed-off-by: James Morse (cherry picked from commit 46584f5584d0d2eb939b0ab0e43b93e6a0665096 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; - Resolve minor conflicts in `drivers/resctrl/mpam_internal.h`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 50 +++++++++++++++++------- drivers/resctrl/mpam_internal.h | 68 ++++++++++++++++++++++++--------- 2 files changed, 85 insertions(+), 33 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 54b254453221d..5568e675a526d 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -745,7 +745,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) return false; mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | @@ -759,7 +759,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) _mpam_write_monsel_reg(msc, mon_reg, 0); now = _mpam_read_monsel_reg(msc, mon_reg); can_clear = !(now & MSMON___NRDY); - mpam_mon_sel_unlock(msc); +
mpam_mon_sel_inner_unlock(msc); return (!can_set || !can_clear); } @@ -883,7 +883,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_msmon_csu_xcl, props); /* Is NRDY hardware managed? */ + mpam_mon_sel_outer_lock(msc); hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); + mpam_mon_sel_outer_unlock(msc); if (hw_managed) mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); } @@ -917,7 +919,9 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) } /* Is NRDY hardware managed? */ + mpam_mon_sel_outer_lock(msc); hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + mpam_mon_sel_outer_unlock(msc); if (hw_managed) mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); @@ -1211,7 +1215,7 @@ static void __ris_msmon_read(void *arg) struct mpam_msc *msc = m->ris->vmsc->msc; u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; - if (!mpam_mon_sel_lock(msc)) { + if (!mpam_mon_sel_inner_lock(msc)) { m->err = -EIO; return; } @@ -1311,7 +1315,7 @@ static void __ris_msmon_read(void *arg) default: m->err = -EINVAL; } - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); if (nrdy) { msc->nrdy_retry_count++; @@ -1335,6 +1339,7 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) struct mpam_msc *msc = vmsc->msc; struct mpam_msc_ris *ris; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, srcu_read_lock_held(&mpam_srcu)) { arg->ris = ris; @@ -1353,6 +1358,7 @@ static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) if (err) any_err = err; } + mpam_mon_sel_outer_unlock(msc); } return any_err; @@ -1435,18 +1441,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) continue; msc = vmsc->msc; + mpam_mon_sel_outer_lock(msc); list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, srcu_read_lock_held(&mpam_srcu)) { if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) continue; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if 
(WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) continue; ris->mbwu_state[ctx->mon].correction = 0; ris->mbwu_state[ctx->mon].reset_on_next_read = true; - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } + mpam_mon_sel_outer_unlock(msc); } } @@ -1647,8 +1655,11 @@ static int mpam_restore_mbwu_state(void *_ris) u64 val; struct mon_read mwbu_arg; struct mpam_msc_ris *ris = _ris; + struct mpam_msc *msc = ris->vmsc->msc; struct mpam_class *class = ris->vmsc->comp->class; + mpam_mon_sel_outer_lock(msc); + for (i = 0; i < ris->props.num_mbwu_mon; i++) { if (ris->mbwu_state[i].enabled) { mwbu_arg.ris = ris; @@ -1660,10 +1671,12 @@ static int mpam_restore_mbwu_state(void *_ris) } } + mpam_mon_sel_outer_unlock(msc); + return 0; } -/* Call with MSC cfg_lock held */ +/* Call with MSC lock and outer mon_sel lock held */ static int mpam_save_mbwu_state(void *arg) { int i; @@ -1678,7 +1691,7 @@ static int mpam_save_mbwu_state(void *arg) mbwu_state = &ris->mbwu_state[i]; cfg = &mbwu_state->cfg; - if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + if (WARN_ON_ONCE(!mpam_mon_sel_inner_lock(msc))) return -EIO; mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, i) | @@ -1703,7 +1716,7 @@ static int mpam_save_mbwu_state(void *arg) cfg->partid = FIELD_GET(MSMON_CFG_x_FLT_PARTID, cur_flt); mbwu_state->correction += val; mbwu_state->enabled = FIELD_GET(MSMON_CFG_x_CTL_EN, cur_ctl); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } return 0; @@ -2609,11 +2622,13 @@ static void __destroy_component_cfg(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) { msc = vmsc->msc; - if (mpam_mon_sel_lock(msc)) { + mpam_mon_sel_outer_lock(msc); + if (mpam_mon_sel_inner_lock(msc)) { list_for_each_entry(ris, &vmsc->ris, vmsc_list) add_to_garbage(ris->mbwu_state); - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } + mpam_mon_sel_outer_unlock(msc); } } @@ -2660,6 +2675,7 @@ static int __allocate_component_cfg(struct mpam_component *comp) 
mpam_reset_component_cfg(comp); list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + int err = 0; struct mpam_msc *msc; struct mpam_msc_ris *ris; struct msmon_mbwu_state *mbwu_state; @@ -2668,6 +2684,7 @@ static int __allocate_component_cfg(struct mpam_component *comp) continue; msc = vmsc->msc; + mpam_mon_sel_outer_lock(msc); list_for_each_entry(ris, &vmsc->ris, vmsc_list) { if (!ris->props.num_mbwu_mon) continue; @@ -2676,16 +2693,21 @@ static int __allocate_component_cfg(struct mpam_component *comp) ris->props.num_mbwu_mon); if (!mbwu_state) { __destroy_component_cfg(comp); - return -ENOMEM; + err = -ENOMEM; + break; } init_garbage(&mbwu_state[0].garbage); - if (mpam_mon_sel_lock(msc)) { + if (mpam_mon_sel_inner_lock(msc)) { ris->mbwu_state = mbwu_state; - mpam_mon_sel_unlock(msc); + mpam_mon_sel_inner_unlock(msc); } } + mpam_mon_sel_outer_unlock(msc); + + if (err) + return err; } return 0; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e27e96d48ce62..02966f5b3b947 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -118,16 +118,20 @@ struct mpam_msc { /* * mon_sel_lock protects access to the MSC hardware registers that are * affected by MPAMCFG_MON_SEL, and the mbwu_state. - * Access to mon_sel is needed from both process and interrupt contexts, - * but is complicated by firmware-backed platforms that can't make any - * access unless they can sleep. - * Always use the mpam_mon_sel_lock() helpers. - * Accesses to mon_sel need to be able to fail if they occur in the wrong - * context. + * Both the 'inner' and 'outer' must be taken. + * For real MMIO MSC, the outer lock is unnecessary - but keeps the + * code common with: + * Firmware backed MSC need to sleep when accessing the MSC, which + * means some code-paths will always fail. For these MSC the outer + * lock is providing the protection, and the inner lock fails to + * be taken if the task is unable to sleep. 
+ * * If needed, take msc->probe_lock first. */ - raw_spinlock_t _mon_sel_lock; - unsigned long _mon_sel_flags; + struct mutex outer_mon_sel_lock; + bool outer_lock_held; + raw_spinlock_t inner_mon_sel_lock; + unsigned long inner_mon_sel_flags; void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; @@ -140,30 +144,56 @@ struct mpam_msc { struct mpam_garbage garbage; }; -/* Returning false here means accesses to mon_sel must fail and report an error. */ -static inline bool __must_check mpam_mon_sel_lock(struct mpam_msc *msc) +static inline bool __must_check mpam_mon_sel_inner_lock(struct mpam_msc *msc) { - /* Locking will require updating to support a firmware backed interface */ - if (WARN_ON_ONCE(msc->iface != MPAM_IFACE_MMIO)) - return false; + /* + * The outer lock may be taken by a CPU that then issues an IPI to run + * a helper that takes the inner lock. lockdep can't help us here. + */ + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + + if (msc->iface == MPAM_IFACE_MMIO) { + raw_spin_lock_irqsave(&msc->inner_mon_sel_lock, msc->inner_mon_sel_flags); + return true; + } + + /* Accesses must fail if we are not pre-emptible */ + return !!preemptible(); +} - raw_spin_lock_irqsave(&msc->_mon_sel_lock, msc->_mon_sel_flags); - return true; +static inline void mpam_mon_sel_inner_unlock(struct mpam_msc *msc) +{ + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + + if (msc->iface == MPAM_IFACE_MMIO) + raw_spin_unlock_irqrestore(&msc->inner_mon_sel_lock, msc->inner_mon_sel_flags); +} + +static inline void mpam_mon_sel_outer_lock(struct mpam_msc *msc) +{ + mutex_lock(&msc->outer_mon_sel_lock); + msc->outer_lock_held = true; } -static inline void mpam_mon_sel_unlock(struct mpam_msc *msc) +static inline void mpam_mon_sel_outer_unlock(struct mpam_msc *msc) { - raw_spin_unlock_irqrestore(&msc->_mon_sel_lock, msc->_mon_sel_flags); + msc->outer_lock_held = false; + mutex_unlock(&msc->outer_mon_sel_lock); } static inline void mpam_mon_sel_lock_held(struct mpam_msc *msc) { - 
lockdep_assert_held_once(&msc->_mon_sel_lock); + WARN_ON_ONCE(!READ_ONCE(msc->outer_lock_held)); + if (msc->iface == MPAM_IFACE_MMIO) + lockdep_assert_held_once(&msc->inner_mon_sel_lock); + else + lockdep_assert_preemption_enabled(); } static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) { - raw_spin_lock_init(&msc->_mon_sel_lock); + raw_spin_lock_init(&msc->inner_mon_sel_lock); + mutex_init(&msc->outer_mon_sel_lock); } /* Bits for mpam features bitmaps */ From 434f0c926091e41504d448305167d23f58dd5686 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 29 Oct 2021 16:13:51 +0100 Subject: [PATCH 065/115] NVIDIA: VR: SAUCE: arm_mpam: Allow the maximum partid to be overridden from the command line MPAMs bandwidth monitors are only available via resctrl if there are enough monitors for each combination of partid and pmg to have one. As it is unlikely anyone built that many monitors, allow the maximum partid the system will use to be set from the kernel command-line. With this, it should be possible for bandwidth monitors to be enabled by reducing the number of partid in use. 
Signed-off-by: James Morse (cherry picked from commit f12f00ec8d977b8ea8c78986ef34cd9c898e8b2b https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 5568e675a526d..30b29ff987c1a 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ static DEFINE_MUTEX(mpam_cpuhp_state_lock); u16 mpam_partid_max; u8 mpam_pmg_max; static bool partid_max_init, partid_max_published; +static u16 mpam_cmdline_partid_max; +static bool mpam_cmdline_partid_max_overridden; static DEFINE_SPINLOCK(partid_max_lock); /* @@ -302,6 +305,9 @@ int mpam_register_requestor(u16 partid_max, u8 pmg_max) return -EBUSY; } + if (mpam_cmdline_partid_max_overridden) + mpam_partid_max = min(mpam_cmdline_partid_max, mpam_partid_max); + return 0; } EXPORT_SYMBOL(mpam_register_requestor); @@ -3122,6 +3128,38 @@ static int __init mpam_msc_driver_init(void) /* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ subsys_initcall(mpam_msc_driver_init); +static int mpam_cmdline_partid_max_set(const char *arg, + const struct kernel_param *kp) +{ + int ret; + + spin_lock(&partid_max_lock); + ret = kstrtou16(arg, 10, &mpam_cmdline_partid_max); + if (!ret) + mpam_cmdline_partid_max_overridden = true; + spin_unlock(&partid_max_lock); + + return 0; +} +static int mpam_cmdline_partid_max_get(char *buffer, + const struct kernel_param *kp) +{ + u16 val = 0xffff; + + spin_lock(&partid_max_lock); + if (mpam_cmdline_partid_max_overridden) + val = mpam_cmdline_partid_max; + spin_unlock(&partid_max_lock); + + return sprintf(buffer, "%u\n", val); +} +static const struct kernel_param_ops 
mpam_cmdline_partid_max_ops = { .set = mpam_cmdline_partid_max_set, .get = mpam_cmdline_partid_max_get, }; module_param_cb(partid_max, &mpam_cmdline_partid_max_ops, NULL, 0644); MODULE_PARM_DESC(partid_max, "Override for reducing the number of PARTID."); #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_devices.c" #endif From f8daf5cbde06ac4771ac9c2c3c373a51a955fb17 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 1 Jul 2025 17:03:13 +0100 Subject: [PATCH 066/115] NVIDIA: VR: SAUCE: arm_mpam: Allow MSC to be forced to have an unknown location The MPAM driver discovers which MSC control which system resources from firmware tables. The MPAM resctrl picking code then attempts to export platforms that are Xeon shaped via resctrl. Occasionally, the presence of one or more MSC prevents the platform being described as Xeon shaped, and exposed via resctrl. For example with CPU-less NUMA nodes. The additional node doesn't have an L3, so can't have domain-ids exposed for the 'MB' memory bandwidth controls. In this example, some users would prefer to control bandwidth on just the CPU nodes, instead of having nothing at all. Allow users an amount of wiggle room by allowing MSC to be forced to be treated as unknown. This effectively disables parts of the MPAM functionality. Unknown MSC are not disabled. They are still probed and contribute to the system wide properties.
Suggested-by: Dave Martin Signed-off-by: James Morse (cherry picked from commit 542e79e9f52b4a9889de0c586a9db2bed5ecfa03 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 64 +++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_internal.h | 2 ++ 2 files changed, 66 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 30b29ff987c1a..791df02bdefa4 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include "mpam_internal.h" @@ -623,6 +625,9 @@ int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, { int err; + if (mpam_force_unknown_msc_test(msc)) + type = MPAM_CLASS_UNKNOWN; + mutex_lock(&mpam_list_lock); err = mpam_ris_create_locked(msc, ris_idx, type, class_id, component_id); @@ -3160,6 +3165,65 @@ static const struct kernel_param_ops mpam_cmdline_partid_max_ops = { module_param_cb(partid_max, &mpam_cmdline_partid_max_ops, NULL, 0644); MODULE_PARM_DESC(partid_max, "Override for reducing the number of PARTID."); +static DEFINE_XARRAY(mpam_force_unkown_msc); + +static void mpam_force_unknown_msc_add(u32 msc_id, gfp_t gfp) +{ + xa_store(&mpam_force_unkown_msc, msc_id, xa_mk_value(msc_id), gfp); +} + +bool mpam_force_unknown_msc_test(struct mpam_msc *msc) +{ + return !!xa_load(&mpam_force_unkown_msc, msc->pdev->id); +} + +static int mpam_force_unknown_msc_set(const char *_str, + const struct kernel_param *kp) +{ + int err; + u32 val; + char *tok, *iter; + char *str __free(kfree) = kstrdup(_str, GFP_KERNEL); + + iter = str; + do { + tok = strsep(&iter, ","); + err = kstrtou32(tok, 10, &val); + if (err) { + pr_err("Failed to parse commandline: %d\n", err); + break; + } + mpam_force_unknown_msc_add(val, GFP_KERNEL); + } while (iter); + + return 0; +} +static int 
mpam_force_unknown_msc_get(char *buffer, + const struct kernel_param *kp) +{ + unsigned long index, count = 0; + int result = 0; + void *entry; + + xa_for_each(&mpam_force_unkown_msc, index, entry) { + if (count) + result += sprintf(buffer + result, ","); + + result += sprintf(buffer + result, "%lu", index); + count += 1; + } + + result += sprintf(buffer + result, "\n"); + + return result; +} +static const struct kernel_param_ops mpam_force_unknown_msc_ops = { + .set = mpam_force_unknown_msc_set, + .get = mpam_force_unknown_msc_get, +}; +subsys_param_cb(force_unknown_msc, &mpam_force_unknown_msc_ops, NULL, 0644); +MODULE_PARM_DESC(force_unknown_msc, "Disabling a set of probed MSC."); + #ifdef CONFIG_MPAM_KUNIT_TEST #include "test_mpam_devices.c" #endif diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 02966f5b3b947..d17c7512d807a 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -524,6 +524,8 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +bool mpam_force_unknown_msc_test(struct mpam_msc *msc); + #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); void mpam_resctrl_exit(void); From a0482194fb96cd5e8594fc9232f20486a76c0176 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 Sep 2022 18:00:40 +0100 Subject: [PATCH 067/115] NVIDIA: VR: SAUCE: fs/resctrl: Add this_is_not_abi mount option Some later things in the MPAM tree enable behaviour that resctrl doesn't have upstream. To make it clear to people using the out-of-tree code that they shouldn't be relying on this in user-space, add a mount option to enable this stuff. 
Signed-off-by: James Morse (cherry picked from commit 8bd00259ac52ebb244ced984c744135e8d7f4b7d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/internal.h | 3 +++ fs/resctrl/rdtgroup.c | 59 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 1a9b29119f88f..740e32a7c78e2 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -42,6 +42,7 @@ struct rdt_fs_context { bool enable_cdpl3; bool enable_mba_mbps; bool enable_debug; + bool enable_abi_playground; }; static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) @@ -314,6 +315,8 @@ struct mbm_state { u32 prev_bw; }; +DECLARE_STATIC_KEY_FALSE(resctrl_abi_playground); + extern struct mutex rdtgroup_mutex; static inline const char *rdt_kn_name(const struct kernfs_node *kn) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5dfdaa6f9d8ff..543584ba0e694 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -87,6 +87,9 @@ enum resctrl_event_id mba_mbps_default_event; static bool resctrl_debug; +/* Enable wacky behaviour that is not supported upstream. 
*/ +DEFINE_STATIC_KEY_FALSE(resctrl_abi_playground); + void rdt_last_cmd_clear(void) { lockdep_assert_held(&rdtgroup_mutex); @@ -2782,6 +2785,42 @@ static void schemata_list_destroy(void) } } +static void hack_file_mode(const char *name, u16 mode) +{ + struct rftype *rfts, *rft; + int len; + + mutex_lock(&rdtgroup_mutex); + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + rft->mode = mode; + } + + mutex_unlock(&rdtgroup_mutex); +} + +static void enable_abi_playground(void) +{ + static_key_enable(&resctrl_abi_playground.key); + + /* Make the tasks file read only */ + if (IS_ENABLED(CONFIG_CGROUP_RESCTRL)) + hack_file_mode("tasks", 0444); +} + +static void disable_abi_playground(void) +{ + static_key_disable(&resctrl_abi_playground.key); + + /* Make the tasks file read/write only */ + if (IS_ENABLED(CONFIG_CGROUP_RESCTRL)) + hack_file_mode("tasks", 0644); +} + static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); @@ -2792,6 +2831,9 @@ static int rdt_get_tree(struct fs_context *fc) DO_ONCE_SLEEPABLE(resctrl_arch_pre_mount); + if (ctx->enable_abi_playground) + enable_abi_playground(); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* @@ -2907,6 +2949,7 @@ enum rdt_param { Opt_cdpl2, Opt_mba_mbps, Opt_debug, + Opt_not_abi_playground, nr__rdt_params }; @@ -2915,6 +2958,13 @@ static const struct fs_parameter_spec rdt_fs_parameters[] = { fsparam_flag("cdpl2", Opt_cdpl2), fsparam_flag("mba_MBps", Opt_mba_mbps), fsparam_flag("debug", Opt_debug), + + /* + * Some of MPAM's out of tree code exposes things through resctrl + * that need much more discussion before they are considered for + * mainline. Add a mount option that can be used to hide these crimes. 
+ */ + fsparam_flag("this_is_not_abi", Opt_not_abi_playground), {} }; @@ -2945,6 +2995,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_debug: ctx->enable_debug = true; return 0; + case Opt_not_abi_playground: + ctx->enable_abi_playground = true; + return 0; } return -EINVAL; @@ -3191,6 +3244,9 @@ static void rdt_kill_sb(struct super_block *sb) kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); + + if (static_branch_unlikely(&resctrl_abi_playground)) + disable_abi_playground(); } static struct file_system_type rdt_fs_type = { @@ -4243,6 +4299,9 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (resctrl_debug) seq_puts(seq, ",debug"); + if (static_branch_unlikely(&resctrl_abi_playground)) + seq_puts(seq, ",this_is_not_abi"); + return 0; } From 7386548fb100d46e632539a5244fa37688bfa753 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 17 Sep 2021 13:19:13 +0100 Subject: [PATCH 068/115] NVIDIA: VR: SAUCE: iommu/arm-smmu-v3: Register SMMU capabilities with MPAM Traffic in the system can be tagged with a PARTID and PMG. Different requestors can support a different number of bits for these fields. Before MPAM can be used, the MPAM driver has to discover the minimum number of bits supported by any requestor, which affects the range of PARTID and PMG that can be used. Detect whether the SMMU supports MPAM, if it does provide the MPAM driver with the maximum PARTID and PMG values. 
Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 254691aaeac0832fd3daa0bab0ec5ba18c93bdc2 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c`; ] Signed-off-by: Fenghua Yu --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 28 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 6 +++++ 2 files changed, 34 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 821e7d3da07bb..fa71f04071299 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -4370,6 +4371,29 @@ static void arm_smmu_get_httu(struct arm_smmu_device *smmu, u32 reg) hw_features, fw_features); } +static void arm_smmu_mpam_register_smmu(struct arm_smmu_device *smmu) +{ + u16 partid_max; + u8 pmg_max; + u32 reg; + + if (!IS_ENABLED(CONFIG_ARM64_MPAM)) + return; + + if (!(smmu->features & ARM_SMMU_FEAT_MPAM)) + return; + + reg = readl_relaxed(smmu->base + ARM_SMMU_MPAMIDR); + if (!reg) + return; + + partid_max = FIELD_GET(SMMU_MPAMIDR_PARTID_MAX, reg); + pmg_max = FIELD_GET(SMMU_MPAMIDR_PMG_MAX, reg); + + if (mpam_register_requestor(partid_max, pmg_max)) + smmu->features &= ~ARM_SMMU_FEAT_MPAM; +} + static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) { u32 reg; @@ -4517,6 +4541,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) smmu->features |= ARM_SMMU_FEAT_RANGE_INV; if (FIELD_GET(IDR3_FWB, reg)) smmu->features |= ARM_SMMU_FEAT_S2FWB; + if (FIELD_GET(IDR3_MPAM, reg)) + smmu->features |= ARM_SMMU_FEAT_MPAM; if (FIELD_GET(IDR3_BBM, reg) == 2) smmu->features |= ARM_SMMU_FEAT_BBML2; @@ -4582,6 +4608,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) if (arm_smmu_sva_supported(smmu)) smmu->features |= 
ARM_SMMU_FEAT_SVA; + arm_smmu_mpam_register_smmu(smmu); + dev_info(smmu->dev, "oas %lu-bit (features 0x%08x)\n", smmu->oas, smmu->features); return 0; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 3c6d65d36164f..c7f9c179bb1f2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -58,6 +58,7 @@ struct arm_vsmmu; #define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR3 0xc +#define IDR3_MPAM (1 << 7) #define IDR3_FWB (1 << 8) #define IDR3_RIL (1 << 10) #define IDR3_BBM GENMASK(12, 11) @@ -169,6 +170,10 @@ struct arm_vsmmu; #define ARM_SMMU_PRIQ_IRQ_CFG1 0xd8 #define ARM_SMMU_PRIQ_IRQ_CFG2 0xdc +#define ARM_SMMU_MPAMIDR 0x130 +#define SMMU_MPAMIDR_PARTID_MAX GENMASK(15, 0) +#define SMMU_MPAMIDR_PMG_MAX GENMASK(23, 16) + #define ARM_SMMU_REG_SZ 0xe00 /* Common MSI config fields */ @@ -767,6 +772,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_HD (1 << 22) #define ARM_SMMU_FEAT_S2FWB (1 << 23) #define ARM_SMMU_FEAT_BBML2 (1 << 24) +#define ARM_SMMU_FEAT_MPAM (1 << 25) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) From d49f01485681f9799d2103d3761a35e49bd45c80 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 14 Sep 2021 17:57:42 +0100 Subject: [PATCH 069/115] NVIDIA: VR: SAUCE: iommu/arm-smmu-v3: Add mpam helpers to query and set state To allow an iommu_group to be moved between resctrl groups as if it were a CPU thread, the mpam driver needs to be able to set the partid and pmg for the iommu_group. Use the properties in the STE, as these only apply to one stream. The MPAM driver also needs to know the maximum partid and pmg values that the SMMU can generate. This allows it to determine the system-wide common supported range of values. Add a helper to return this id register. 
Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit d847012696d29c61687420d4a6621f1f9e9bf95d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/iommu/iommu.c`; - Resolve minor conflicts in `include/linux/iommu.h`; ] Signed-off-by: Fenghua Yu --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 92 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++ drivers/iommu/iommu.c | 6 ++ include/linux/iommu.h | 7 ++ 4 files changed, 112 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index fa71f04071299..b3047fd391b0d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3741,6 +3741,96 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static int arm_smmu_group_set_mpam(struct iommu_group *group, u16 partid, + u8 pmg) +{ + int i; + u32 sid; + unsigned long flags; + struct arm_smmu_ste *step; + struct iommu_domain *domain; + struct arm_smmu_device *smmu; + struct arm_smmu_master *master; + struct arm_smmu_cmdq_batch cmds; + struct arm_smmu_domain *smmu_domain; + struct arm_smmu_cmdq_ent cmd = { + .opcode = CMDQ_OP_CFGI_STE, + .cfgi = { + .leaf = true, + }, + }; + struct arm_smmu_master_domain *master_domain; + + domain = iommu_get_domain_for_group(group); + smmu_domain = to_smmu_domain(domain); + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + return -EIO; + smmu = smmu_domain->smmu; + + arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + master = master_domain->master; + + for (i = 0; i < master->num_streams; i++) { + sid = master->streams[i].id; + step = arm_smmu_get_step_for_sid(smmu, sid); + + /* These need locking if the VMSPtr is ever used */ + step->data[4] = 
FIELD_PREP(STRTAB_STE_4_PARTID, partid); + step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); + + cmd.cfgi.sid = sid; + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); + } + + master->partid = partid; + master->pmg = pmg; + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + arm_smmu_cmdq_batch_submit(smmu, &cmds); + + return 0; +} + +static int arm_smmu_group_get_mpam(struct iommu_group *group, u16 *partid, + u8 *pmg) +{ + int err = -EINVAL; + unsigned long flags; + struct iommu_domain *domain; + struct arm_smmu_master *master; + struct arm_smmu_domain *smmu_domain; + struct arm_smmu_master_domain *master_domain; + + domain = iommu_get_domain_for_group(group); + smmu_domain = to_smmu_domain(domain); + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + return -EIO; + + if (!partid && !pmg) + return 0; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + master = master_domain->master; + if (master) { + if (partid) + *partid = master->partid; + if (pmg) + *pmg = master->pmg; + err = 0; + } + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + return err; +} + static const struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, @@ -3754,6 +3844,8 @@ static const struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, + .get_group_qos_params = arm_smmu_group_get_mpam, + .set_group_qos_params = arm_smmu_group_set_mpam, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, .get_viommu_size = arm_smmu_get_viommu_size, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c7f9c179bb1f2..8ba26c819846b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ 
-275,6 +275,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_MEV (1UL << 19) #define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_S1STALLD (1UL << 27) +#define STRTAB_STE_1_S1MPAM (1UL << 26) #define STRTAB_STE_1_EATS GENMASK_ULL(29, 28) #define STRTAB_STE_1_EATS_ABT 0UL @@ -305,6 +306,10 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +#define STRTAB_STE_4_PARTID GENMASK_ULL(31, 16) + +#define STRTAB_STE_5_PMG GENMASK_ULL(7, 0) + /* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ #define STRTAB_STE_0_NESTING_ALLOWED \ cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \ @@ -856,6 +861,8 @@ struct arm_smmu_master { bool stall_enabled; unsigned int ssid_bits; unsigned int iopf_refcount; + u16 partid; + u8 pmg; }; /* SMMU private data for an IOMMU domain */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ee83850c70605..78f497b5d042e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2123,6 +2123,12 @@ void iommu_domain_free(struct iommu_domain *domain) } EXPORT_SYMBOL_GPL(iommu_domain_free); +struct iommu_domain *iommu_get_domain_for_group(struct iommu_group *group) +{ + return group->domain; +} +EXPORT_SYMBOL_GPL(iommu_get_domain_for_group); + /* * Put the group's domain back to the appropriate core-owned domain - either the * standard kernel-mode DMA configuration or an all-DMA-blocked domain. 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 555597b54083c..ae167bd251423 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -703,6 +703,12 @@ struct iommu_ops { struct iommu_domain *parent_domain, const struct iommu_user_data *user_data); + /* Per group IOMMU features */ + int (*get_group_qos_params)(struct iommu_group *group, u16 *partition, + u8 *perf_mon_grp); + int (*set_group_qos_params)(struct iommu_group *group, u16 partition, + u8 perf_mon_grp); + const struct iommu_domain_ops *default_domain_ops; struct module *owner; struct iommu_domain *identity_domain; @@ -910,6 +916,7 @@ extern int iommu_attach_device(struct iommu_domain *domain, extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); +extern struct iommu_domain *iommu_get_domain_for_group(struct iommu_group *group); struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, From d8833a44faaca4d2ccea15bc6dab8b0cc50222c1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 12:07:30 +0100 Subject: [PATCH 070/115] NVIDIA: VR: SAUCE: iommu: Add helpers to get and set the QoS state To allow an iommu_group to be moved between resctrl groups as if it were a CPU thread, the mpam driver needs to be able to set the partid and pmg for the iommu_group. Add helpers that call the iommu driver's get/set methods for these parameters. 
Signed-off-by: James Morse (cherry picked from commit 630242d2001b19a0f214de47640202efc3d09260 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/iommu.h`; ] Signed-off-by: Fenghua Yu --- drivers/iommu/iommu.c | 76 +++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 15 +++++++++ 2 files changed, 91 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 78f497b5d042e..01e609bd9cc0e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4103,3 +4103,79 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return ret; } #endif /* CONFIG_IRQ_MSI_IOMMU */ + +/* + * iommu_group_set_qos_params() - Set the QoS parameters for a group + * @group: the iommu group. + * @partition: the partition label all traffic from the group should use. + * @perf_mon_grp: the performance label all traffic from the group should use. + * + * Return: 0 on success, or an error. + */ +int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp) +{ + const struct iommu_ops *ops; + struct group_device *device; + int ret; + + mutex_lock(&group->mutex); + device = list_first_entry_or_null(&group->devices, typeof(*device), + list); + if (!device) { + ret = -ENODEV; + goto out_unlock; + } + + ops = dev_iommu_ops(device->dev); + if (!ops->set_group_qos_params) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = ops->set_group_qos_params(group, partition, perf_mon_grp); + +out_unlock: + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_set_qos_params, "IOMMUFD_INTERNAL"); + +/* + * iommu_group_get_qos_params() - Get the QoS parameters for a group + * @group: the iommu group. + * @partition: the partition label all traffic from the group uses. + * @perf_mon_grp: the performance label all traffic from the group uses. + * + * Return: 0 on success, or an error. 
+ */ +int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp) +{ + const struct iommu_ops *ops; + struct group_device *device; + int ret; + + mutex_lock(&group->mutex); + device = list_first_entry_or_null(&group->devices, typeof(*device), + list); + if (!device) { + ret = -ENODEV; + goto out_unlock; + } + + ops = dev_iommu_ops(device->dev); + if (!ops->get_group_qos_params) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + ret = ops->get_group_qos_params(group, partition, perf_mon_grp); + +out_unlock: + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_group_get_qos_params, "IOMMUFD_INTERNAL"); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ae167bd251423..2b91cb987645f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1196,6 +1196,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); +int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp); +int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp); /* PCI device reset functions */ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev); @@ -1523,6 +1527,17 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) } static inline void iommu_free_global_pasid(ioasid_t pasid) {} +static inline int iommu_group_set_qos_params(struct iommu_group *group, + u16 partition, u8 perf_mon_grp) +{ + return -ENODEV; +} + +static inline int iommu_group_get_qos_params(struct iommu_group *group, + u16 *partition, u8 *perf_mon_grp) +{ + return -ENODEV; +} static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) { From edd415b2eb4f8fcedc4659796a0112c3726efa07 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 16:19:43 +0100 Subject: [PATCH 071/115] NVIDIA: VR: SAUCE: iommu: Add 
helpers to retrieve iommu_groups by id or kobject ARM SMMU with MPAM support are able to mark streams of traffic with the QoS labels MPAM uses. The user-space interface for MPAM is the resctrl filesystem, which allows threads to be moved between groups, its natural to do the same for iommu_groups. The resctrl interface lists threads, so will also need to list iommu_groups, it will be necessary to walk the list of iommu_groups. To ensure this matches what user-space sees via sysfs, it is best to walk the kobjects. When making a change, resctrl will only have the id of a group. To avoid walking the list of kobjects in this case, add iommu_group_get_by_id(). Signed-off-by: James Morse (cherry picked from commit 9b7dcc8fab78bf2545b02c53add7af27c21e5e90 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/iommu/iommu.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 12 ++++++++++++ 2 files changed, 46 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 01e609bd9cc0e..7de7f615cae2c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1109,6 +1109,40 @@ struct iommu_group *iommu_group_alloc(void) } EXPORT_SYMBOL_GPL(iommu_group_alloc); +struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj) +{ + struct iommu_group *group; + + if (!iommu_group_kset || !group_kobj) + return NULL; + + group = container_of(group_kobj, struct iommu_group, kobj); + + kobject_get(group->devices_kobj); + kobject_put(&group->kobj); + + return group; +} + +struct iommu_group *iommu_group_get_by_id(int id) +{ + struct kobject *group_kobj; + const char *name; + + if (!iommu_group_kset) + return NULL; + + name = kasprintf(GFP_KERNEL, "%d", id); + if (!name) + return NULL; + + group_kobj = kset_find_obj(iommu_group_kset, name); + kfree(name); + + return iommu_group_get_from_kobj(group_kobj); +} +EXPORT_SYMBOL_GPL(iommu_group_get_by_id); + /** * iommu_group_get_iommudata - 
retrieve iommu_data registered for a group * @group: the group diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2b91cb987645f..e17b2201f5205 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -910,6 +910,8 @@ static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) { return iommu_paging_domain_alloc_flags(dev, 0); } +struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj); +extern struct iommu_group *iommu_group_get_by_id(int id); extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); @@ -1231,6 +1233,16 @@ static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) return ERR_PTR(-ENODEV); } +static inline struct iommu_group *iommu_group_get_from_kobj(struct kobject *group_kobj) +{ + return NULL; +} + +static inline struct iommu_group *iommu_group_get_by_id(int id) +{ + return NULL; +} + static inline void iommu_domain_free(struct iommu_domain *domain) { } From 71baec24a6fcc4d9c815bf47f1e0dc6406df17d3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 13:10:25 +0100 Subject: [PATCH 072/115] NVIDIA: VR: SAUCE: iommu: Add helper to retrieve iommu kset To walk the list of iommu groups visible in sysfs, resctrl needs access to iommu_group_kset. Expose it. 
Signed-off-by: James Morse (cherry picked from commit 99cc3d17db3cbf3038957b25429a98e2a6dd5a58 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/iommu/iommu.c | 5 +++++ include/linux/iommu.h | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7de7f615cae2c..5fd2f4d3beab2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1143,6 +1143,11 @@ struct iommu_group *iommu_group_get_by_id(int id) } EXPORT_SYMBOL_GPL(iommu_group_get_by_id); +struct kset *iommu_get_group_kset(void) +{ + return kset_get(iommu_group_kset); +} + /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e17b2201f5205..c3ef55df73ccd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -970,6 +970,7 @@ extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); +struct kset *iommu_get_group_kset(void); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); int iommu_set_pgtable_quirks(struct iommu_domain *domain, @@ -1401,6 +1402,11 @@ static inline int iommu_group_id(struct iommu_group *group) return -ENODEV; } +static inline struct kset *iommu_get_group_kset(void) +{ + return NULL; +} + static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { From 032978dca060ff95898a581f5a8ee99a00787d99 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 16:45:41 +0100 Subject: [PATCH 073/115] NVIDIA: VR: SAUCE: kobject: Add kset_get_next_obj() to allow a kset to be walked To expose iommu_groups via the resctrl filesystem, the resctrl driver needs to be able to walk the list of iommu_groups. These are exposed via sysfs as a kset. 
Add kset_get_next_obj() to allow resctrl to walk the kobjects in the kset. Signed-off-by: James Morse (cherry picked from commit 10d03a8e2abf6eb69227b8674463d8a70ceb9c94 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- include/linux/kobject.h | 2 ++ lib/kobject.c | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index bcb5d4e320015..89d35cbc36a54 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -200,6 +200,8 @@ static inline const struct kobj_type *get_ktype(const struct kobject *kobj) struct kobject *kset_find_obj(struct kset *, const char *); +struct kobject *kset_get_next_obj(struct kset *kset, struct kobject *prev); + /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/kernel/mm/ kobject for people to chain off of */ diff --git a/lib/kobject.c b/lib/kobject.c index 9c9ff0f5175fb..518d95cce9755 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -920,6 +920,27 @@ struct kobject *kset_find_obj(struct kset *kset, const char *name) } EXPORT_SYMBOL_GPL(kset_find_obj); +struct kobject *kset_get_next_obj(struct kset *kset, struct kobject *prev) +{ + struct kobject *k; + + spin_lock(&kset->list_lock); + + if (!prev) + k = list_first_entry_or_null(&kset->list, typeof(*k), entry); + else + k = list_next_entry(prev, entry); + + if (list_entry_is_head(k, &kset->list, entry)) + k = NULL; + + kobject_get(k); + spin_unlock(&kset->list_lock); + kobject_put(prev); + + return k; +} + static void kset_release(struct kobject *kobj) { struct kset *kset = container_of(kobj, struct kset, kobj); From 1040b3a8f8e9ca7565908d23a4a60ed524c2c868 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 31 Jul 2023 09:51:04 +0100 Subject: [PATCH 074/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add iommu helpers to get/set the partid and pmg SMMU that support MPAM can be configured to use a 
particular partid and pmg for a stream. The assignment of an iommu_group and its corresponding streams should be done via resctrl. Add helpers similar to setting a closid/rmid on a task. We need the same shifting if the CPUs are using CDP. The SMMU only takes one partid, conceptually its always making data accesses. Signed-off-by: James Morse (cherry picked from commit af9d3e292738a626d784e463509344b3dde55880 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/resctrl/Kconfig | 1 + drivers/resctrl/mpam_resctrl.c | 53 ++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index 672abea3b03cc..30f455dba5aa0 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -29,3 +29,4 @@ config ARM64_MPAM_RESCTRL_FS default y if ARM64_MPAM_DRIVER && RESCTRL_FS select RESCTRL_RMID_DEPENDS_ON_CLOSID select RESCTRL_ASSIGN_FIXED + select RESCTRL_IOMMU if ARM_SMMU_V3 diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 45d1d6121c49c..446b8496b9341 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,58 @@ bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) return (tsk_closid == closid) && (tsk_rmid == rmid); } +int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid) +{ + u16 partid; + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return 0; + + if (cdp_enabled) + partid = closid << 1; + else + partid = closid; + + return iommu_group_set_qos_params(group, partid, rmid); +} + +bool resctrl_arch_match_iommu_closid(struct iommu_group *group, u32 closid) +{ + u16 partid; + int err = iommu_group_get_qos_params(group, &partid, NULL); + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return false; + + if (err) + return false; + + if (cdp_enabled) + 
partid >>= 1; + + return (partid == closid); +} + +bool resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + u8 pmg; + u16 partid; + int err = iommu_group_get_qos_params(group, &partid, &pmg); + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU)) + return false; + + if (err) + return false; + + if (cdp_enabled) + partid >>= 1; + + return (partid == closid) && (rmid == pmg); +} + struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) { if (l >= RDT_NUM_RESOURCES) From c6ab75560bb839f22b2f33ea8b167af00bb94dba Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Sep 2021 17:11:58 +0100 Subject: [PATCH 075/115] NVIDIA: VR: SAUCE: fs/resctrl: Add support for assigning iommu_groups to resctrl groups Arm's MPAM has support for assigning devices behind an IOMMU to a control or monitor group. This can be used for device-passthrough for a VM, or user-space drivers using VFIO to ensure the device is either in the same control group as the CPU threads. Alternatively, the iommu_group may be assigned to a different control group with preferential schema values. Extend the resctrl tasks file to include iommu_groups. These appear as 'iommu_group:0', where 0 is the group number that can be found from /sys/kernel/iommu_groups/. iommu_groups can be moved between resctrl groups by writing this string in the same way as tasks are moved. No state is preserved by resctrl, an iommu_group that disappears will no longer be listed as being part of a resctrl group. A new iommu_group will appear in the default group. Add helpers to list and move iommu_groups. Architecture specific helpers are used to apply the closid/rmid to the iommu_group due to the way MPAM emulates CDP. 
Tested-by: Amit Singh Tomar Signed-off-by: James Morse (cherry picked from commit 8a09f730ab48859282f518615f993df7f9ccba2a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/acpi/arm64/mpam.c`; - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu --- drivers/acpi/arm64/mpam.c | 91 +++++++++++++++++++++++++++++++++-- fs/resctrl/Kconfig | 6 +++ fs/resctrl/rdtgroup.c | 99 ++++++++++++++++++++++++++++++++++++++- include/linux/resctrl.h | 28 +++++++++++ 4 files changed, 220 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c index 84963a20c3e78..68a41f031bf72 100644 --- a/drivers/acpi/arm64/mpam.c +++ b/drivers/acpi/arm64/mpam.c @@ -95,17 +95,51 @@ static void acpi_mpam_parse_irqs(struct platform_device *pdev, res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); } -static int acpi_mpam_parse_resource(struct mpam_msc *msc, +#define UUID_MPAM_INTERCONNECT_TABLE "fe2bd645-033b-49e6-9479-2e0b8b21d1cd" + +struct acpi_mpam_interconnect_descriptor_table { + u8 type_uuid[16]; + u32 num_descriptors; +}; + +struct acpi_mpam_interconnect_descriptor { + u32 source_id; + u32 destination_id; + u8 link_type; + u8 reserved[3]; +}; + +static int acpi_mpam_parse_resource(struct acpi_mpam_msc_node *tbl_msc, + struct mpam_msc *msc, struct acpi_mpam_resource_node *res) { + struct acpi_mpam_interconnect_descriptor_table *tbl_int_tbl; + struct acpi_mpam_interconnect_descriptor *tbl_int; + guid_t int_tbl_uuid, spec_uuid; int level, nid; u32 cache_id; + off_t offset; + /* + * Class IDs are somewhat arbitrary, but need to be co-ordinated. + * 0-N are caches, + * 64, 65: Interconnect, but ideally these would appear between the + * classes the controls are adjacent to. + * 128: SMMU, + * 192-192+level: Memory Side Caches, nothing checks that N is a + * small number. 
+ * 255: Memory Controllers + * + * ACPI devices would need a class id allocated based on the _HID. + * + * Classes that the mpam driver can't currently plumb into resctrl + * are registered as UNKNOWN. + */ switch (res->locator_type) { case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: cache_id = res->locator.cache_locator.cache_reference; level = find_acpi_cache_level_from_id(cache_id); - if (level <= 0) { + if (level <= 0 || level >= 64) { pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); return -EINVAL; } @@ -120,6 +154,57 @@ static int acpi_mpam_parse_resource(struct mpam_msc *msc, } return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, MPAM_CLASS_ID_DEFAULT, nid); + case ACPI_MPAM_LOCATION_TYPE_SMMU: + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_UNKNOWN, + 128, res->locator.smmu_locator.smmu_interface); + case ACPI_MPAM_LOCATION_TYPE_MEMORY_CACHE: + cache_id = res->locator.mem_cache_locator.reference; + level = res->locator.mem_cache_locator.level; + if (192 + level >= 255) { + pr_err_once("Bad level (%u) for memory side cache with reference %u\n", + level, cache_id); + return -EINVAL; + } + + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, + 192 + level, cache_id); + + case ACPI_MPAM_LOCATION_TYPE_INTERCONNECT: + /* Find the descriptor table, and check it lands in the parent msc */ + offset = res->locator.interconnect_ifc_locator.inter_connect_desc_tbl_off; + if (offset >= tbl_msc->length) { + pr_err_once("Bad offset (%lu) for interconnect descriptor on msc %u\n", + offset, tbl_msc->identifier); + return -EINVAL; + } + tbl_int_tbl = ACPI_ADD_PTR(struct acpi_mpam_interconnect_descriptor_table, + tbl_msc, offset); + guid_parse(UUID_MPAM_INTERCONNECT_TABLE, &spec_uuid); + import_guid(&int_tbl_uuid, tbl_int_tbl->type_uuid); + if (guid_equal(&spec_uuid, &int_tbl_uuid)) { + pr_err_once("Bad UUID for interconnect descriptor on msc %u\n", + tbl_msc->identifier); + return -EINVAL; + } + + offset += 
sizeof(*tbl_int_tbl); + offset += tbl_int_tbl->num_descriptors * sizeof(*tbl_int); + if (offset >= tbl_msc->length) { + pr_err_once("Bad num_descriptors (%u) for interconnect descriptor on msc %u\n", + tbl_int_tbl->num_descriptors, tbl_msc->identifier); + return -EINVAL; + } + + tbl_int = ACPI_ADD_PTR(struct acpi_mpam_interconnect_descriptor, + tbl_int_tbl, sizeof(*tbl_int_tbl)); + cache_id = tbl_int->source_id; + + /* Unknown link type? */ + if (tbl_int->link_type != 0 && tbl_int->link_type == 1) + return 0; + + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_UNKNOWN, + 64 + tbl_int->link_type, cache_id); default: /* These get discovered later and are treated as unknown */ return 0; @@ -150,7 +235,7 @@ int acpi_mpam_parse_resources(struct mpam_msc *msc, return -EINVAL; } - err = acpi_mpam_parse_resource(msc, resource); + err = acpi_mpam_parse_resource(tbl_msc, msc, resource); if (err) return err; diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig index 21671301bd8a4..145d837c190a3 100644 --- a/fs/resctrl/Kconfig +++ b/fs/resctrl/Kconfig @@ -37,3 +37,9 @@ config RESCTRL_RMID_DEPENDS_ON_CLOSID Enabled by the architecture when the RMID values depend on the CLOSID. This causes the CLOSID allocator to search for CLOSID with clean RMID. + +config RESCTRL_IOMMU + bool + help + Enabled by the architecture when some IOMMU are able to be configured + with CLOSID/RMID. 
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 543584ba0e694..3982ce02ed548 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -769,10 +770,65 @@ static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, return ret; } +static int rdtgroup_move_iommu(int iommu_group_id, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + const struct cred *cred = current_cred(); + struct iommu_group *iommu_group; + int err; + + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID)) { + rdt_last_cmd_printf("No permission to move iommu_group %d\n", + iommu_group_id); + return -EPERM; + } + + iommu_group = iommu_group_get_by_id(iommu_group_id); + if (!iommu_group) { + rdt_last_cmd_printf("No matching iommu_group %d\n", + iommu_group_id); + return -ESRCH; + } + + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_iommu_closid(iommu_group, + rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move iommu_group to different control group\n"); + err = -EINVAL; + } else { + err = resctrl_arch_set_iommu_closid_rmid(iommu_group, + rdtgrp->closid, + rdtgrp->mon.rmid); + } + + iommu_group_put(iommu_group); + + return err; +} + +static bool string_is_iommu_group(char *buf, int *val) +{ + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU) || + !static_branch_unlikely(&resctrl_abi_playground)) + return false; + + if (strlen(buf) <= strlen("iommu_group:")) + return false; + + if (strncmp(buf, "iommu_group:", strlen("iommu_group:"))) + return false; + + buf += strlen("iommu_group:"); + + return !kstrtoint(buf, 0, val); +} + static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdtgroup *rdtgrp; + int iommu_group_id; + bool is_iommu; char *pid_str; int ret = 0; pid_t pid; @@ -794,7 +850,10 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, while (buf && buf[0] != '\0' && buf[0] != '\n') { pid_str = 
strim(strsep(&buf, ",")); - if (kstrtoint(pid_str, 0, &pid)) { + is_iommu = string_is_iommu_group(pid_str, &iommu_group_id); + if (is_iommu) + ret = rdtgroup_move_iommu(iommu_group_id, rdtgrp, of); + else if (kstrtoint(pid_str, 0, &pid)) { rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); ret = -EINVAL; break; @@ -819,6 +878,42 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, return ret ?: nbytes; } +static bool iommu_matches_rdtgroup(struct iommu_group *group, struct rdtgroup *r) +{ + if (r->type == RDTCTRL_GROUP) + return resctrl_arch_match_iommu_closid(group, r->closid); + + return resctrl_arch_match_iommu_closid_rmid(group, r->closid, + r->mon.rmid); +} + +static void show_rdt_iommu(struct rdtgroup *r, struct seq_file *s) +{ + struct kset *iommu_groups; + struct iommu_group *group; + struct kobject *group_kobj = NULL; + + if (!IS_ENABLED(CONFIG_RESCTRL_IOMMU) || + !static_branch_unlikely(&resctrl_abi_playground)) + return; + + iommu_groups = iommu_get_group_kset(); + + while ((group_kobj = kset_get_next_obj(iommu_groups, group_kobj))) { + /* iommu_group_get_from_kobj() wants to drop a reference */ + kobject_get(group_kobj); + + group = iommu_group_get_from_kobj(group_kobj); + if (!group) + continue; + + if (iommu_matches_rdtgroup(group, r)) + seq_printf(s, "iommu_group:%s\n", group_kobj->name); + } + + kset_put(iommu_groups); +} + static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) { struct task_struct *p, *t; @@ -833,6 +928,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) } } rcu_read_unlock(); + + show_rdt_iommu(r, s); } static int rdtgroup_tasks_show(struct kernfs_open_file *of, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b236c4e9cb619..3ada7ed9f8bec 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -3,6 +3,7 @@ #define _RESCTRL_H #include +#include #include #include #include @@ -707,6 +708,7 @@ extern unsigned int resctrl_rmid_realloc_limit; 
int resctrl_init(void); void resctrl_exit(void); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK u64 resctrl_arch_get_prefetch_disable_bits(void); int resctrl_arch_pseudo_lock_fn(void *_plr); @@ -720,4 +722,30 @@ static inline int resctrl_arch_measure_cycles_lat_fn(void *_plr) { return 0; } static inline int resctrl_arch_measure_l2_residency(void *_plr) { return 0; } static inline int resctrl_arch_measure_l3_residency(void *_plr) { return 0; } #endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ + +/* When supported, the architecture must implement these */ +#ifdef CONFIG_RESCTRL_IOMMU +int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid); +bool resctrl_arch_match_iommu_closid(struct iommu_group *group, u32 closid); +bool resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, u32 closid, + u32 rmid); +#else +static inline int resctrl_arch_set_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + return -EOPNOTSUPP; +} +static inline bool resctrl_arch_match_iommu_closid(struct iommu_group *group, + u32 closid) +{ + return false; +} +static inline bool +resctrl_arch_match_iommu_closid_rmid(struct iommu_group *group, + u32 closid, u32 rmid) +{ + return false; +} +#endif /* CONFIG_RESCTRL_IOMMU */ #endif /* _RESCTRL_H */ From aed9791a593ef512ee035bb3f19abf680fc3f894 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 21 May 2024 11:26:16 +0100 Subject: [PATCH 076/115] NVIDIA: VR: SAUCE: firmware: arm_scmi: add MPAM-FB SCMI protocol stub The Arm MPAM Firmware-backed (Fb) Profile describes an SCMI based protocol to access "Memory System Components" (MSCs) in an "Memory System Resource Partitioning And Monitoring" (MPAM) enabled system. Although this SCMI protocol follows the usual protocol properties, it will not be described in the SCMI specifications. 
Also since ACPI based systems will need to use this MPAM-fb profile, we do not follow the usual way of describing each protocol function as a function in the SCMI framework system. Instead there is one generic transport function, that takes a preformatted buffer and transfers this to the MSC agent. Signed-off-by: Andre Przywara Signed-off-by: James Morse (cherry picked from commit 262ebed25a96271fbeb1fc0b3128d68203202aca https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Update to new version APIs in scmi_mpam_protocol_init(); ] Signed-off-by: Fenghua Yu --- drivers/firmware/arm_scmi/Makefile | 2 +- drivers/firmware/arm_scmi/driver.c | 2 + drivers/firmware/arm_scmi/mpam.c | 60 +++++++++++++++++++++++++++ drivers/firmware/arm_scmi/protocols.h | 1 + drivers/resctrl/mpam_devices.c | 2 +- include/linux/scmi_protocol.h | 12 ++++++ 6 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 drivers/firmware/arm_scmi/mpam.c diff --git a/drivers/firmware/arm_scmi/Makefile b/drivers/firmware/arm_scmi/Makefile index 780cd62b2f78a..caa61f16d12fc 100644 --- a/drivers/firmware/arm_scmi/Makefile +++ b/drivers/firmware/arm_scmi/Makefile @@ -8,7 +8,7 @@ scmi-driver-$(CONFIG_ARM_SCMI_RAW_MODE_SUPPORT) += raw_mode.o scmi-transport-$(CONFIG_ARM_SCMI_HAVE_SHMEM) = shmem.o scmi-transport-$(CONFIG_ARM_SCMI_HAVE_MSG) += msg.o scmi-protocols-y := base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o powercap.o -scmi-protocols-y += pinctrl.o +scmi-protocols-y += pinctrl.o mpam.o scmi-module-objs := $(scmi-driver-y) $(scmi-protocols-y) $(scmi-transport-y) obj-$(CONFIG_ARM_SCMI_PROTOCOL) += transports/ diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c index 3e76a3204ba4f..b1df87fbd4460 100644 --- a/drivers/firmware/arm_scmi/driver.c +++ b/drivers/firmware/arm_scmi/driver.c @@ -3498,6 +3498,7 @@ static int __init scmi_driver_init(void) scmi_system_register(); scmi_powercap_register(); 
scmi_pinctrl_register(); + scmi_mpam_register(); return platform_driver_register(&scmi_driver); } @@ -3516,6 +3517,7 @@ static void __exit scmi_driver_exit(void) scmi_system_unregister(); scmi_powercap_unregister(); scmi_pinctrl_unregister(); + scmi_mpam_unregister(); platform_driver_unregister(&scmi_driver); diff --git a/drivers/firmware/arm_scmi/mpam.c b/drivers/firmware/arm_scmi/mpam.c new file mode 100644 index 0000000000000..affb2ff7032c9 --- /dev/null +++ b/drivers/firmware/arm_scmi/mpam.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * System Control and Management Interface (SCMI) MPAM Protocol + * + * Copyright (C) 2024 ARM Ltd. + */ + +#include "common.h" +#include + +#define SCMI_PROTOCOL_SUPPORTED_VERSION 0x10000 + +static int scmi_mpam_transfer_buf(const struct scmi_protocol_handle *ph, + u8 msg_id, void *msg_buf, size_t msg_len, + u32 *ret_val) +{ + int ret; + struct scmi_xfer *t; + + ret = ph->xops->xfer_get_init(ph, msg_id, msg_len, + ret_val ? sizeof(*ret_val) : 0, &t); + if (ret) + return ret; + + memcpy(t->tx.buf, msg_buf, msg_len); + + ret = ph->xops->do_xfer(ph, t); + if (!ret && ret_val) { + u32 value; + + memcpy(&value, t->rx.buf, sizeof(value)); + *ret_val = le32_to_cpu((__le32)value); + } + + ph->xops->xfer_put(ph, t); + + return ret; +} + +static const struct scmi_mpam_proto_ops mpam_proto_ops = { + .mpam_transfer_buf = scmi_mpam_transfer_buf, +}; + +static int scmi_mpam_protocol_init(const struct scmi_protocol_handle *ph) +{ + dev_dbg(ph->dev, "SCMI MPAM Version %d.%d\n", + PROTOCOL_REV_MAJOR(ph->version), PROTOCOL_REV_MINOR(ph->version)); + + return 0; +} + +static const struct scmi_protocol scmi_mpam = { + .id = SCMI_PROTOCOL_MPAM, + .owner = THIS_MODULE, + .instance_init = &scmi_mpam_protocol_init, + .ops = &mpam_proto_ops, + .supported_version = SCMI_PROTOCOL_SUPPORTED_VERSION, +}; + +DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(mpam, scmi_mpam) diff --git a/drivers/firmware/arm_scmi/protocols.h 
b/drivers/firmware/arm_scmi/protocols.h index f51245aca2594..aa1142ea1b4a0 100644 --- a/drivers/firmware/arm_scmi/protocols.h +++ b/drivers/firmware/arm_scmi/protocols.h @@ -380,5 +380,6 @@ DECLARE_SCMI_REGISTER_UNREGISTER(sensors); DECLARE_SCMI_REGISTER_UNREGISTER(voltage); DECLARE_SCMI_REGISTER_UNREGISTER(system); DECLARE_SCMI_REGISTER_UNREGISTER(powercap); +DECLARE_SCMI_REGISTER_UNREGISTER(mpam); #endif /* _SCMI_PROTOCOLS_H */ diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 791df02bdefa4..de109146e5e95 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2058,7 +2058,6 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) u32 tmp; char name[20]; struct mpam_msc *msc; - struct resource *msc_res; struct device *dev = &pdev->dev; lockdep_assert_held(&mpam_list_lock); @@ -2108,6 +2107,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) if (msc->iface == MPAM_IFACE_MMIO) { void __iomem *io; + struct resource *msc_res; io = devm_platform_get_and_ioremap_resource(pdev, 0, &msc_res); diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index aafaac1496b06..df75deeeed8b5 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -877,6 +877,17 @@ struct scmi_notify_ops { struct notifier_block *nb); }; +/** + * struct scmi_mpam_proto_ops - operations provided by SCMI MPAM Protocol + * + * @mpam_transfer_buf: transfer an SCMI MPAM message to the agent + */ +struct scmi_mpam_proto_ops { + int (*mpam_transfer_buf)(const struct scmi_protocol_handle *ph, + u8 msg_id, void *msg_buf, size_t msg_len, + u32 *ret_val); +}; + /** * struct scmi_handle - Handle returned to ARM SCMI clients for usage. 
* @@ -926,6 +937,7 @@ enum scmi_std_protocol { SCMI_PROTOCOL_VOLTAGE = 0x17, SCMI_PROTOCOL_POWERCAP = 0x18, SCMI_PROTOCOL_PINCTRL = 0x19, + SCMI_PROTOCOL_MPAM = 0x1a, }; enum scmi_system_events { From e4930378ebf93332e68b6aab27f2f4da29acda9c Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 23 Sep 2025 17:20:22 +0100 Subject: [PATCH 077/115] NVIDIA: VR: SAUCE: arm_mpam: add MPAM-FB MSC firmware access support The Arm MPAM Firmware-backed (Fb) Profile document[1] describes an alternative way of accessing the "Memory System Components" (MSC) in an MPAM enabled system. Normally the MSCs are MMIO mapped, but in some implementations this might not be possible (MSC located outside of the local socket, MSC mapped secure-only) or desirable (direct MMIO access too slow or needs to be mediated through a control processor). MPAM-fb standardises a protocol to abstract MSC accesses, building on the SCMI protocol. Add functions that do an MSC read or write access by redirecting the request through a firmware interface. This can either be through any supported SCMI transport, described via devicetree nodes, or via an ACPI PCC shared memory and mailbox combination. 
Signed-off-by: Andre Przywara Signed-off-by: James Morse (cherry picked from commit fe986189a0c6bd12557f1a9becb2b73dfe20153f https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; - Resolve minor conflicts in `drivers/resctrl/mpam_internal.h`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/Makefile | 2 +- drivers/resctrl/mpam_devices.c | 64 +++++++++- drivers/resctrl/mpam_fb.c | 207 ++++++++++++++++++++++++++++++++ drivers/resctrl/mpam_fb.h | 26 ++++ drivers/resctrl/mpam_internal.h | 8 ++ include/linux/arm_mpam.h | 1 + 6 files changed, 301 insertions(+), 7 deletions(-) create mode 100644 drivers/resctrl/mpam_fb.c create mode 100644 drivers/resctrl/mpam_fb.h diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 4f6d0e81f9b8f..097c036724e97 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o -mpam-y += mpam_devices.o +mpam-y += mpam_devices.o mpam_fb.o mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index de109146e5e95..a2969b713dbf4 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -30,7 +30,10 @@ #include #include +#include + #include "mpam_internal.h" +#include "mpam_fb.h" /* Values for the T241 errata workaround */ #define T241_CHIPS_MAX 4 @@ -182,6 +185,16 @@ static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) { WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + if (msc->iface == MPAM_IFACE_SCMI) { + u32 ret; + + mpam_fb_send_read_request(&msc->mpam_fb_chan, + msc->mpam_fb_msc_id, reg, &ret); + return ret; + } + + WARN_ON_ONCE(reg + sizeof(u32) > msc->mapped_hwpage_sz); + return readl_relaxed(msc->mapped_hwpage + reg); } @@ -195,10 +208,15 @@ static inline u32 
_mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) static void __mpam_write_reg(struct mpam_msc *msc, u16 reg, u32 val) { - WARN_ON_ONCE(reg + sizeof(u32) > msc->mapped_hwpage_sz); WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); - writel_relaxed(val, msc->mapped_hwpage + reg); + if (msc->iface == MPAM_IFACE_SCMI) { + mpam_fb_send_write_request(&msc->mpam_fb_chan, + msc->mpam_fb_msc_id, reg, val); + } else { + WARN_ON_ONCE(reg + sizeof(u32) >= msc->mapped_hwpage_sz); + writel_relaxed(val, msc->mapped_hwpage + reg); + } } static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 val) @@ -2041,6 +2059,11 @@ static void mpam_msc_destroy(struct mpam_msc *msc) add_to_garbage(msc); } +static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg) +{ + /* TODO: wake up tasks blocked on this MSC's PCC channel */ +} + static void mpam_msc_drv_remove(struct platform_device *pdev) { struct mpam_msc *msc = platform_get_drvdata(pdev); @@ -2055,9 +2078,9 @@ static void mpam_msc_drv_remove(struct platform_device *pdev) static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) { int err; - u32 tmp; char name[20]; struct mpam_msc *msc; + struct of_phandle_args of_args; struct device *dev = &pdev->dev; lockdep_assert_held(&mpam_list_lock); @@ -2100,10 +2123,16 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) if (err) return ERR_PTR(err); - if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) - msc->iface = MPAM_IFACE_MMIO; - else + if (!device_property_read_u32(&pdev->dev, "pcc-channel", + &msc->pcc_subspace_id)) { msc->iface = MPAM_IFACE_PCC; + } else if (!of_parse_phandle_with_fixed_args(pdev->dev.of_node, + "mpam-fb", 1, 0, + &of_args)) { + msc->iface = MPAM_IFACE_SCMI; + } else { + msc->iface = MPAM_IFACE_MMIO; + } if (msc->iface == MPAM_IFACE_MMIO) { void __iomem *io; @@ -2117,6 +2146,29 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) } 
msc->mapped_hwpage_sz = msc_res->end - msc_res->start; msc->mapped_hwpage = io; + } else if (msc->iface == MPAM_IFACE_PCC) { + msc->pcc_cl.dev = &pdev->dev; + msc->pcc_cl.rx_callback = mpam_pcc_rx_callback; + msc->pcc_cl.tx_block = false; + msc->pcc_cl.tx_tout = 1000; /* 1s */ + msc->pcc_cl.knows_txdone = false; + + msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl, + msc->pcc_subspace_id); + if (IS_ERR(msc->pcc_chan)) { + pr_err("Failed to request MSC PCC channel\n"); + return (void *)msc->pcc_chan; + } + } else if (msc->iface == MPAM_IFACE_SCMI) { + err = mpam_fb_connect_channel(of_args.np, + &msc->mpam_fb_chan); + if (err < 0) + return ERR_PTR(err); + + if (of_args.args_count > 0) + msc->mpam_fb_msc_id = of_args.args[0]; + else + msc->mpam_fb_msc_id = 0; } else { return ERR_PTR(-EINVAL); } diff --git a/drivers/resctrl/mpam_fb.c b/drivers/resctrl/mpam_fb.c new file mode 100644 index 0000000000000..af87a9e934cd0 --- /dev/null +++ b/drivers/resctrl/mpam_fb.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2024 Arm Ltd. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_fb.h" + +#define MPAM_MSC_ATTRIBUTES 0x3 +#define MPAM_MSC_READ 0x4 +#define MPAM_MSC_WRITE 0x5 + +static const struct scmi_mpam_proto_ops *mpam_scmi_ops; + +static DEFINE_MUTEX(scmi_agent_list_mutex); +static LIST_HEAD(smci_agent_list); + +struct scmi_mpam_agent { + struct list_head list; + struct device_node *of_node; + struct scmi_protocol_handle *ph_handle; +}; + +#define SCMI_BUF_LENGTH_IDX 4 +#define SCMI_BUF_HEADER_IDX 5 +#define SCMI_BUF_PAYLOAD_IDX 6 +#define SCMI_READ_MSG_SIZE 9 +#define SCMI_WRITE_MSG_SIZE 10 + +static int mpam_fb_build_read_message(int msc_id, int reg, u32 *msg_buf) +{ + memset(msg_buf, 0, SCMI_READ_MSG_SIZE * sizeof(u32)); + + msg_buf[SCMI_BUF_LENGTH_IDX] = SCMI_READ_MSG_SIZE * sizeof(u32); + msg_buf[SCMI_BUF_HEADER_IDX] = MPAM_MSC_READ | (0x1a << 10); + msg_buf[SCMI_BUF_PAYLOAD_IDX + 0] = msc_id; + msg_buf[SCMI_BUF_PAYLOAD_IDX + 2] = reg; + + return SCMI_READ_MSG_SIZE * sizeof(u32); +} + +static int mpam_fb_build_write_message(int msc_id, int reg, u32 val, + u32 *msg_buf) +{ + memset(msg_buf, 0, SCMI_WRITE_MSG_SIZE * sizeof(u32)); + + msg_buf[SCMI_BUF_LENGTH_IDX] = SCMI_WRITE_MSG_SIZE * sizeof(u32); + msg_buf[SCMI_BUF_HEADER_IDX] = MPAM_MSC_WRITE | (0x1a << 10); + msg_buf[SCMI_BUF_PAYLOAD_IDX + 0] = msc_id; + msg_buf[SCMI_BUF_PAYLOAD_IDX + 2] = reg; + msg_buf[SCMI_BUF_PAYLOAD_IDX + 3] = val; + + return SCMI_WRITE_MSG_SIZE * sizeof(u32); +} + +static struct scmi_protocol_handle *scmi_agent_get_ph(const struct device_node *np) +{ + struct scmi_mpam_agent *agent; + struct scmi_protocol_handle *ph = NULL; + + mutex_lock(&scmi_agent_list_mutex); + + list_for_each_entry(agent, &smci_agent_list, list) { + if (np == agent->of_node) { + ph = agent->ph_handle; + break; + } + } + + mutex_unlock(&scmi_agent_list_mutex); + + return ph; +} + +int 
mpam_fb_connect_channel(const struct device_node *of_node, + struct mpam_fb_channel *chan) +{ + int msc_id = 0; + + chan->ph_handle = scmi_agent_get_ph(of_node); + if (!chan->ph_handle) + return -EPROBE_DEFER; + + chan->use_scmi = true; + + return msc_id; +} + +int mpam_fb_send_read_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 *result) +{ + u32 msg_buf[12]; + int msg_len; + + msg_len = mpam_fb_build_read_message(msc_id, reg, msg_buf); + + if (chan->use_scmi) { + /* The SCMI layer adds the shared memory header itself. */ + msg_len -= SCMI_BUF_PAYLOAD_IDX * sizeof(u32); + + mpam_scmi_ops->mpam_transfer_buf(chan->ph_handle, + MPAM_MSC_READ, + msg_buf + SCMI_BUF_PAYLOAD_IDX, + msg_len, result); + + return 0; + } + + if (msg_len < chan->pcc_shmem_size) + return -EINVAL; + + memcpy(chan->pcc_shmem, msg_buf, msg_len); + mbox_send_message(chan->pcc_mbox, NULL); + + return 0; +} + +int mpam_fb_send_write_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 value) +{ + u32 msg_buf[12]; + int msg_len; + + msg_len = mpam_fb_build_write_message(msc_id, reg, value, msg_buf); + if (msg_len < 0) + return msg_len; + + if (chan->use_scmi) { + /* The SCMI layer adds the shared memory header itself. 
*/ + msg_len -= SCMI_BUF_PAYLOAD_IDX * sizeof(u32); + + mpam_scmi_ops->mpam_transfer_buf(chan->ph_handle, + MPAM_MSC_WRITE, + msg_buf + SCMI_BUF_PAYLOAD_IDX, + msg_len, NULL); + + return 0; + } + + if (msg_len < chan->pcc_shmem_size) + return -EINVAL; + + memcpy(chan->pcc_shmem, msg_buf, msg_len); + mbox_send_message(chan->pcc_mbox, NULL); + + return 0; +} + +static int scmi_mpam_probe(struct scmi_device *sdev) +{ + const struct scmi_handle *handle = sdev->handle; + struct scmi_protocol_handle *ph; + struct scmi_mpam_agent *agent; + + if (!handle) + return -ENODEV; + + mpam_scmi_ops = handle->devm_protocol_get(sdev, SCMI_PROTOCOL_MPAM, &ph); + if (IS_ERR(mpam_scmi_ops)) + return PTR_ERR(mpam_scmi_ops); + + agent = devm_kzalloc(&sdev->dev, sizeof(*agent), GFP_KERNEL); + if (!agent) + return -ENOMEM; + + agent->of_node = sdev->dev.of_node; + agent->ph_handle = ph; + + mutex_lock(&scmi_agent_list_mutex); + list_add(&agent->list, &smci_agent_list); + mutex_unlock(&scmi_agent_list_mutex); + + return 0; +} + +static void scmi_mpam_remove(struct scmi_device *sdev) +{ +} + +static const struct scmi_device_id scmi_id_table[] = { + { SCMI_PROTOCOL_MPAM, "mpam" }, + {}, +}; +MODULE_DEVICE_TABLE(scmi, scmi_id_table); + +static struct scmi_driver scmi_mpam_driver = { + .name = "scmi-mpam-driver", + .probe = scmi_mpam_probe, + .remove = scmi_mpam_remove, + .id_table = scmi_id_table, +}; +module_scmi_driver(scmi_mpam_driver); diff --git a/drivers/resctrl/mpam_fb.h b/drivers/resctrl/mpam_fb.h new file mode 100644 index 0000000000000..723e9c5a5e1e3 --- /dev/null +++ b/drivers/resctrl/mpam_fb.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2024 Arm Ltd. 
+ +#ifndef MPAM_FB_H_ +#define MPAM_FB_H_ + +#include +#include +#include + +struct mpam_fb_channel { + bool use_scmi; + struct scmi_protocol_handle *ph_handle; + void __iomem *pcc_shmem; + size_t pcc_shmem_size; + struct mbox_chan *pcc_mbox; +}; + +int mpam_fb_connect_channel(const struct device_node *of_node, + struct mpam_fb_channel *chan); +int mpam_fb_send_read_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 *result); +int mpam_fb_send_write_request(struct mpam_fb_channel *chan, int msc_id, + u16 reg, u32 value); + +#endif diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index d17c7512d807a..c9e2064ea49d7 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -17,9 +17,12 @@ #include #include #include +#include #include +#include "mpam_fb.h" + #define MPAM_MSC_MAX_NUM_RIS 16 struct platform_device; @@ -67,6 +70,11 @@ struct mpam_msc { /* Not modified after mpam_is_enabled() becomes true */ enum mpam_msc_iface iface; + u32 pcc_subspace_id; + struct mbox_client pcc_cl; + struct pcc_mbox_chan *pcc_chan; + struct mpam_fb_channel mpam_fb_chan; + int mpam_fb_msc_id; /* in its own name space */ u32 nrdy_usec; u64 nrdy_retry_count; cpumask_t accessibility; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 4ccf32fe07fd5..77b62491c448c 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -14,6 +14,7 @@ struct mpam_msc; enum mpam_msc_iface { MPAM_IFACE_MMIO, /* a real MPAM MSC */ MPAM_IFACE_PCC, /* a fake MPAM MSC */ + MPAM_IFACE_SCMI, /* through a firmware interface */ }; enum mpam_class_types { From 42757601a02da1db48ab91a82107b1c79351c79c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 28 Jun 2024 14:04:53 +0100 Subject: [PATCH 078/115] NVIDIA: VR: SAUCE: arm_mpam: Allow duplicate PCC subspace_ids Carl reports that some platforms use the same PCC channel for multiple MSCs, which leads to the driver not probing. 
Add a list that is searched each time a new channel is allocated. CC: Carl Worth Signed-off-by: James Morse (cherry picked from commit a9dade368ac57491282358d688c6e52d8ca9cfaa https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 110 +++++++++++++++++++++++++++++--- drivers/resctrl/mpam_internal.h | 3 +- 2 files changed, 103 insertions(+), 10 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index a2969b713dbf4..175e769fa1f4b 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -171,6 +171,102 @@ static void mpam_free_garbage(void) } } +static LIST_HEAD(mpam_pcc_channels); + +struct mpam_pcc_chan { + struct list_head pcc_channels_list; + + u32 refs; + u32 subspace_id; + struct pcc_mbox_chan *channel; + struct mbox_client pcc_cl; + + struct mpam_garbage garbage; +}; + +static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) +{ + struct mpam_pcc_chan *chan; + + lockdep_assert_held(&mpam_list_lock); + + chan = kzalloc(sizeof(*chan), gfp); + if (!chan) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD_RCU(&chan->pcc_channels_list); + chan->refs = 1; + chan->subspace_id = subspace_id; + /* + * TODO is the device important - these subspace_id can be re-used, so + * there is no one device to put here ... 
+ */ + chan->pcc_cl.rx_callback = mpam_pcc_rx_callback; + chan->pcc_cl.tx_block = false; + chan->pcc_cl.tx_tout = 1000; /* 1s */ + chan->pcc_cl.knows_txdone = false; + + chan->channel = pcc_mbox_request_channel(&chan->pcc_cl, subspace_id); + if (IS_ERR(chan->channel)) { + kfree(chan); + return NULL; + } + + init_garbage(&chan->garbage); + list_add(&chan->pcc_channels_list, &mpam_pcc_channels); + return chan->channel; +} + +static struct pcc_mbox_chan *mpam_pcc_get(u8 subspace_id, bool alloc, gfp_t gfp) +{ + bool found = false; + struct mpam_pcc_chan *chan; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { + if (chan->subspace_id == subspace_id) { + found = true; + break; + } + } + + if (found) { + chan->refs++; + return chan->channel; + } + + if (!alloc) + return ERR_PTR(-ENOENT); + + return mpam_pcc_alloc(subspace_id, gfp); +} + +static void mpam_pcc_put(u8 subspace_id) +{ + bool found = false; + struct mpam_pcc_chan *chan; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { + if (chan->subspace_id == subspace_id) { + found = true; + break; + } + } + + if (!found) + return; + + chan->refs--; + if (!chan->refs) { + list_del(&chan->pcc_channels_list); + pcc_mbox_free_channel(chan->channel); + add_to_garbage(chan); + } +} + /* * Once mpam is enabled, new requestors cannot further reduce the available * partid. 
Assert that the size is fixed, and new requestors will be turned @@ -2056,10 +2152,13 @@ static void mpam_msc_destroy(struct mpam_msc *msc) debugfs_remove_recursive(msc->debugfs); msc->debugfs = NULL; + if (msc->iface == MPAM_IFACE_PCC) + mpam_pcc_put(msc->pcc_subspace_id); + add_to_garbage(msc); } -static void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg) +void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg) { /* TODO: wake up tasks blocked on this MSC's PCC channel */ } @@ -2147,14 +2246,7 @@ static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) msc->mapped_hwpage_sz = msc_res->end - msc_res->start; msc->mapped_hwpage = io; } else if (msc->iface == MPAM_IFACE_PCC) { - msc->pcc_cl.dev = &pdev->dev; - msc->pcc_cl.rx_callback = mpam_pcc_rx_callback; - msc->pcc_cl.tx_block = false; - msc->pcc_cl.tx_tout = 1000; /* 1s */ - msc->pcc_cl.knows_txdone = false; - - msc->pcc_chan = pcc_mbox_request_channel(&msc->pcc_cl, - msc->pcc_subspace_id); + msc->pcc_chan = mpam_pcc_get(msc->pcc_subspace_id, true, GFP_KERNEL); if (IS_ERR(msc->pcc_chan)) { pr_err("Failed to request MSC PCC channel\n"); return (void *)msc->pcc_chan; diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index c9e2064ea49d7..0f35bf33c479f 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -71,7 +71,6 @@ struct mpam_msc { /* Not modified after mpam_is_enabled() becomes true */ enum mpam_msc_iface iface; u32 pcc_subspace_id; - struct mbox_client pcc_cl; struct pcc_mbox_chan *pcc_chan; struct mpam_fb_channel mpam_fb_chan; int mpam_fb_msc_id; /* in its own name space */ @@ -534,6 +533,8 @@ int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, bool mpam_force_unknown_msc_test(struct mpam_msc *msc); +void mpam_pcc_rx_callback(struct mbox_client *cl, void *msg); + #ifdef CONFIG_RESCTRL_FS int mpam_resctrl_setup(void); void mpam_resctrl_exit(void); From 74270b20e5e3e385664f79e10304a63859f50c61 
Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 2 Jul 2025 17:20:39 +0100 Subject: [PATCH 079/115] NVIDIA: VR: SAUCE: untested: mpam: Convert pcc_channels list to XArray and cleanup Squash this into the previous patch once it has been tested... ... does anyone have a PCC platform that can take this for a spin? Signed-off-by: James Morse (cherry picked from commit c83dd7f2625cde3d56af12e388d0bebfcfeec0f4 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 51 +++++++++++++--------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 175e769fa1f4b..6516cf400ca97 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -171,11 +171,9 @@ static void mpam_free_garbage(void) } } -static LIST_HEAD(mpam_pcc_channels); +static DEFINE_XARRAY(mpam_pcc_channels); struct mpam_pcc_chan { - struct list_head pcc_channels_list; - u32 refs; u32 subspace_id; struct pcc_mbox_chan *channel; @@ -184,17 +182,15 @@ struct mpam_pcc_chan { struct mpam_garbage garbage; }; -static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) +static struct mpam_pcc_chan *__mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) { - struct mpam_pcc_chan *chan; + struct mpam_pcc_chan *chan __free(kfree) = kzalloc(sizeof(*chan), gfp); lockdep_assert_held(&mpam_list_lock); - chan = kzalloc(sizeof(*chan), gfp); if (!chan) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD_RCU(&chan->pcc_channels_list); chan->refs = 1; chan->subspace_id = subspace_id; /* @@ -207,31 +203,29 @@ static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) chan->pcc_cl.knows_txdone = false; chan->channel = pcc_mbox_request_channel(&chan->pcc_cl, subspace_id); - if (IS_ERR(chan->channel)) { - kfree(chan); - return NULL; - } + if (IS_ERR(chan->channel)) + return ERR_CAST(chan->channel); init_garbage(&chan->garbage); - 
list_add(&chan->pcc_channels_list, &mpam_pcc_channels); - return chan->channel; + xa_store(&mpam_pcc_channels, subspace_id, chan, gfp); + + return_ptr(chan); +} + +static struct pcc_mbox_chan *mpam_pcc_alloc(u8 subspace_id, gfp_t gfp) +{ + struct mpam_pcc_chan *chan = __mpam_pcc_alloc(subspace_id, gfp); + return IS_ERR(chan) ? ERR_CAST(chan) : chan->channel; } static struct pcc_mbox_chan *mpam_pcc_get(u8 subspace_id, bool alloc, gfp_t gfp) { - bool found = false; struct mpam_pcc_chan *chan; lockdep_assert_held(&mpam_list_lock); - list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { - if (chan->subspace_id == subspace_id) { - found = true; - break; - } - } - - if (found) { + chan = xa_load(&mpam_pcc_channels, subspace_id); + if (chan) { chan->refs++; return chan->channel; } @@ -244,24 +238,17 @@ static struct pcc_mbox_chan *mpam_pcc_get(u8 subspace_id, bool alloc, gfp_t gfp) static void mpam_pcc_put(u8 subspace_id) { - bool found = false; struct mpam_pcc_chan *chan; lockdep_assert_held(&mpam_list_lock); - list_for_each_entry(chan, &mpam_pcc_channels, pcc_channels_list) { - if (chan->subspace_id == subspace_id) { - found = true; - break; - } - } - - if (!found) + chan = xa_load(&mpam_pcc_channels, subspace_id); + if (!chan) return; chan->refs--; if (!chan->refs) { - list_del(&chan->pcc_channels_list); + xa_erase(&mpam_pcc_channels, subspace_id); pcc_mbox_free_channel(chan->channel); add_to_garbage(chan); } From 20f1f56ee8bfe90bb6218be4209d8a7b978f3459 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 16:46:12 +0000 Subject: [PATCH 080/115] NVIDIA: VR: SAUCE: x86/resctrl: Add stub to allow other architecture to disable monitor overflow Resctrl has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. 
To allow other architectures to disable this, add a helper that reports whether counters can overflow. Signed-off-by: James Morse (cherry picked from commit 6a4360b3e0339ffc510b68d7a7d22941030f0604 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- arch/x86/include/asm/resctrl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 575f8408a9e7c..40a74a0617345 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -191,6 +191,11 @@ static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx) { } +static inline bool resctrl_arch_mon_can_overflow(void) +{ + return true; +} + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else From 66c73d6637344e60650aea26c6dd870a8633423f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 17:32:53 +0000 Subject: [PATCH 081/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Determine if any exposed counter can overflow Resctrl has an overflow handler that runs on each domain every second to ensure that any overflow of the hardware counter is accounted for. MPAM can have counters as large as 63 bits, in which case there is no need to check for overflow. To allow the overflow handler to be disabled, determine if an overflow can happen. If a class is not implemented, or has the 63bit counter, it can't overflow. 
Signed-off-by: James Morse (cherry picked from commit 0f6aefdf5164dd6be3bd8c6cd82b6257fadbeab2 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 21 +++++++++++++++++++++ include/linux/arm_mpam.h | 1 + 2 files changed, 22 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 446b8496b9341..d28838ec0c3bb 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -535,6 +535,27 @@ static int update_rmid_limits(struct mpam_class *class) return 0; } +static bool __resctrl_arch_mon_can_overflow(enum resctrl_event_id eventid) +{ + struct mpam_props *cprops; + struct mpam_class *class = mpam_resctrl_counters[eventid].class; + + if (!class) + return false; + + /* No need to worry about a 63 bit counter overflowing */ + cprops = &class->props; + return !mpam_has_feature(mpam_feat_msmon_mbwu_63counter, cprops); +} + +bool resctrl_arch_mon_can_overflow(void) +{ + if (__resctrl_arch_mon_can_overflow(QOS_L3_MBM_LOCAL_EVENT_ID)) + return true; + + return __resctrl_arch_mon_can_overflow(QOS_L3_MBM_TOTAL_EVENT_ID); +} + static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, enum mpam_device_features mon_type, diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 77b62491c448c..98b29660f7dc0 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -54,6 +54,7 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, bool resctrl_arch_alloc_capable(void); bool resctrl_arch_mon_capable(void); +bool resctrl_arch_mon_can_overflow(void); void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); From 27a33ff1a85572be9c6d307706ada5ec3730929a Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 15 Mar 2024 
fs/resctrl: Allow the overflow handler to be disabled
ACPI for mpam: https://developer.arm.com/documentation/den0065/latest/ 2. RD_PPTT_CACHE_ID from edk2-platforms: https://github.com/tianocore/edk2-platforms/blob/master/Platform/ARM/SgiPkg/Include/SgiAcpiHeader.h#L202 Signed-off-by: Rex Nie Signed-off-by: James Morse (cherry picked from commit 6941241fa2fd78befa42cd442507157701c98878 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/arm_mpam.h`; - Resolve minor conflicts in `include/linux/resctrl.h`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 8 ++++---- include/linux/arm_mpam.h | 4 ++-- include/linux/resctrl.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6516cf400ca97..96e4761d5c104 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -466,7 +466,7 @@ mpam_class_find(u8 level_idx, enum mpam_class_types type) } static struct mpam_component * -mpam_component_alloc(struct mpam_class *class, int id) +mpam_component_alloc(struct mpam_class *class, u32 id) { struct mpam_component *comp; @@ -508,7 +508,7 @@ static void mpam_component_destroy(struct mpam_component *comp) } static struct mpam_component * -mpam_component_find(struct mpam_class *class, int id) +mpam_component_find(struct mpam_class *class, u32 id) { struct mpam_component *comp; @@ -634,7 +634,7 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, - int component_id) + u32 component_id) { int err; struct mpam_vmsc *vmsc; @@ -722,7 +722,7 @@ static void mpam_ris_destroy(struct mpam_msc_ris *ris) } int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, - enum mpam_class_types type, u8 class_id, int component_id) + enum mpam_class_types type, u8 class_id, u32 component_id) { int err; diff --git 
a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 98b29660f7dc0..3d96aa514d6bb 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -42,11 +42,11 @@ static inline int acpi_mpam_count_msc(void) { return -EINVAL; } #ifdef CONFIG_ARM64_MPAM_DRIVER int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, - enum mpam_class_types type, u8 class_id, int component_id); + enum mpam_class_types type, u8 class_id, u32 component_id); #else static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, - int component_id) + u32 component_id) { return -EINVAL; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 3ada7ed9f8bec..c4448cec07e41 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -138,7 +138,7 @@ enum resctrl_domain_type { */ struct rdt_domain_hdr { struct list_head list; - int id; + u32 id; enum resctrl_domain_type type; enum resctrl_res_level rid; struct cpumask cpu_mask; From 7161cdb24aad8575f0cbf6a4676914a689876123 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 27 Aug 2024 15:24:08 +0100 Subject: [PATCH 084/115] NVIDIA: VR: SAUCE: arm_mpam: Allow cmax/cmin to be configured mpam_reprogram_ris_partid() always resets the CMAX/CMIN controls to their 'unrestricted' value. This prevents the controls from being configured. Add fields in struct mpam_config, and program these values when they are set in the features bitmask. 
Signed-off-by: James Morse (cherry picked from commit e701b2860ae2c02dc9c2015846d61838904a5b0b https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; - Resolve minor conflicts in `drivers/resctrl/mpam_internal.h`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 23 +++++++++++++++++++---- drivers/resctrl/mpam_internal.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 96e4761d5c104..773b5ed4d0113 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1730,11 +1730,25 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) mpam_write_partsel_reg(msc, MBW_PROP, 0); - if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) - mpam_write_partsel_reg(msc, CMAX, cmax); + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) { + if (mpam_has_feature(mpam_feat_cmax_cmax, cfg)) { + u32 cmax_val = cfg->cmax; - if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) - mpam_write_partsel_reg(msc, CMIN, 0); + if (cfg->cmax_softlim) + cmax_val |= MPAMCFG_CMAX_SOFTLIM; + mpam_write_partsel_reg(msc, CMAX, cmax_val); + } else { + mpam_write_partsel_reg(msc, CMAX, cmax); + } + } + + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) { + if (mpam_has_feature(mpam_feat_cmax_cmin, cfg)) { + mpam_write_partsel_reg(msc, CMIN, cfg->cmin); + } else { + mpam_write_partsel_reg(msc, CMIN, 0); + } + } if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); @@ -3200,6 +3214,7 @@ static bool mpam_update_config(struct mpam_config *cfg, bool has_changes = false; maybe_update_config(cfg, mpam_feat_cpor_part, newcfg, cpbm, has_changes); + maybe_update_config(cfg, mpam_feat_cmax_cmax, newcfg, cmax, has_changes); maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, 
mbw_pbm, has_changes); maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 0f35bf33c479f..3bc91db279381 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -362,6 +362,10 @@ struct mpam_config { u32 cpbm; u32 mbw_pbm; u16 mbw_max; + u16 cmax; + u16 cmin; + + bool cmax_softlim; struct mpam_garbage garbage; }; From de7a05a2774d6f0d2475fc7e101a750d240a1284 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 11:37:26 +0000 Subject: [PATCH 085/115] NVIDIA: VR: SAUCE: arm_mpam: Rename mbw conversion to 'fract16' for code re-use Functions like mbw_max_to_percent() convert a value into MPAM's 16-bit fixed point fraction format. These are not only used for memory bandwidth, but cache capacity controls too. Rename these functions to convert to/from a 'fract16', and add helpers for the specific mbw_max/cmax controls. Signed-off-by: James Morse (cherry picked from commit 738f1605fb5c796713a429214270a18ec9c5d6c3 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; - Resolve minor conflicts in `drivers/resctrl/test_mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 24 +++++++++++++++++------- drivers/resctrl/test_mpam_resctrl.c | 4 ++-- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index d28838ec0c3bb..1b60b48bee27d 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -854,14 +854,14 @@ static u32 get_mba_granularity(struct mpam_props *cprops) * * Find the nearest percentage value to the upper bound of the selected band: */ -static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +static u32 fract16_to_percent(u16 fract, u8 wd) { - u32 val = mbw_max; + u32 val = fract; - val >>= 16 - 
cprops->bwa_wd; + val >>= 16 - wd; val += 1; val *= MAX_MBA_BW; - val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + val = DIV_ROUND_CLOSEST(val, 1 << wd); return val; } @@ -876,18 +876,28 @@ static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) * percentages) and over-commit (where the total of the converted * allocations is greater than expected). */ -static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +static u16 percent_to_fract16(u8 pc, u8 wd) { u32 val = pc; - val <<= cprops->bwa_wd; + val <<= wd; val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); val = max(val, 1) - 1; - val <<= 16 - cprops->bwa_wd; + val <<= 16 - wd; return val; } +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + return fract16_to_percent(mbw_max, cprops->bwa_wd); +} + +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + return percent_to_fract16(pc, cprops->bwa_wd); +} + static u32 get_mba_min(struct mpam_props *cprops) { if (!mba_class_use_mbw_max(cprops)) { diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c index 4145f057bd31a..2dd28336b3d1d 100644 --- a/drivers/resctrl/test_mpam_resctrl.c +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -133,7 +133,7 @@ static void test_get_mba_granularity(struct kunit *test) KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ } -static void test_mbw_max_to_percent(struct kunit *test) +static void test_fract16_to_percent(struct kunit *test) { const struct percent_value_case *param = test->param_value; struct percent_value_test_info res; @@ -359,7 +359,7 @@ static void test_num_assignable_counters(struct kunit *test) static struct kunit_case mpam_resctrl_test_cases[] = { KUNIT_CASE(test_get_mba_granularity), - KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_fract16_to_percent, test_percent_value_gen_params), KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), 
KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), KUNIT_CASE(test_percent_to_max_rounding), From b5c53160da5f05f9c6d0afcd6a2b06ecc8030654 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 18 Nov 2024 18:45:50 +0000 Subject: [PATCH 086/115] NVIDIA: VR: SAUCE: fs/resctrl: Group all the MBA specific properties in a separate struct struct resctrl_membw combines parameters that are related to the control value, and parameters that are specific to the MBA resource. To allow the control value parsing and management code to be re-used for other resources, it needs to be separated from the MBA resource. Add struct resctrl_mba that holds all the parameters that are specific to the MBA resource. Signed-off-by: James Morse (cherry picked from commit c1133462aa498d8b75e73b094eb91512d982e067 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 18 +++++++++--------- drivers/resctrl/mpam_resctrl.c | 4 ++-- fs/resctrl/ctrlmondata.c | 3 ++- fs/resctrl/rdtgroup.c | 18 +++++++++--------- include/linux/resctrl.h | 26 +++++++++++++++++--------- 5 files changed, 39 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7667cf7c4e945..ba3316a41141b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -212,21 +212,21 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; r->membw.max_bw = MAX_MBA_BW; - r->membw.arch_needs_linear = true; + r->mba.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { - r->membw.delay_linear = true; + r->mba.delay_linear = true; r->membw.min_bw = MAX_MBA_BW - max_delay; r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; - r->membw.arch_needs_linear = false; + r->mba.arch_needs_linear = false; } if 
(boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) - r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + r->mba.throttle_mode = THREAD_THROTTLE_PER_THREAD; else - r->membw.throttle_mode = THREAD_THROTTLE_MAX; + r->mba.throttle_mode = THREAD_THROTTLE_MAX; r->alloc_capable = true; @@ -249,14 +249,14 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->membw.max_bw = 1 << eax; /* AMD does not use delay */ - r->membw.delay_linear = false; - r->membw.arch_needs_linear = false; + r->mba.delay_linear = false; + r->mba.arch_needs_linear = false; /* * AMD does not use memory delay throttle model to control * the allocation like Intel does. */ - r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; @@ -325,7 +325,7 @@ static void mba_wrmsr_amd(struct msr_param *m) */ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { - if (r->membw.delay_linear) + if (r->mba.delay_linear) return MAX_MBA_BW - bw; pr_warn_once("Non Linear delay-bw map not supported but queried\n"); diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 1b60b48bee27d..905aac8b0a562 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1418,8 +1418,8 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->schema_fmt = RESCTRL_SCHEMA_RANGE; r->ctrl_scope = RESCTRL_L3_CACHE; - r->membw.delay_linear = true; - r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->mba.delay_linear = true; + r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = get_mba_min(cprops); r->membw.max_bw = MAX_MBA_BW; r->membw.bw_gran = get_mba_granularity(cprops); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 0c02451c687b2..1eac8f7dc07ac 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -48,7 +48,8 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) /* * Only 
linear delay values is supported for current Intel SKUs. */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + if (r->rid == RDT_RESOURCE_MBA && + !r->mba.delay_linear && r->mba.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 3982ce02ed548..ebc409d98f0f5 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1297,7 +1297,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - seq_printf(seq, "%u\n", r->membw.delay_linear); + seq_printf(seq, "%u\n", r->mba.delay_linear); return 0; } @@ -1315,7 +1315,7 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - switch (r->membw.throttle_mode) { + switch (r->mba.throttle_mode) { case THREAD_THROTTLE_PER_THREAD: seq_puts(seq, "per-thread\n"); return 0; @@ -1652,7 +1652,7 @@ bool is_mba_sc(struct rdt_resource *r) if (r->rid != RDT_RESOURCE_MBA) return false; - return r->membw.mba_sc; + return r->mba.mba_sc; } /* @@ -2274,13 +2274,13 @@ static void thread_throttle_mode_init(void) r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); if (r_mba->alloc_capable && - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_mba->membw.throttle_mode; + r_mba->mba.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_mba->mba.throttle_mode; r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); if (r_smba->alloc_capable && - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_smba->membw.throttle_mode; + r_smba->mba.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_smba->mba.throttle_mode; if (throttle_mode == THREAD_THROTTLE_UNDEFINED) return; @@ -2582,7 +2582,7 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct 
rdtgroup *prgrp, static inline bool is_mba_linear(void) { - return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->mba.delay_linear; } static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -2640,7 +2640,7 @@ static int set_mba_sc(bool mba_sc) if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) return -EINVAL; - r->membw.mba_sc = mba_sc; + r->mba.mba_sc = mba_sc; rdtgroup_default.mba_mbps_event = mba_mbps_default_event; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index c4448cec07e41..f0a33e4762201 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -248,22 +248,28 @@ enum membw_throttle_mode { * @min_bw: Minimum memory bandwidth percentage user can request * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @arch_needs_linear: True if we can't configure non-linear resources - * @throttle_mode: Bandwidth throttling mode when threads request - * different memory bandwidths - * @mba_sc: True if MBA software controller(mba_sc) is enabled - * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct resctrl_membw { u32 min_bw; u32 max_bw; u32 bw_gran; - u32 delay_linear; - bool arch_needs_linear; - enum membw_throttle_mode throttle_mode; +}; + +/** + * struct resctrl_mba - Resource properties that are specific to the MBA resource + * @mba_sc: True if MBA software controller(mba_sc) is enabled + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + * @delay_linear: True if control is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Mode when threads request different control values + */ +struct resctrl_mba { bool mba_sc; u32 *mb_map; + bool delay_linear; + bool arch_needs_linear; + enum 
membw_throttle_mode throttle_mode; + }; struct resctrl_schema; @@ -315,6 +321,7 @@ struct resctrl_mon { * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource + * @mba: Properties of the MBA resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. * @cdp_capable: Is the CDP feature available on this resource @@ -328,6 +335,7 @@ struct rdt_resource { struct resctrl_cache cache; struct resctrl_membw membw; struct resctrl_mon mon; + struct resctrl_mba mba; struct list_head ctrl_domains; struct list_head mon_domains; char *name; From 9929d500742be4770fde59d52c30b81a72534622 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 10 Sep 2024 11:33:53 +0100 Subject: [PATCH 087/115] NVIDIA: VR: SAUCE: fs/resctrl: Abstract duplicate domain test to a helper parse_cbm() and parse_bw() both test the staged config for an existing entry. These would indicate user-space has provided a schema with a duplicate domain entry. e.g: | L3:0=ffff;1=f00f;0=f00f If new parsers are added this duplicate domain test has to be duplicated. Move it to the caller. 
Signed-off-by: James Morse (cherry picked from commit 827c80b5ec1b14a0f3d77e12ad13a8fbbf499ccd https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/ctrlmondata.c`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 1eac8f7dc07ac..48ebc0f5bafbb 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -84,12 +84,6 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, u32 closid = data->closid; u32 bw_val; - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; @@ -98,6 +92,7 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, return 0; } + cfg = &d->staged_config[s->conf_type]; cfg->new_ctrl = bw_val; cfg->have_new_ctrl = true; @@ -165,12 +160,6 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, u32 closid = data->closid; u32 cbm_val; - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - /* * Cannot set up more than one pseudo-locked region in a cache * hierarchy. 
@@ -207,6 +196,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, } } + cfg = &d->staged_config[s->conf_type]; cfg->new_ctrl = cbm_val; cfg->have_new_ctrl = true; @@ -264,13 +254,18 @@ static int parse_line(char *line, struct resctrl_schema *s, dom = strim(dom); list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (d->hdr.id == dom_id) { + cfg = &d->staged_config[t]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + data.buf = dom; data.closid = rdtgrp->closid; data.mode = rdtgrp->mode; if (parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; /* * In pseudo-locking setup mode and just * parsed a valid CBM that should be From 5531a97de851b740731551b0c467434da4642ecf Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 15:02:03 +0000 Subject: [PATCH 088/115] NVIDIA: VR: SAUCE: fs/resctrl: Move MBA supported check to parse_line() instead of parse_bw() MBA is only supported on platforms where the delay inserted by the control is linear. Resctrl checks the two properties provided by the arch code match each time it parses part of a new control value. This doesn't need to be done so frequently, and obscures changes to parse_bw() to abstract it for use with other control types. Move this check to the parse_line() caller so it only happens once. 
Signed-off-by: James Morse (cherry picked from commit 85be43b4b1214a6f88d5643a8973ec6808cec56c https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 48ebc0f5bafbb..ec9ea0f607191 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -45,15 +45,6 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) int ret; u32 bw; - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (r->rid == RDT_RESOURCE_MBA && - !r->mba.delay_linear && r->mba.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - ret = kstrtou32(buf, 10, &bw); if (ret) { rdt_last_cmd_printf("Invalid MB value %s\n", buf); @@ -242,6 +233,15 @@ static int parse_line(char *line, struct resctrl_schema *s, return -EINVAL; } + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (r->rid == RDT_RESOURCE_MBA && + !r->mba.delay_linear && r->mba.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; From 7ce0bf1ccbe8a264c433b8c177a3e1a9be9faaca Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 15:55:45 +0000 Subject: [PATCH 089/115] NVIDIA: VR: SAUCE: fs/resctrl: Rename resctrl_get_default_ctrl() to include resource resctrl_get_default_ctrl() is called by both the architecture code and filesystem code to return the default value for a control. This depends on the schema format. parse_bw() doesn't bother checking the bounds it is given if the resource is in use by mba_sc. This is because the values parsed from user-space are not the same as those the control should take. 
To make this disparity easier to work with, a second different copy of the schema format is needed, which would need a version of resctrl_get_default_ctrl(). This would let the resctrl change the schema format presented to user-space, provided it converts it to match what the architecture code expects. Rename resctrl_get_default_ctrl() to make it clear it returns the resource default. Signed-off-by: James Morse (cherry picked from commit a4ba73c6546aaf2eb6805ad910b27c55663843e0 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- drivers/resctrl/mpam_resctrl.c | 10 +++++----- fs/resctrl/rdtgroup.c | 4 ++-- include/linux/resctrl.h | 13 ++++++++----- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index ba3316a41141b..244f0d2a93e7a 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -378,7 +378,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) * For Memory Allocation: Set b/w requested to 100% */ for (i = 0; i < hw_res->num_closid; i++, dc++) - *dc = resctrl_get_default_ctrl(r); + *dc = resctrl_get_resource_default_ctrl(r); } static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 8850264684405..8a017f1111028 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -253,7 +253,7 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) - hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r); + hw_dom->ctrl_val[i] = resctrl_get_resource_default_ctrl(r); msr_param.dom = d; 
smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 905aac8b0a562..4e62aeb4334a7 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1411,7 +1411,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) * we have configured the SMMU and GIC not to do this 'all the * bits' is the correct answer here. */ - r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->cache.shareable_bits = resctrl_get_resource_default_ctrl(r); r->alloc_capable = true; break; case RDT_RESOURCE_MBA: @@ -1564,7 +1564,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, lockdep_assert_cpus_held(); if (!mpam_is_enabled()) - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); res = container_of(r, struct mpam_resctrl_res, resctrl_res); dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); @@ -1593,12 +1593,12 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, } fallthrough; default: - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); } if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || !mpam_has_feature(configured_by, cfg)) - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); switch (configured_by) { case mpam_feat_cpor_part: @@ -1606,7 +1606,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case mpam_feat_mbw_max: return mbw_max_to_percent(cfg->mbw_max, cprops); default: - return resctrl_get_default_ctrl(r); + return resctrl_get_resource_default_ctrl(r); } } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ebc409d98f0f5..f4dc1371271d7 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1106,7 +1106,7 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct resctrl_schema *s = 
rdt_kn_parent_priv(of->kn); struct rdt_resource *r = s->res; - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); + seq_printf(seq, "%x\n", resctrl_get_resource_default_ctrl(r)); return 0; } @@ -3785,7 +3785,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) } cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = resctrl_get_default_ctrl(r); + cfg->new_ctrl = resctrl_get_resource_default_ctrl(r); cfg->have_new_ctrl = true; } } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index f0a33e4762201..d6f68f7e65964 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -323,7 +323,10 @@ struct resctrl_mon { * @mon_domains: RCU list of all monitor domains for this resource * @mba: Properties of the MBA resource * @name: Name to use in "schemata" file. - * @schema_fmt: Which format string and parser is used for this schema. + * @schema_fmt: Which format control parameters should be in for this resource. + * @evt_list: List of monitoring events + * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth + * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { @@ -402,11 +405,11 @@ struct resctrl_mon_config_info { void resctrl_arch_sync_cpu_closid_rmid(void *info); /** - * resctrl_get_default_ctrl() - Return the default control value for this - * resource. - * @r: The resource whose default control type is queried. + * resctrl_get_resource_default_ctrl() - Return the default control value for + * this resource. + * @r: The resource whose default control value is queried. 
*/ -static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r) +static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) { switch (r->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: From f59372d1dfff83a3bbf18e178f55dc3e058a275c Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 12:21:25 +0000 Subject: [PATCH 090/115] NVIDIA: VR: SAUCE: fs/resctrl: Add a schema format to the schema, allowing it to be different parse_bw() doesn't bother checking the bounds it is given if the resource is in use by mba_sc. This is because the values parsed from user-space are not the same as those the control should take. To make this disparity easier to work with, a second different copy of the schema format is needed, which would need a version of resctrl_get_default_ctrl(). This would let the resctrl change the schema format presented to user-space, provided it converts it to match what the architecture code expects. Add a second schema format for use with mba_sc. The membw properties are copied and the schema version is used. When mba_sc is enabled the schema copy of these properties is modified. 
Signed-off-by: James Morse (cherry picked from commit 225d28eb849877c6b97dcdc466d8e1aa67978272 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/ctrlmondata.c`; - Resolve minor conflicts in `include/linux/arm_mpam.h`; ] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 4 ++-- fs/resctrl/ctrlmondata.c | 14 ++++++------ fs/resctrl/rdtgroup.c | 26 +++++++++++++++++------ include/linux/arm_mpam.h | 4 +--- include/linux/resctrl.h | 24 ++++++++++++++++++++- 5 files changed, 52 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index d539e56c2b1f0..91ce05256a004 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -20,9 +20,9 @@ #include "internal.h" -u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r) +u32 resctrl_arch_round_bw(u32 val, const struct resctrl_schema *s) { - return roundup(val, (unsigned long)r->membw.bw_gran); + return roundup(val, (unsigned long)s->membw.bw_gran); } int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index ec9ea0f607191..1e51c4a01e785 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -40,7 +40,7 @@ typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) +static bool bw_validate(char *buf, u32 *data, struct resctrl_schema *s) { int ret; u32 bw; @@ -52,18 +52,18 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) } /* Nothing else to do if software controller is enabled. 
*/ - if (is_mba_sc(r)) { + if (is_mba_sc(s->res)) { *data = bw; return true; } - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { + if (bw < s->membw.min_bw || bw > s->membw.max_bw) { rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->membw.max_bw); + bw, s->membw.min_bw, s->membw.max_bw); return false; } - *data = resctrl_arch_round_bw(bw, r); + *data = resctrl_arch_round_bw(bw, s); return true; } @@ -75,7 +75,7 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, u32 closid = data->closid; u32 bw_val; - if (!bw_validate(data->buf, &bw_val, r)) + if (!bw_validate(data->buf, &bw_val, s)) return -EINVAL; if (is_mba_sc(r)) { @@ -215,7 +215,7 @@ static int parse_line(char *line, struct resctrl_schema *s, /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - switch (r->schema_fmt) { + switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: parse_ctrlval = &parse_cbm; break; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index f4dc1371271d7..92045c18a1268 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1104,9 +1104,8 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, "%x\n", resctrl_get_resource_default_ctrl(r)); + seq_printf(seq, "%x\n", resctrl_get_schema_default_ctrl(s)); return 0; } @@ -1247,9 +1246,8 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, "%u\n", r->membw.min_bw); + seq_printf(seq, "%u\n", s->membw.min_bw); return 0; } @@ -1285,9 +1283,8 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - seq_printf(seq, 
"%u\n", r->membw.bw_gran); + seq_printf(seq, "%u\n", s->membw.bw_gran); return 0; } @@ -2829,7 +2826,22 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type if (cl > max_name_width) max_name_width = cl; - switch (r->schema_fmt) { + s->schema_fmt = r->schema_fmt; + s->membw = r->membw; + + /* + * When mba_sc() is enabled the format used by user space is different + * to that expected by hardware. The conversion is done by + * update_mba_bw(). + */ + if (is_mba_sc(r)) { + s->schema_fmt = RESCTRL_SCHEMA_RANGE; + s->membw.min_bw = 0; + s->membw.max_bw = MBA_MAX_MBPS; + s->membw.bw_gran = 1; + } + + switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: s->fmt_str = "%d=%x"; break; diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 3d96aa514d6bb..09b19b7b98914 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -80,10 +80,8 @@ static inline void resctrl_arch_enable_alloc(void) { } static inline void resctrl_arch_disable_alloc(void) { } struct resctrl_schema; - -struct rdt_resource; static inline u32 resctrl_arch_round_bw(u32 val, - const struct rdt_resource *r __always_unused) + const struct resctrl_schema *s __always_unused) { /* * Do nothing: for MPAM, resctrl_arch_update_one() has the necessary diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d6f68f7e65964..348da7c59425d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -359,9 +359,12 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); * @list: Member of resctrl_schema_all. * @name: The name to use in the "schemata" file. * @fmt_str: Format string to show domain value. + * @schema_fmt: Which format string and parser is used for this schema. * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. 
+ * @membw The properties of the schema which may be different to the format + * that was specified by the resource, * @num_closid: The number of closid that can be used with this schema. When * features like CDP are enabled, this will be lower than the * hardware supports for the resource. @@ -370,8 +373,10 @@ struct resctrl_schema { struct list_head list; char name[8]; const char *fmt_str; + enum resctrl_schema_fmt schema_fmt; enum resctrl_conf_type conf_type; struct rdt_resource *res; + struct resctrl_membw membw; u32 num_closid; }; @@ -421,6 +426,23 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) return WARN_ON_ONCE(1); } +/** + * resctrl_get_schema_default_ctrl() - Return the default control value for + * this schema. + * @s: The schema whose default control value is queried. + */ +static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) +{ + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + return resctrl_get_resource_default_ctrl(s->res); + case RESCTRL_SCHEMA_RANGE: + return s->membw.max_bw; + } + + return WARN_ON_ONCE(1); +} + /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); @@ -512,7 +534,7 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); */ int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); -u32 resctrl_arch_round_bw(u32 val, const struct rdt_resource *r); +u32 resctrl_arch_round_bw(u32 val, const struct resctrl_schema *s); /* * Update the ctrl_val and apply this config right now. From 646c7ae0b5b342c00fd251c50af34d7942a2f55a Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 27 Sep 2024 17:59:15 +0100 Subject: [PATCH 091/115] NVIDIA: VR: SAUCE: fs/resctrl: Use schema format to check the resource is a bitmap rdtgroup_cbm_to_size() uses a WARN_ON_ONCE() to assert that the resource it has been passed is one of the L2 or L3 cache. 
This is to avoid using uninitialised bitmap properties. Updating this list for every resource that is configured by a bitmap doesn't scale. Instead change the WARN_ON_ONCE() to use the schema format the arch code requested for the resource. Signed-off-by: James Morse (cherry picked from commit 04f3b4e4e1fcd4fc02d59a4c7a27619f8abf4902 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- fs/resctrl/rdtgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 92045c18a1268..4ccbd5ba7d041 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1626,7 +1626,7 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct cacheinfo *ci; int num_b; - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) + if (WARN_ON_ONCE(r->schema_fmt != RESCTRL_SCHEMA_BITMAP)) return size; num_b = bitmap_weight(&cbm, r->cache.cbm_len); @@ -1713,11 +1713,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ctrl = resctrl_arch_get_config(r, d, closid, type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else + + if (schema->schema_fmt == RESCTRL_SCHEMA_BITMAP) size = rdtgroup_cbm_to_size(r, d, ctrl); + else + size = ctrl; } seq_printf(s, "%d=%u", d->hdr.id, size); sep = true; From 9861237cab4c30cd4994ed2fa8d976f45fbd8f47 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:15:54 +0000 Subject: [PATCH 092/115] NVIDIA: VR: SAUCE: fs/resctrl: Add specific schema types for 'range' Resctrl allows the architecture code to specify the schema format for a control. Controls can either take a bitmap, or some kind of number. If user-space doesn't know what a control is by its name, it could be told the schema format. 
'Some kind of number' isn't useful as the difference between a percentage and a value in MB/s affects how these would be programmed, even if resctrl's parsing code doesn't need to care. Add the types resctrl already has in addition to 'range'. This allows architectures to move over before 'range' is removed. These new schema formats are parsed the same, but will additionally affect which files are visible. Schema formats with a double underscore should not be considered portable between architectures, and are likely to be described to user-space as 'platform defined'. AMDs MBA resource is configured with an absolute bandwidth measured in multiples of one eighth of a GB per second. resctrl needs to be aware of this platform defined format to ensure the existing 'MB' files continue to be shown. Signed-off-by: James Morse (cherry picked from commit bb81e4805d5120058ec44f793780bdf1e775cd5a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 3 +++ fs/resctrl/rdtgroup.c | 3 +++ include/linux/resctrl.h | 12 ++++++++++++ 3 files changed, 18 insertions(+) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 1e51c4a01e785..ec925ce6c8773 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -220,6 +220,9 @@ static int parse_line(char *line, struct resctrl_schema *s, parse_ctrlval = &parse_cbm; break; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: parse_ctrlval = &parse_bw; break; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 4ccbd5ba7d041..112bd52d99f8a 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2846,6 +2846,9 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type s->fmt_str = "%d=%x"; break; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: s->fmt_str = 
"%d=%u"; break; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 348da7c59425d..ca20e7ed32de2 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -285,10 +285,16 @@ enum resctrl_scope { * enum resctrl_schema_fmt - The format user-space provides for a schema. * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. + * @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. + * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. + * @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. */ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, RESCTRL_SCHEMA_RANGE, + RESCTRL_SCHEMA_PERCENT, + RESCTRL_SCHEMA_MBPS, + RESCTRL_SCHEMA__AMD_MBA, }; /** @@ -420,6 +426,9 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) case RESCTRL_SCHEMA_BITMAP: return BIT_MASK(r->cache.cbm_len) - 1; case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: return r->membw.max_bw; } @@ -437,6 +446,9 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) case RESCTRL_SCHEMA_BITMAP: return resctrl_get_resource_default_ctrl(s->res); case RESCTRL_SCHEMA_RANGE: + case RESCTRL_SCHEMA_PERCENT: + case RESCTRL_SCHEMA_MBPS: + case RESCTRL_SCHEMA__AMD_MBA: return s->membw.max_bw; } From 89f91ff444482b6c793c1c21aeb4bd772cb73d11 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:19:37 +0000 Subject: [PATCH 093/115] NVIDIA: VR: SAUCE: x86/resctrl: Move over to specifying MBA control formats Resctrl specifies the schema format for MB and SMBA in rdt_resources_all[]. Intel platforms take a percentage for MB, AMD platforms take an absolute value which isn't MB/s. Currently these are both treated as a 'range'. Adding support for additional types of control shows that user-space needs to be told what the control formats are. 
Today users of resctrl must already know if their platform is Intel or AMD to know how the MB resource will behave. The MPAM support exposes new control types that take a 'percentage'. The Intel MB resource is also configured by a percentage, so should be able to expose this to user-space. Remove the static configuration for schema_fmt in rdt_resources_all[] and specify it with the other control properties in __get_mem_config_intel() or __get_mem_config_amd(). Signed-off-by: James Morse (cherry picked from commit 3323499e5df777ad2eb10be5c7dc29ae5358c93d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `arch/x86/kernel/cpu/resctrl/core.c`; ] Signed-off-by: Fenghua Yu --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 244f0d2a93e7a..00bc2e3ed0ff1 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -88,7 +88,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), - .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_SMBA] = @@ -97,7 +96,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), - .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_PERF_PKG] = @@ -211,6 +209,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->membw.max_bw = MAX_MBA_BW; r->mba.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { @@ -246,6 +245,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) 
cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); hw_res->num_closid = edx + 1; + r->schema_fmt = RESCTRL_SCHEMA__AMD_MBA; r->membw.max_bw = 1 << eax; /* AMD does not use delay */ From 418b1eb61c2b9b2a1aef6d0fbd439a73ff26597e Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:30:08 +0000 Subject: [PATCH 094/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Convert MB resource to use percentage MPAM's bandwidth controls are both exposed to resctrl as if they take a percentage. Update the schema format so that user-space can be told this is a percentage, and files that describe this control format are exposed. (e.g. min_percent) Existing variation in this area is covered by requiring user-space to know if it is running on an Intel or AMD platform. Exposing the schema format directly will avoid modifying user-space to know it is running on an MPAM or RISCV platform. MPAM can also expose bitmap controls for memory bandwidth, which may become important for use-cases in the future. These are currently converted to a percentage to fit the existing definition of the MB resource.
Signed-off-by: James Morse (cherry picked from commit 2baa164d3c899703f228ad0d2e9ad7d4856203e8 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 4e62aeb4334a7..e2e5d81cc18de 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1415,7 +1415,7 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->alloc_capable = true; break; case RDT_RESOURCE_MBA: - r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; r->mba.delay_linear = true; From 0eb3d97abc597d63f93e3d68eb480022d98dc972 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:32:43 +0000 Subject: [PATCH 095/115] NVIDIA: VR: SAUCE: fs/resctrl: Remove 'range' schema format Resctrl previously had a 'range' schema format that took some kind of number. This has since been split into percentage, MB/s and an AMD platform specific scheme. As range is no longer used, remove it. The last user is mba_sc which should be described as taking MB/s. 
Signed-off-by: James Morse (cherry picked from commit 6c8f021bc7f7070728763a9a5ddaee7d5f291099 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 1 - fs/resctrl/rdtgroup.c | 3 +-- include/linux/resctrl.h | 4 ---- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index ec925ce6c8773..8d00aeacc337c 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -219,7 +219,6 @@ static int parse_line(char *line, struct resctrl_schema *s, case RESCTRL_SCHEMA_BITMAP: parse_ctrlval = &parse_cbm; break; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 112bd52d99f8a..99d1dd74e544d 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2835,7 +2835,7 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type * update_mba_bw(). */ if (is_mba_sc(r)) { - s->schema_fmt = RESCTRL_SCHEMA_RANGE; + s->schema_fmt = RESCTRL_SCHEMA_MBPS; s->membw.min_bw = 0; s->membw.max_bw = MBA_MAX_MBPS; s->membw.bw_gran = 1; @@ -2845,7 +2845,6 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type case RESCTRL_SCHEMA_BITMAP: s->fmt_str = "%d=%x"; break; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ca20e7ed32de2..ed6d1b5b39619 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -284,14 +284,12 @@ enum resctrl_scope { /** * enum resctrl_schema_fmt - The format user-space provides for a schema. * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. - * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. * @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. 
* @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. */ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, - RESCTRL_SCHEMA_RANGE, RESCTRL_SCHEMA_PERCENT, RESCTRL_SCHEMA_MBPS, RESCTRL_SCHEMA__AMD_MBA, @@ -425,7 +423,6 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) switch (r->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: return BIT_MASK(r->cache.cbm_len) - 1; - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: @@ -445,7 +442,6 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) switch (s->schema_fmt) { case RESCTRL_SCHEMA_BITMAP: return resctrl_get_resource_default_ctrl(s->res); - case RESCTRL_SCHEMA_RANGE: case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: From fa292d386ec975c401e7f1d103bb0866a4540de1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 15:49:06 +0000 Subject: [PATCH 096/115] NVIDIA: VR: SAUCE: fs/resctrl: Add additional files for percentage and bitmap controls MPAM has cache capacity controls that effectively take a percentage. Resctrl supports percentages, but the collection of files that are exposed to describe this control belong to the MB resource. To find the minimum granularity of the percentage cache capacity controls, user-space is expected to read the bandwidth_gran file, and know this has nothing to do with bandwidth. The only problem here is the name of the file. Add duplicates of these properties with percentage and bitmap in the name. These will be exposed based on the schema format. The existing files must remain tied to the specific resources so that they remain visible to user-space. Using the same helpers ensures the values will always be the same regardless of the file used. These files are not exposed until the new RFTYPE schema flags are set on a resource 'fflags'.
Signed-off-by: James Morse (cherry picked from commit a38c11612e84a927e5b6e2dccf765291a4d498fd https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/internal.h`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/internal.h | 6 ++++++ fs/resctrl/rdtgroup.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 740e32a7c78e2..3c02cab6c94e8 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -252,6 +252,7 @@ struct rdtgroup { #define RFTYPE_TOP BIT(6) +/* files that are specific to a type of resource, e.g. throttle_mode */ #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) @@ -262,6 +263,11 @@ struct rdtgroup { #define RFTYPE_RES_PERF_PKG BIT(12) +/* files that are specific to a type of control, e.g. percent_min */ +#define RFTYPE_SCHEMA_BITMAP BIT(13) +#define RFTYPE_SCHEMA_PERCENT BIT(14) +#define RFTYPE_SCHEMA_MBPS BIT(15) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 99d1dd74e544d..1fb9da54bb534 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2028,6 +2028,13 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .seq_show = resctrl_num_mbm_cntrs_show, }, + { + .name = "bitmap_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_BITMAP, + }, { .name = "min_cbm_bits", .mode = 0444, @@ -2035,6 +2042,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_min_cbm_bits_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, }, + { + .name = "bitmaps_min_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_BITMAP, + }, { .name = "shareable_bits", .mode = 0444, @@ 
-2056,6 +2070,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_min_bw_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "percent_min", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_PERCENT, + }, { .name = "bandwidth_gran", .mode = 0444, @@ -2063,6 +2084,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_bw_gran_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "percent_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_SCHEMA_PERCENT, + }, { .name = "delay_linear", .mode = 0444, From 3121c3ebd7691483f26b0684772a53a91bca312a Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 20 Nov 2024 16:55:39 +0000 Subject: [PATCH 097/115] NVIDIA: VR: SAUCE: fs/resctrl: Add fflags_from_schema() for files based on schema format MPAM has cache capacity controls that effectively take a percentage. Resctrl supports percentages, but the collection of files that are exposed to describe this control belong to the MB resource. New files have been added that are selected based on the schema format. Apply the flags to enable these files based on the schema format. Add a new fflags_from_schema() that is used for controls. 
Signed-off-by: James Morse (cherry picked from commit db005687c69b453ea63389314ba791dc9df18e1a https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/rdtgroup.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 1fb9da54bb534..5c9495ffe7f96 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2527,7 +2527,35 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) return RFTYPE_RES_PERF_PKG; } - return WARN_ON_ONCE(1); + return 0; +} + +static u32 fflags_from_schema(struct resctrl_schema *s) +{ + struct rdt_resource *r = s->res; + u32 fflags = 0; + + /* Some resources are configured purely from their rid */ + fflags |= fflags_from_resource(r); + if (fflags) + return fflags; + + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + fflags |= RFTYPE_SCHEMA_BITMAP; + break; + case RESCTRL_SCHEMA_PERCENT: + fflags |= RFTYPE_SCHEMA_PERCENT; + break; + case RESCTRL_SCHEMA_MBPS: + fflags |= RFTYPE_SCHEMA_MBPS; + break; + case RESCTRL_SCHEMA__AMD_MBA: + /* No standard files are exposed */ + break; + } + + return fflags; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) @@ -2550,7 +2578,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; + fflags = fflags_from_schema(s) | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; From c4b427ff8459c32d74153e1dc4e6943364aa11d4 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 10 Sep 2024 18:13:37 +0100 Subject: [PATCH 098/115] NVIDIA: VR: SAUCE: fs/resctrl: Expose the schema format to user-space If more schemas are added 
to resctrl, user-space needs to know how to configure them. To allow user-space to configure schema it doesn't know about, it would be helpful to tell user-space the format, e.g. percentage. Add a file under info that describes the schema format. Percentages and 'mbps' are implicitly decimal, bitmaps are expected to be in hex. Signed-off-by: James Morse (cherry picked from commit f0ae6915fc22fa0a7affd46f61e0fe4a7673df06 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/rdtgroup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5c9495ffe7f96..f6747cc62102a 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1787,6 +1787,30 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of, return 0; } +static int resctrl_schema_format_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + + switch (s->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + seq_puts(seq, "bitmap\n"); + break; + case RESCTRL_SCHEMA_PERCENT: + seq_puts(seq, "percentage\n"); + break; + case RESCTRL_SCHEMA_MBPS: + seq_puts(seq, "mbps\n"); + break; + /* The way these schema behave isn't discoverable from resctrl */ + case RESCTRL_SCHEMA__AMD_MBA: + seq_puts(seq, "platform\n"); + break; + } + + return 0; +} + static void mbm_config_write_domain(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 evtid, u32 val) { @@ -2243,6 +2267,14 @@ static struct rftype res_common_files[] = { .seq_show = rdtgroup_closid_show, .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, }, + { + .name = "schema_format", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_schema_format_show, + .fflags = RFTYPE_CTRL_INFO, + }, + }; static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) 
From 8f73325a1ac76799ac0dacd59d0def3e7b48f8e6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 12:35:13 +0000 Subject: [PATCH 099/115] NVIDIA: VR: SAUCE: fs/resctrl: Add L2 and L3 'MAX' resource schema MPAM can have both cache portion and cache capacity controls on any cache that supports MPAM. Cache portion bitmaps can be exposed via resctrl if they are implemented on L2 or L3. The cache capacity controls can not be used to isolate portions, which is implicit in the L2 or L3 bitmap provided by user-space. These controls need to be configured with something more like a percentage. Add the resource enum entries for these two resources. No additional resctrl code is needed because the architecture code will specify this resource takes a 'percentage', re-using the support previously used only for the MB resource. Signed-off-by: James Morse (cherry picked from commit 2e9f961c2cad4bdcc49f1a598ee131725129337f https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `include/linux/resctrl.h`; ] Signed-off-by: Fenghua Yu --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ed6d1b5b39619..4bec4543ef6a5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -55,6 +55,8 @@ enum resctrl_res_level { RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, RDT_RESOURCE_PERF_PKG, + RDT_RESOURCE_L3_MAX, + RDT_RESOURCE_L2_MAX, /* Must be the last */ RDT_NUM_RESOURCES, From dfc5316eef3b2f2560e10c28d1a8cc1e300406b0 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 19 Nov 2024 11:51:03 +0000 Subject: [PATCH 100/115] NVIDIA: VR: SAUCE: arm_mpam: resctrl: Add the glue code to convert to/from cmax MPAM's maximum cache-capacity controls take a fixed point fraction format. Instead of dumping this on user-space, convert it to a percentage. User-space using resctrl already knows how to handle percentages.
Signed-off-by: James Morse (cherry picked from commit 10caa1269560b1006811725d9564f0e859a53e2e https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 67 ++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index e2e5d81cc18de..5a5397e30edf6 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -764,6 +764,13 @@ static bool cache_has_usable_cpor(struct mpam_class *class) return class->props.cpbm_wd <= 32; } +static bool cache_has_usable_cmax(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + return mpam_has_feature(mpam_feat_cmax_cmax, cprops); +} + static bool mba_class_use_mbw_max(struct mpam_props *cprops) { return (mpam_has_feature(mpam_feat_mbw_max, cprops) && @@ -898,6 +905,11 @@ static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) return percent_to_fract16(pc, cprops->bwa_wd); } +static u16 percent_to_cmax(u8 pc, struct mpam_props *cprops) +{ + return percent_to_fract16(pc, cprops->cmax_wd); +} + static u32 get_mba_min(struct mpam_props *cprops) { if (!mba_class_use_mbw_max(cprops)) { @@ -1055,6 +1067,7 @@ static bool traffic_matches_l3(struct mpam_class *class) /* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ static void mpam_resctrl_pick_caches(void) { + bool has_cpor, has_cmax; struct mpam_class *class; struct mpam_resctrl_res *res; @@ -1073,7 +1086,9 @@ static void mpam_resctrl_pick_caches(void) continue; } - if (!cache_has_usable_cpor(class)) { + has_cpor = cache_has_usable_cpor(class); + has_cmax = cache_has_usable_cmax(class); + if (!has_cpor && !has_cmax) { pr_debug("class %u cache misses CPOR\n", class->level); continue; } @@ -1084,12 +1099,22 @@ static void mpam_resctrl_pick_caches(void) cpumask_pr_args(cpu_possible_mask)); continue; } - - if (class->level == 2) - res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; - else - res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; - res->class = class; + if (has_cpor) { + pr_debug("pick_caches: Class has CPOR\n"); + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + } + if (has_cmax) { + pr_debug("pick_caches: Class has CMAX\n"); + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2_MAX]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3_MAX]; + res->class = class; + } } } @@ -1413,6 +1438,23 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) */ r->cache.shareable_bits = resctrl_get_resource_default_ctrl(r); r->alloc_capable = true; + break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_PERCENT; + r->membw.min_bw = max(100 / (1 << cprops->cmax_wd), 1); + r->membw.bw_gran = max(100 / (1 << cprops->cmax_wd), 1); + r->membw.max_bw = 100; + + if (r->rid == RDT_RESOURCE_L2_MAX) { + r->name = "L2_MAX"; + r->ctrl_scope = RESCTRL_L2_CACHE; + } else { + r->name = "L3_MAX"; + r->ctrl_scope = RESCTRL_L3_CACHE; + } + break; case RDT_RESOURCE_MBA: r->schema_fmt = RESCTRL_SCHEMA_PERCENT; @@ -1586,6 +1628,10 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3: configured_by = 
mpam_feat_cpor_part; break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + configured_by = mpam_feat_cmax_cmax; + break; case RDT_RESOURCE_MBA: if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { configured_by = mpam_feat_mbw_max; @@ -1603,6 +1649,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, switch (configured_by) { case mpam_feat_cpor_part: return cfg->cpbm; + case mpam_feat_cmax_cmax: + return fract16_to_percent(cfg->cmax, cprops->cmax_wd); case mpam_feat_mbw_max: return mbw_max_to_percent(cfg->mbw_max, cprops); default: @@ -1655,6 +1703,11 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, cfg.cpbm = cfg_val; mpam_set_feature(mpam_feat_cpor_part, &cfg); break; + case RDT_RESOURCE_L2_MAX: + case RDT_RESOURCE_L3_MAX: + cfg.cmax = percent_to_cmax(cfg_val, cprops); + mpam_set_feature(mpam_feat_cmax_cmax, &cfg); + break; case RDT_RESOURCE_MBA: if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); From a4c9c30922f7db758fab143e906b9c930282d578 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 24 Nov 2025 17:14:47 -0600 Subject: [PATCH 101/115] NVIDIA: VR: SAUCE: arm_mpam: Avoid MSC teardown for the SW programming errors No need to destroy MSC instance for the user/admin programming errors since it's not causing any functional issues.
Signed-off-by: Shanker Donthineni (cherry picked from commit 7d348a2cf872998f094587434d1e6e61f2017445 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 773b5ed4d0113..8a782b53d8155 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2658,6 +2658,12 @@ static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) msc->id, mpam_errcode_names[errcode], partid, pmg, ris); + /* No action is required for the MPAM programming errors */ + if ((errcode != MPAM_ERRCODE_REQ_PARTID_RANGE) && + (errcode != MPAM_ERRCODE_REQ_PMG_RANGE)) { + return IRQ_HANDLED; + } + /* Disable this interrupt. */ mpam_disable_msc_ecr(msc); From be85015f769e3a779116dc7ba4f275cddb5a3ab3 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 13 May 2025 11:44:23 -0500 Subject: [PATCH 102/115] NVIDIA: VR: SAUCE: arm_mpam: Handle CPU-less numa nodes In a NUMA system, each node may include CPUs, memory, MPAM MSC instances, or any combination thereof. Some high-end servers may have NUMA nodes that include MPAM MSC but no CPUs. In such cases, associate all possible CPUs for those MSCs. 
Signed-off-by: Shanker Donthineni (cherry picked from commit 95f0fd86a3d4ff75ecda369136e905e329547dc1 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 8a782b53d8155..f7f94f9a6c375 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -622,6 +622,10 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, case MPAM_CLASS_MEMORY: get_cpumask_from_node_id(comp->comp_id, affinity); /* affinity may be empty for CPU-less memory nodes */ + if (cpumask_empty(affinity)) { + dev_warn_once(&msc->pdev->dev, "CPU-less numa node"); + cpumask_copy(affinity, cpu_possible_mask); + } break; case MPAM_CLASS_UNKNOWN: return 0; From 7b957b75bab1f4bda82fd712444caf0a190eb6b5 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 24 Nov 2025 15:04:47 -0600 Subject: [PATCH 103/115] NVIDIA: VR: SAUCE: arm_mpam: Include all associated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current MPAM driver only considers the first component associated with an online/offline CPU during domain creation and teardown. This is insufficient, as CPU-initiated traffic may traverse multiple MSCs before reaching the target, and each MSC must be programmed consistently for proper resource partitioning. Update the MPAM driver to include all components associated with a given CPU during domain setup/teardown to expose expected schemata to userspace for effective resource control. Signed-off-by: Shanker Donthineni (forward ported from commit ac1e5be5e8fddc807e9c5bbc10da3797a601bc95 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Leaves drivers/resctrl/mpam_internal.h untouched; mpam_resctrl_offline_cpu() is already void in the baseline used here. 
- Tightens callers (mpam_resctrl_pick_mba, mpam_resctrl_pick_counters) around traffic_matches_l3() together with topology_matches_l3() and cpumask_equal(&class->affinity, cpu_possible_mask) and does not add a traffic_matches_l3() function body here, which is already defined in upstream. - Omits any edit to exposed_alloc_capable or exposed_mon_capable; those symbols are already absent from the baseline in favor of resctrl_arch_alloc_capable() / resctrl_arch_mon_capable(). - Adds for_each_mpam_resctrl_control() only; does not add MPAM_MAX_EVENT or a new for_each_mpam_resctrl_mon() / mpam_resctrl_counters[] sizing hunk because that monitor macro and array shape are already in the baseline. - Omits INIT_LIST_HEAD_RCU() on res->resctrl_res.ctrl_domains and mon_domains, omits moving mpam_resctrl_domain_insert() after resctrl_online_*(), and omits adding static void mpam_resctrl_online_domain_hdr(); that list setup and insert ordering are already in the baseline. - Does not replay a void→int conversion for mpam_resctrl_monitor_init() or a mpam_pmg_max + 1 num_rmid path; the baseline already has int-returning mpam_resctrl_monitor_init() and resctrl_arch_system_num_rmid_idx() for num_rmid, so only surrounding line context shifts in this file. 
- Adds for_each_mpam_resctrl_control(), mpam_resctrl_mon_from_res() / mpam_resctrl_res_from_mon(), mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *r), extends mpam_resctrl_alloc_domain() / mpam_resctrl_get_domain_from_cpu() / mpam_resctrl_get_mon_domain_from_cpu() with struct mpam_component *comp, hardens topology_matches_l3() with matched_once, switches resctrl_arch_mbm_cntr_assign_enabled() to use mon->assigned_counters, and extends mpam_resctrl_pick_domain_id() so memory level > 3 uses component IDs like cache-backed classes] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 3 +- drivers/resctrl/mpam_resctrl.c | 240 +++++++++++++++++++++------------ fs/resctrl/internal.h | 9 +- fs/resctrl/monitor.c | 95 +++++++++---- fs/resctrl/rdtgroup.c | 26 ++-- 5 files changed, 243 insertions(+), 130 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index f7f94f9a6c375..25973a23aa8e4 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -625,7 +625,8 @@ static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, if (cpumask_empty(affinity)) { dev_warn_once(&msc->pdev->dev, "CPU-less numa node"); cpumask_copy(affinity, cpu_possible_mask); - } + } else if (class->level > 3) + cpumask_copy(affinity, cpu_possible_mask); break; case MPAM_CLASS_UNKNOWN: return 0; diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 5a5397e30edf6..01176c0880b05 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -54,6 +54,11 @@ static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; eventid <= MPAM_MAX_EVENT; \ eventid++, mon = &mpam_resctrl_counters[eventid]) +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + /* The lock for modifying resctrl's domain lists from cpuhp callbacks. 
*/ static DEFINE_MUTEX(domain_list_lock); @@ -175,18 +180,48 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } -static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *l3) +static struct mpam_resctrl_mon *mpam_resctrl_mon_from_res(struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + if (!res->class) + return NULL; + + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == res->class) + return mon; + } + return NULL; +} + +static struct mpam_resctrl_res *mpam_resctrl_res_from_mon(struct mpam_resctrl_mon *mon) { - l3->mon.num_mbm_cntrs = l3_num_allocated_mbwu; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + if (!mon->class) + return NULL; + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == mon->class) + return res; + } + return NULL; +} + +static void mpam_resctrl_monitor_sync_abmc_vals(struct rdt_resource *r) +{ + r->mon.num_mbm_cntrs = l3_num_allocated_mbwu; if (cdp_enabled) - l3->mon.num_mbm_cntrs /= 2; + r->mon.num_mbm_cntrs /= 2; - if (l3->mon.num_mbm_cntrs) { - l3->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); - l3->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); + if (r->mon.num_mbm_cntrs) { + r->mon.mbm_cntr_assignable = mpam_resctrl_abmc_enabled(); + r->mon.mbm_assign_on_mkdir = mpam_resctrl_abmc_enabled(); } else { - l3->mon.mbm_cntr_assignable = false; - l3->mon.mbm_assign_on_mkdir = false; + r->mon.mbm_cntr_assignable = false; + r->mon.mbm_assign_on_mkdir = false; } } @@ -957,10 +992,11 @@ static bool topology_matches_l3(struct mpam_class *victim) { int cpu, err; struct mpam_component *victim_iter; + bool matched_once = false; + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; lockdep_assert_cpus_held(); - cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) return false; @@ -974,8 +1010,8 @@ static bool 
topology_matches_l3(struct mpam_class *victim) } cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); - if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) - return false; + if (matched_once && (cpu >= nr_cpu_ids)) + continue; cpumask_clear(tmp_cpumask); err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); @@ -995,6 +1031,7 @@ static bool topology_matches_l3(struct mpam_class *victim) return false; } + matched_once = true; } return true; @@ -1146,13 +1183,15 @@ static void mpam_resctrl_pick_mba(void) continue; } - if (!topology_matches_l3(class)) { + if ((class->level == 3) && !topology_matches_l3(class)) { pr_debug("class %u topology doesn't match L3\n", class->level); continue; } - if (!traffic_matches_l3(class)) { + /* Check memory at egress from L3 for MSC with L3 */ + if (!cpumask_equal(&class->affinity, cpu_possible_mask) && + !traffic_matches_l3(class)) { pr_debug("class %u traffic doesn't match L3 egress\n", class->level); continue; @@ -1319,7 +1358,10 @@ static void mpam_resctrl_pick_counters(void) } has_mbwu = class_has_usable_mbwu(class); - if (has_mbwu && topology_matches_l3(class)) { + if (has_mbwu && + ((class->type == MPAM_CLASS_MEMORY) || + (topology_matches_l3(class) && + traffic_matches_l3(class)))) { pr_debug("class %u has usable MBWU, and matches L3 topology", class->level); /* @@ -1389,10 +1431,16 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain * bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { - if (r != &mpam_resctrl_controls[RDT_RESOURCE_L3].resctrl_res) + struct mpam_resctrl_res *res; + struct mpam_resctrl_mon *mon; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + + mon = mpam_resctrl_mon_from_res(res); + if (!mon) return false; - return mpam_resctrl_abmc_enabled(); + return mon->assigned_counters ? 
true : false; } int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) @@ -1459,7 +1507,6 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) case RDT_RESOURCE_MBA: r->schema_fmt = RESCTRL_SCHEMA_PERCENT; r->ctrl_scope = RESCTRL_L3_CACHE; - r->mba.delay_linear = true; r->mba.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = get_mba_min(cprops); @@ -1483,6 +1530,9 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) if (class->type == MPAM_CLASS_CACHE) return comp->comp_id; + if ((class->type == MPAM_CLASS_MEMORY) && (class->level > 3)) + return comp->comp_id; + if (topology_matches_l3(class)) { /* Use the corresponding L3 component ID as the domain ID */ int id = get_cpu_cacheinfo_id(cpu, 3); @@ -1504,10 +1554,10 @@ static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) */ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) { - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct mpam_resctrl_res *res = mpam_resctrl_res_from_mon(mon); size_t array_size = resctrl_arch_system_num_rmid_idx() * sizeof(int); int *rmid_array __free(kfree) = kmalloc(array_size, GFP_KERNEL); - struct rdt_resource *l3 = &res->resctrl_res; + struct rdt_resource *r = &res->resctrl_res; struct mpam_class *class = mon->class; u16 num_mbwu_mon; @@ -1528,7 +1578,7 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) return PTR_ERR(mon->assigned_counters); mon->mbwu_idx_to_mon = no_free_ptr(rmid_array); - mpam_resctrl_monitor_sync_abmc_vals(l3); + mpam_resctrl_monitor_sync_abmc_vals(r); return 0; } @@ -1536,8 +1586,15 @@ static int mpam_resctrl_monitor_init_abmc(struct mpam_resctrl_mon *mon) static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, enum resctrl_event_id type) { - struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; - struct rdt_resource *l3 = &res->resctrl_res; + struct mpam_resctrl_res 
*res; + struct rdt_resource *r; + + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + r = &res->resctrl_res; lockdep_assert_cpus_held(); @@ -1564,8 +1621,12 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, * monitoring class. * Setting name is necessary on monitor only platforms. */ - l3->name = "L3"; - l3->mon_scope = RESCTRL_L3_CACHE; + if ((mon->class->type == MPAM_CLASS_MEMORY) && (mon->class->level > 3)) { + r->name = "MB"; + } else { + r->name = "L3"; + } + r->mon_scope = RESCTRL_L3_CACHE; /* * num-rmid is the upper bound for the number of monitoring groups that @@ -1575,10 +1636,10 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, * this does mean userspace needs to know the architecture to correctly * interpret this value. */ - l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + r->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); if (resctrl_enable_mon_event(type, false, 0, NULL)) - l3->mon_capable = true; + r->mon_capable = true; switch (type) { case QOS_L3_MBM_LOCAL_EVENT_ID: @@ -1852,41 +1913,26 @@ static struct mpam_component *find_component(struct mpam_class *class, int cpu) } static struct mpam_resctrl_dom * -mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res, + struct mpam_component *comp) { int err; struct mpam_resctrl_dom *dom; struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; - struct mpam_class *class = res->class; - struct mpam_component *comp_iter, *ctrl_comp; struct rdt_resource *r = &res->resctrl_res; lockdep_assert_held(&domain_list_lock); - ctrl_comp = NULL; - guard(srcu)(&mpam_srcu); - list_for_each_entry_srcu(comp_iter, &class->components, class_list, - srcu_read_lock_held(&mpam_srcu)) { - if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { - ctrl_comp 
= comp_iter; - break; - } - } - - /* class has no component for this CPU */ - if (WARN_ON_ONCE(!ctrl_comp)) - return ERR_PTR(-EINVAL); - dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); if (!dom) return ERR_PTR(-ENOMEM); - if (r->alloc_capable) { - dom->ctrl_comp = ctrl_comp; + if (resctrl_arch_alloc_capable()) { + dom->ctrl_comp = comp; ctrl_d = &dom->resctrl_ctrl_dom; - mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); + mpam_resctrl_domain_hdr_init(cpu, comp, r->rid, &ctrl_d->hdr); ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; err = resctrl_online_ctrl_domain(r, ctrl_d); if (err) @@ -1897,7 +1943,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) pr_debug("Skipped control domain online - no controls\n"); } - if (r->mon_capable) { + if (resctrl_arch_mon_capable()) { struct mpam_component *any_mon_comp = NULL; struct mpam_resctrl_mon *mon; enum resctrl_event_id eventid; @@ -1916,7 +1962,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) if (!mon->class) continue; // dummy resource - mon_comp = find_component(mon->class, cpu); + mon_comp = comp ? comp: find_component(mon->class, cpu); dom->mon_comp[eventid] = mon_comp; if (mon_comp) any_mon_comp = mon_comp; @@ -1942,7 +1988,7 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) return dom; offline_ctrl_domain: - if (r->alloc_capable) { + if (resctrl_arch_alloc_capable()) { mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); resctrl_offline_ctrl_domain(r, ctrl_d); } @@ -1960,7 +2006,8 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id * for anything that is not a cache. 
*/ -static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +static struct mpam_resctrl_dom * +mpam_resctrl_get_mon_domain_from_cpu(int cpu, struct mpam_component *comp) { int cache_id; struct mpam_resctrl_dom *dom; @@ -1974,7 +2021,9 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) if (cache_id < 0) return NULL; - list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + list_for_each_entry(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; if (dom->resctrl_mon_dom.hdr.id == cache_id) return dom; } @@ -1983,7 +2032,8 @@ static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) } static struct mpam_resctrl_dom * -mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res, + struct mpam_component *comp) { struct mpam_resctrl_dom *dom; struct rdt_resource *r = &res->resctrl_res; @@ -1991,6 +2041,8 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) lockdep_assert_cpus_held(); list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (comp && (dom->ctrl_comp != comp)) + continue; if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) return dom; } @@ -1999,38 +2051,44 @@ mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) return NULL; /* Search the mon domain list too - needed on monitor only platforms. 
*/ - return mpam_resctrl_get_mon_domain_from_cpu(cpu); + return mpam_resctrl_get_mon_domain_from_cpu(cpu, comp); } int mpam_resctrl_online_cpu(unsigned int cpu) { + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; struct mpam_resctrl_res *res; enum resctrl_res_level rid; + struct mpam_component *comp; guard(mutex)(&domain_list_lock); for_each_mpam_resctrl_control(res, rid) { struct mpam_resctrl_dom *dom; - struct rdt_resource *r = &res->resctrl_res; if (!res->class) continue; // dummy_resource; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; - dom = mpam_resctrl_get_domain_from_cpu(cpu, res); - if (!dom) { - dom = mpam_resctrl_alloc_domain(cpu, res); - if (IS_ERR(dom)) - return PTR_ERR(dom); - } else { - if (r->alloc_capable) { - struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; - - mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); - } - if (r->mon_capable) { - struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; - - mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res, comp); + } else { + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } } + if (IS_ERR(dom)) + return PTR_ERR(dom); } } @@ -2041,6 +2099,7 @@ int mpam_resctrl_online_cpu(unsigned int cpu) void mpam_resctrl_offline_cpu(unsigned int cpu) { + struct mpam_component *comp; struct mpam_resctrl_res *res; enum resctrl_res_level rid; @@ -2052,35 +2111,38 @@ void mpam_resctrl_offline_cpu(unsigned int cpu) struct rdt_l3_mon_domain *mon_d; struct rdt_ctrl_domain *ctrl_d; bool ctrl_dom_empty, mon_dom_empty; - struct rdt_resource *r = 
&res->resctrl_res; if (!res->class) continue; // dummy resource - dom = mpam_resctrl_get_domain_from_cpu(cpu, res); - if (WARN_ON_ONCE(!dom)) - continue; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &res->class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &comp->affinity)) + continue; + dom = mpam_resctrl_get_domain_from_cpu(cpu, res, comp); + if (WARN_ON_ONCE(!dom)) + continue; - if (r->alloc_capable) { - ctrl_d = &dom->resctrl_ctrl_dom; - ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); - if (ctrl_dom_empty) - resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); - } else { ctrl_dom_empty = true; - } + if (resctrl_arch_alloc_capable()) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } - if (r->mon_capable) { - mon_d = &dom->resctrl_mon_dom; - mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); - if (mon_dom_empty) - resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); - } else { mon_dom_empty = true; - } + if (resctrl_arch_mon_capable()) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } - if (ctrl_dom_empty && mon_dom_empty) - kfree(dom); + if (ctrl_dom_empty && mon_dom_empty) + kfree(dom); + } } } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 3c02cab6c94e8..2e27efe08c27d 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -386,9 +386,9 @@ int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); -int resctrl_l3_mon_resource_init(void); +int resctrl_mon_init(void); -void resctrl_l3_mon_resource_exit(void); +void resctrl_mon_exit(void); void mon_event_count(void *info); @@ -475,6 +475,11 @@ ssize_t resctrl_io_alloc_cbm_write(struct 
kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); u32 resctrl_io_alloc_closid(struct rdt_resource *r); +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 94da0360952d5..b664fb16b4c68 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -450,7 +450,7 @@ static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct mbm_state *m; u64 tval = 0; - if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) { + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) { rr->err = -EIO; return -EINVAL; } @@ -547,6 +547,7 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { switch (rr->r->rid) { case RDT_RESOURCE_L3: + case RDT_RESOURCE_MBA: WARN_ON_ONCE(rr->evt->any_cpu); if (rr->hdr) return __l3_mon_event_count(rdtgrp, rr); @@ -592,7 +593,7 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) struct rdt_l3_mon_domain *d; struct mbm_state *m; - if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, rr->r->rid)) return; d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); m = get_mbm_state(d, closid, rmid, rr->evt->evtid); @@ -1002,7 +1003,7 @@ void free_rmid_lru_list(void) */ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { MON_EVENT(QOS_L3_OCCUP_EVENT_ID, "llc_occupancy", RDT_RESOURCE_L3, false), - MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_MBA, false), MON_EVENT(QOS_L3_MBM_LOCAL_EVENT_ID, "mbm_local_bytes", RDT_RESOURCE_L3, false), MON_EVENT(PMT_EVENT_ENERGY, 
"core_energy", RDT_RESOURCE_PERF_PKG, true), MON_EVENT(PMT_EVENT_ACTIVITY, "activity", RDT_RESOURCE_PERF_PKG, true), @@ -1631,9 +1632,9 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, return ret; } -int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +static int mbm_assignments_show(struct kernfs_open_file *of, struct seq_file *s, + void *v, struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; struct mon_evt *mevt; @@ -1679,6 +1680,18 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi return ret; } +int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +int mbm_MB_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) +{ + return mbm_assignments_show(of, s, v, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + /* * mbm_get_mon_event_by_name() - Return the mon_evt entry for the matching * event name. 
@@ -1773,10 +1786,10 @@ static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup return -EINVAL; } -ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) +static ssize_t mbm_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdtgroup *rdtgrp; char *token, *event; int ret = 0; @@ -1818,6 +1831,20 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + resctrl_arch_get_resource(RDT_RESOURCE_L3)); +} + +ssize_t mbm_MB_assignments_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + return mbm_assignments_write(of, buf, nbytes, off, + resctrl_arch_get_resource(RDT_RESOURCE_MBA)); +} + static int closid_num_dirty_rmid_alloc(struct rdt_resource *r) { if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { @@ -1858,7 +1885,7 @@ static void closid_num_dirty_rmid_free(void) } /** - * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. + * resctrl_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a * specific domain. i.e. the closid_num_dirty_rmid[] used to find the CLOSID @@ -1870,27 +1897,21 @@ static void closid_num_dirty_rmid_free(void) * * Return: 0 for success, or -ENOMEM. */ -int resctrl_l3_mon_resource_init(void) +static void resctrl_mon_resource_init(struct rdt_resource *r) { - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; + unsigned long fflags; - if (!r->mon_capable) - return 0; - - ret = closid_num_dirty_rmid_alloc(r); - if (ret) - return ret; + fflags = (r->rid == RDT_RESOURCE_MBA) ? 
RFTYPE_RES_MB :RFTYPE_RES_CACHE; if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); } if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) @@ -1908,19 +1929,43 @@ int resctrl_l3_mon_resource_init(void) NON_TEMP_WRITE_TO_LOCAL_MEM); r->mon.mbm_assign_on_mkdir = true; resctrl_file_fflags_init("num_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("available_mbm_cntrs", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + RFTYPE_MON_INFO | fflags); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | - RFTYPE_RES_CACHE); - resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + fflags); + if (r->rid == RDT_RESOURCE_MBA) + resctrl_file_fflags_init("mbm_MB_assignments", RFTYPE_MON_BASE); + else + resctrl_file_fflags_init("mbm_L3_assignments", RFTYPE_MON_BASE); + resctrl_file_fflags_init("mbm_assign_mode", RFTYPE_MON_INFO | + fflags); } +} + +int resctrl_mon_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + int ret; + + if (!r->mon_capable) + return 0; + + ret = closid_num_dirty_rmid_alloc(r); + if (ret) + return ret; + + resctrl_mon_resource_init(r); + + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r) + resctrl_mon_resource_init(r); return 0; } -void resctrl_l3_mon_resource_exit(void) +void resctrl_mon_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 
f6747cc62102a..dbff838ab1285 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2183,6 +2183,13 @@ static struct rftype res_common_files[] = { .seq_show = mbm_L3_assignments_show, .write = mbm_L3_assignments_write, }, + { + .name = "mbm_MB_assignments", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_MB_assignments_show, + .write = mbm_MB_assignments_write, + }, { .name = "mbm_assign_mode", .mode = 0644, @@ -4583,10 +4590,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, hdr); - if (r->rid != RDT_RESOURCE_L3) - goto out_unlock; - - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); @@ -4692,10 +4696,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr mutex_lock(&rdtgroup_mutex); - if (r->rid != RDT_RESOURCE_L3) - goto mkdir; - - if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, r->rid)) goto out_unlock; d = container_of(hdr, struct rdt_l3_mon_domain, hdr); @@ -4712,7 +4713,6 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); -mkdir: err = 0; /* * If the filesystem is not mounted then only the default resource group @@ -4818,13 +4818,13 @@ int resctrl_init(void) io_alloc_init(); - ret = resctrl_l3_mon_resource_init(); + ret = resctrl_mon_init(); if (ret) return ret; ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) { - resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -4859,7 +4859,7 @@ int resctrl_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); - 
resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); return ret; } @@ -4922,6 +4922,6 @@ void resctrl_exit(void) * it can be used to umount resctrl. */ - resctrl_l3_mon_resource_exit(); + resctrl_mon_exit(); free_rmid_lru_list(); } From 3e04efc960a7104e8411cec17d8e9ab1b6b61606 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Wed, 4 Mar 2026 11:53:32 -0600 Subject: [PATCH 104/115] NVIDIA: VR: SAUCE: resctrl/mpam: reset RIS by applying explicit default config Reset an RIS by building a default mpam_config and applying it via mpam_reprogram_ris_partid(), like any other config. - mpam_init_reset_cfg(): set features and default values only for controls supported by the RIS (cpor_part, mbw_part, mbw_max, mbw_prop, cmax_cmax, cmax_cmin). Use full masks for CPBM/MBW_PBM and MPAMCFG_* defaults for MBW_MAX, CMAX, CMIN. - mpam_reprogram_ris_partid(): apply cfg for all supported controls (no separate reset path). Signed-off-by: Shanker Donthineni (forward ported from commit e0b6de09b2a78f7aa12400ee756e5e6564118578 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - reset_cpbm and reset_mbw_pbm are not used. 
No need to define them;
static void mpam_reset_component_cfg(struct mpam_component *comp) comp->cfg[i].mbw_pbm = GENMASK(cprops->mbw_pbm_bits - 1, 0); if (cprops->bwa_wd) comp->cfg[i].mbw_max = GENMASK(15, 16 - cprops->bwa_wd); + if (cprops->cmax_wd) + comp->cfg[i].cmax = MPAMCFG_CMAX_CMAX; } } diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 01176c0880b05..f689ade7b2d5f 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -898,14 +898,7 @@ static u32 get_mba_granularity(struct mpam_props *cprops) */ static u32 fract16_to_percent(u16 fract, u8 wd) { - u32 val = fract; - - val >>= 16 - wd; - val += 1; - val *= MAX_MBA_BW; - val = DIV_ROUND_CLOSEST(val, 1 << wd); - - return val; + return DIV_ROUND_CLOSEST((fract + 1) * 100, 65536); } /* * @@ -920,14 +913,7 @@ static u32 fract16_to_percent(u16 fract, u8 wd) */ static u16 percent_to_fract16(u8 pc, u8 wd) { - u32 val = pc; - - val <<= wd; - val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); - val = max(val, 1) - 1; - val <<= 16 - wd; - - return val; + return pc ? (((pc * 65536) / 100) - 1) : 0; } static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) From 2f8ad7b1230ad8f0551476e4f26de4f1a1451405 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 6 May 2025 21:02:21 -0500 Subject: [PATCH 105/115] NVIDIA: VR: SAUCE: iommu/arm-smmu-v3: Fix MPAM for identity-mappings There is no struct arm_smmu_domain context for domains configured with identity mappings. Use the device to obtain the necessary information to program PARTID and PMGID.
Signed-off-by: Shanker Donthineni (cherry picked from commit 23bcbda8e63535e6090ea00540e19742650edf59 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 73 +++++++-------------- drivers/iommu/iommu.c | 52 ++++++--------- include/linux/iommu.h | 4 +- 3 files changed, 45 insertions(+), 84 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index b3047fd391b0d..2989ee64d15ba 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3741,94 +3741,67 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static int arm_smmu_group_set_mpam(struct iommu_group *group, u16 partid, +static int arm_smmu_group_set_mpam(struct device *dev, u16 partid, u8 pmg) { int i; u32 sid; - unsigned long flags; struct arm_smmu_ste *step; - struct iommu_domain *domain; struct arm_smmu_device *smmu; struct arm_smmu_master *master; struct arm_smmu_cmdq_batch cmds; - struct arm_smmu_domain *smmu_domain; struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_STE, .cfgi = { .leaf = true, }, }; - struct arm_smmu_master_domain *master_domain; - domain = iommu_get_domain_for_group(group); - smmu_domain = to_smmu_domain(domain); - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + master = dev_iommu_priv_get(dev); + if (!(master->smmu->features & ARM_SMMU_FEAT_MPAM)) return -EIO; - smmu = smmu_domain->smmu; + smmu = master->smmu; arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master_domain, &smmu_domain->devices, - devices_elm) { - master = master_domain->master; - - for (i = 0; i < master->num_streams; i++) { - sid = master->streams[i].id; - step = arm_smmu_get_step_for_sid(smmu, sid); - - /* These need locking if the VMSPtr is ever used */ - step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, 
partid); - step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); + for (i = 0; i < master->num_streams; i++) { + sid = master->streams[i].id; + step = arm_smmu_get_step_for_sid(smmu, sid); - cmd.cfgi.sid = sid; - arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); - } + /* These need locking if the VMSPtr is ever used */ + step->data[4] = FIELD_PREP(STRTAB_STE_4_PARTID, partid); + step->data[5] = FIELD_PREP(STRTAB_STE_5_PMG, pmg); - master->partid = partid; - master->pmg = pmg; + cmd.cfgi.sid = sid; + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + + master->partid = partid; + master->pmg = pmg; arm_smmu_cmdq_batch_submit(smmu, &cmds); return 0; } -static int arm_smmu_group_get_mpam(struct iommu_group *group, u16 *partid, +static int arm_smmu_group_get_mpam(struct device *dev, u16 *partid, u8 *pmg) { - int err = -EINVAL; - unsigned long flags; - struct iommu_domain *domain; struct arm_smmu_master *master; - struct arm_smmu_domain *smmu_domain; - struct arm_smmu_master_domain *master_domain; - domain = iommu_get_domain_for_group(group); - smmu_domain = to_smmu_domain(domain); - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_MPAM)) + master = dev_iommu_priv_get(dev); + if (!(master->smmu->features & ARM_SMMU_FEAT_MPAM)) return -EIO; if (!partid && !pmg) return 0; - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master_domain, &smmu_domain->devices, - devices_elm) { - master = master_domain->master; - if (master) { - if (partid) - *partid = master->partid; - if (pmg) - *pmg = master->pmg; - err = 0; - } - } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + if (partid) + *partid = master->partid; + if (pmg) + *pmg = master->pmg; - return err; + return 0; } static const struct iommu_ops arm_smmu_ops = { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5fd2f4d3beab2..08fe080690b28 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4156,25 
+4156,19 @@ int iommu_group_set_qos_params(struct iommu_group *group, { const struct iommu_ops *ops; struct group_device *device; - int ret; + int ret = -ENODEV; mutex_lock(&group->mutex); - device = list_first_entry_or_null(&group->devices, typeof(*device), - list); - if (!device) { - ret = -ENODEV; - goto out_unlock; - } - - ops = dev_iommu_ops(device->dev); - if (!ops->set_group_qos_params) { - ret = -EOPNOTSUPP; - goto out_unlock; + for_each_group_device(group, device) { + ops = dev_iommu_ops(device->dev); + if (!ops->set_group_qos_params) { + ret = -EOPNOTSUPP; + break; + } + ret = ops->set_group_qos_params(device->dev, partition, perf_mon_grp); + if (ret < 0) + break; } - - ret = ops->set_group_qos_params(group, partition, perf_mon_grp); - -out_unlock: mutex_unlock(&group->mutex); return ret; @@ -4194,25 +4188,19 @@ int iommu_group_get_qos_params(struct iommu_group *group, { const struct iommu_ops *ops; struct group_device *device; - int ret; + int ret = -ENODEV; mutex_lock(&group->mutex); - device = list_first_entry_or_null(&group->devices, typeof(*device), - list); - if (!device) { - ret = -ENODEV; - goto out_unlock; - } - - ops = dev_iommu_ops(device->dev); - if (!ops->get_group_qos_params) { - ret = -EOPNOTSUPP; - goto out_unlock; + for_each_group_device(group, device) { + ops = dev_iommu_ops(device->dev); + if (!ops->get_group_qos_params) { + ret = -EOPNOTSUPP; + break; + } + ret = ops->get_group_qos_params(device->dev, partition, perf_mon_grp); + if (!ret) + break; } - - ret = ops->get_group_qos_params(group, partition, perf_mon_grp); - -out_unlock: mutex_unlock(&group->mutex); return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c3ef55df73ccd..d9ba1e4567b9c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -704,9 +704,9 @@ struct iommu_ops { const struct iommu_user_data *user_data); /* Per group IOMMU features */ - int (*get_group_qos_params)(struct iommu_group *group, u16 *partition, + int 
(*get_group_qos_params)(struct device *dev, u16 *partition, u8 *perf_mon_grp); - int (*set_group_qos_params)(struct iommu_group *group, u16 partition, + int (*set_group_qos_params)(struct device *dev, u16 partition, u8 perf_mon_grp); const struct iommu_domain_ops *default_domain_ops; From b723c7e6de9b77db584162bd07cf8886ff8187c4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 106/115] NVIDIA: VR: SAUCE: resctrl: add MB_HLIM resource ID and schema type Add RDT_RESOURCE_MB_HLIM and RESCTRL_SCHEMA_MB_HLIM for per-domain MBW maximum hard-limit control on ARM MPAM. Document the schema in kernel-doc. Extend resctrl_get_default_ctrl() for RESCTRL_SCHEMA_MB_HLIM (default 0). (cherry picked from commit 9fc8c60cb42b519bb85c37ede8d44b847b18ae40 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- include/linux/resctrl.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 4bec4543ef6a5..985074fa5ddb6 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -57,6 +57,7 @@ enum resctrl_res_level { RDT_RESOURCE_PERF_PKG, RDT_RESOURCE_L3_MAX, RDT_RESOURCE_L2_MAX, + RDT_RESOURCE_MB_HLIM, /* Must be the last */ RDT_NUM_RESOURCES, @@ -289,12 +290,15 @@ enum resctrl_scope { * @RESCTRL_SCHEMA_PERCENT: The schema is a percentage. * @RESCTRL_SCHEMA_MBPS: The schema ia a MBps value. * @RESCTRL_SCHEMA__AMD_MBA: The schema value is MBA for AMD platforms. + * @RESCTRL_SCHEMA_MB_HLIM: Per-domain MBW max hard limit (0/1), ARM MPAM only + * when MPAMF_MBW_IDR.MAX_LIM is 0b00 (HARDLIM RW). 
*/ enum resctrl_schema_fmt { RESCTRL_SCHEMA_BITMAP, RESCTRL_SCHEMA_PERCENT, RESCTRL_SCHEMA_MBPS, RESCTRL_SCHEMA__AMD_MBA, + RESCTRL_SCHEMA_MB_HLIM, }; /** @@ -429,6 +433,8 @@ static inline u32 resctrl_get_resource_default_ctrl(struct rdt_resource *r) case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: return r->membw.max_bw; + case RESCTRL_SCHEMA_MB_HLIM: + return 0; } return WARN_ON_ONCE(1); From 2c8a9379738d917a5dbfb193e7effe5c8c524405 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 107/115] NVIDIA: VR: SAUCE: resctrl: wire MB_HLIM schemata parsing and group setup Add mb_hlim parsing and validation (0/1), wire RESCTRL_SCHEMA_MB_HLIM into schemata and control-group setup, align MB_HLIM with MBA for exclusive mode and pseudo-locking, and default new groups to hardlim off. (cherry picked from commit 6911e81a21d23c2622c9dbf6e7b374d67dd4e672 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- fs/resctrl/ctrlmondata.c | 33 ++++++++++++++++++++++++++++++++- fs/resctrl/rdtgroup.c | 23 ++++++++++++++++++++++- include/linux/resctrl.h | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 8d00aeacc337c..138321730f73b 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -90,6 +90,33 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, return 0; } +static bool hlim_validate(char *buf, u32 *data) +{ + int ret = kstrtou32(buf, 10, data); + + if (ret || (*data != 0 && *data != 1)) { + rdt_last_cmd_printf("Invalid MB_HLIM value %s (expect 0 or 1)\n", buf); + return false; + } + return true; +} + +static int parse_mb_hlim(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct resctrl_staged_config *cfg; + u32 v; + + if (!hlim_validate(data->buf, &v)) + return -EINVAL; + + cfg = &d->staged_config[s->conf_type]; + cfg->new_ctrl = v; 
+ cfg->have_new_ctrl = true; + + return 0; +} + /* * Check whether a cache bit mask is valid. * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: @@ -224,13 +251,17 @@ static int parse_line(char *line, struct resctrl_schema *s, case RESCTRL_SCHEMA__AMD_MBA: parse_ctrlval = &parse_bw; break; + case RESCTRL_SCHEMA_MB_HLIM: + parse_ctrlval = &parse_mb_hlim; + break; } if (WARN_ON_ONCE(!parse_ctrlval)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA || + r->rid == RDT_RESOURCE_MB_HLIM)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); return -EINVAL; } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index dbff838ab1285..f29fb759b789e 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1509,7 +1509,8 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA || + r->rid == RDT_RESOURCE_MB_HLIM) continue; has_cache = true; list_for_each_entry(d, &r->ctrl_domains, hdr.list) { @@ -1806,6 +1807,9 @@ static int resctrl_schema_format_show(struct kernfs_open_file *of, case RESCTRL_SCHEMA__AMD_MBA: seq_puts(seq, "platform\n"); break; + case RESCTRL_SCHEMA_MB_HLIM: + seq_puts(seq, "0/1\n"); + break; } return 0; @@ -2590,6 +2594,7 @@ static u32 fflags_from_schema(struct resctrl_schema *s) fflags |= RFTYPE_SCHEMA_MBPS; break; case RESCTRL_SCHEMA__AMD_MBA: + case RESCTRL_SCHEMA_MB_HLIM: /* No standard files are exposed */ break; } @@ -2943,6 +2948,7 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type case RESCTRL_SCHEMA_PERCENT: case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: + case RESCTRL_SCHEMA_MB_HLIM: s->fmt_str = "%d=%u"; break; } @@ -3899,6 
+3905,19 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) } } +/* Initialize MB_HLIM resource with default hardlim off (0). */ +static void rdtgroup_init_mb_hlim(struct resctrl_schema *s) +{ + struct resctrl_staged_config *cfg; + struct rdt_ctrl_domain *d; + + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { + cfg = &d->staged_config[s->conf_type]; + cfg->new_ctrl = 0; + cfg->have_new_ctrl = true; + } +} + /* Initialize the RDT group's allocations. */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { @@ -3915,6 +3934,8 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) rdtgroup_init_mba(r, rdtgrp->closid); if (is_mba_sc(r)) continue; + } else if (r->rid == RDT_RESOURCE_MB_HLIM) { + rdtgroup_init_mb_hlim(s); } else { ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 985074fa5ddb6..46a6f2c16deb4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -454,6 +454,8 @@ static inline u32 resctrl_get_schema_default_ctrl(struct resctrl_schema *s) case RESCTRL_SCHEMA_MBPS: case RESCTRL_SCHEMA__AMD_MBA: return s->membw.max_bw; + case RESCTRL_SCHEMA_MB_HLIM: + return 0; } return WARN_ON_ONCE(1); From 049ba6963a3aaae2ed6454e910d30fff9eac4a30 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:48 +0000 Subject: [PATCH 108/115] NVIDIA: VR: SAUCE: resctrl/mpam: probe MPAMF_MBW_IDR MAX_LIM and hardlim_rw Read mbw_max_lim from MPAMF_MBW_IDR.MAX_LIM when MBW_MAX is present, derive mpam_feat_mbw_max_hardlim_rw when both soft and hard limiting are supported, and merge mbw_max_lim and the feature across MSCs. Add mpam_props_sync_mbw_max_hardlim_rw() and propagate merged state in __props_mismatch(). 
(cherry picked from commit ced4d460ce56aa8b18d8ab3f19eeb76beeb25322 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 40 ++++++++++++++++++++++++++++++++- drivers/resctrl/mpam_internal.h | 7 ++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 6991342c40351..58885a32645bf 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -884,6 +884,19 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) #define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) +/* Align mpam_feat_mbw_max_hardlim_rw with MPAMF_MBW_IDR.MAX_LIM and mbw_max. */ +static void mpam_props_sync_mbw_max_hardlim_rw(struct mpam_props *props) +{ + if (!mpam_has_feature(mpam_feat_mbw_max, props)) { + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, props); + return; + } + if (props->mbw_max_lim == 0) + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, props); + else + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, props); +} + static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) { int err; @@ -931,6 +944,8 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) if (FIELD_GET(MPAMF_IDR_HAS_MBW_PART, ris->idr)) { u32 mbw_features = mpam_read_partsel_reg(msc, MBW_IDR); + props->mbw_max_lim = 0; + /* portion bitmap resolution */ props->mbw_pbm_bits = FIELD_GET(MPAMF_MBW_IDR_BWPBM_WD, mbw_features); if (props->mbw_pbm_bits && @@ -945,14 +960,18 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) */ props->bwa_wd = min(props->bwa_wd, 16); - if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) { mpam_set_feature(mpam_feat_mbw_max, props); + props->mbw_max_lim = FIELD_GET(MPAMF_MBW_IDR_MAX_LIM, mbw_features); + } if (props->bwa_wd && 
FIELD_GET(MPAMF_MBW_IDR_HAS_MIN, mbw_features)) mpam_set_feature(mpam_feat_mbw_min, props); if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_PROP, mbw_features)) mpam_set_feature(mpam_feat_mbw_prop, props); + + mpam_props_sync_mbw_max_hardlim_rw(props); } /* Priority partitioning */ @@ -2382,12 +2401,31 @@ static void __props_mismatch(struct mpam_props *parent, if (alias && !mpam_has_bwa_wd_feature(parent) && mpam_has_bwa_wd_feature(child)) { parent->bwa_wd = child->bwa_wd; + parent->mbw_max_lim = child->mbw_max_lim; + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, child)) + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, parent); + else + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, parent); } else if (MISMATCHED_HELPER(parent, child, mpam_has_bwa_wd_feature, bwa_wd, alias)) { pr_debug("took the min bwa_wd\n"); parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); } + if (CAN_MERGE_FEAT(parent, child, mpam_feat_mbw_max, alias)) { + parent->mbw_max_lim = child->mbw_max_lim; + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, child)) + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, parent); + else + mpam_clear_feature(mpam_feat_mbw_max_hardlim_rw, parent); + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_mbw_max, + mbw_max_lim, alias)) { + pr_debug("%s mbw_max_lim mismatch, clearing mbw_max\n", __func__); + mpam_clear_feature(mpam_feat_mbw_max, parent); + parent->mbw_max_lim = 0; + mpam_props_sync_mbw_max_hardlim_rw(parent); + } + if (alias && !mpam_has_cmax_wd_feature(parent) && mpam_has_cmax_wd_feature(child)) { parent->cmax_wd = child->cmax_wd; } else if (MISMATCHED_HELPER(parent, child, mpam_has_cmax_wd_feature, diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 3bc91db279381..65c0a76246985 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -213,6 +213,7 @@ enum mpam_device_features { mpam_feat_mbw_part, mpam_feat_mbw_min, mpam_feat_mbw_max, + mpam_feat_mbw_max_hardlim_rw, 
mpam_feat_mbw_prop, mpam_feat_intpri_part, mpam_feat_intpri_part_0_low, @@ -246,6 +247,11 @@ struct mpam_props { u16 dspri_wd; u16 num_csu_mon; u16 num_mbwu_mon; + /* + * MPAMF_MBW_IDR.MAX_LIM [1:0] when mpam_feat_mbw_max; else 0. + * 0 = soft+hard, 1 = soft only, 2 = hard only, 3 = reserved. + */ + u8 mbw_max_lim; /* * Kunit tests use memset() to set up feature combinations that should be @@ -639,6 +645,7 @@ static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } /* MPAMF_MBW_IDR - MPAM features memory bandwidth partitioning ID register */ #define MPAMF_MBW_IDR_BWA_WD GENMASK(5, 0) +#define MPAMF_MBW_IDR_MAX_LIM GENMASK(9, 8) #define MPAMF_MBW_IDR_HAS_MIN BIT(10) #define MPAMF_MBW_IDR_HAS_MAX BIT(11) #define MPAMF_MBW_IDR_HAS_PBM BIT(12) From 3fab9aa5901a1c28270d45bc7d719a999defe20b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 109/115] NVIDIA: VR: SAUCE: resctrl/mpam: track MBW max hard-limit in config Add mbw_max_hardlim to mpam_config. When reprogramming MBW_MAX, OR in HARDLIM (MPAMCFG_MBW_MAX bit 31) when either MBW_MAX or HARDLIM_RW features are active. Merge HARDLIM in mpam_update_config(). 
(forward ported from commit 114894b316e4f01d12979c66f58177b5af1b144d https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_devices.c`; - Resolve minor conflicts in `drivers/resctrl/mpam_internal.h`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_devices.c | 15 ++++++++++++--- drivers/resctrl/mpam_internal.h | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 58885a32645bf..76e73e61a8906 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -1744,10 +1744,17 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, } if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { - if (mpam_has_feature(mpam_feat_mbw_max, cfg)) - mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); - else + if (mpam_has_feature(mpam_feat_mbw_max, cfg) || + mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cfg)) { + u32 mbw_val = cfg->mbw_max; + + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cfg) && + cfg->mbw_max_hardlim) + mbw_val |= MPAMCFG_MBW_MAX_HARDLIM; + mpam_write_partsel_reg(msc, MBW_MAX, mbw_val); + } else { mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + } } if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) @@ -3260,6 +3267,8 @@ static bool mpam_update_config(struct mpam_config *cfg, maybe_update_config(cfg, mpam_feat_cmax_cmax, newcfg, cmax, has_changes); maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_max_hardlim_rw, newcfg, + mbw_max_hardlim, has_changes); return has_changes; } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 65c0a76246985..e05a05f91d5a9 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -368,6 +368,7 @@ struct mpam_config { 
u32 cpbm; u32 mbw_pbm; u16 mbw_max; + bool mbw_max_hardlim; u16 cmax; u16 cmin; From 6353badbca863ad3c2c508dfb5b88b03e1c1d837 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:42 +0000 Subject: [PATCH 110/115] NVIDIA: VR: SAUCE: resctrl/mpam: bind MB_HLIM resource to MBA MPAM class When mpam_feat_mbw_max_hardlim_rw is present, share the MBA MPAM class with RDT_RESOURCE_MB_HLIM. Wire resctrl_arch_get_config and resctrl_arch_update_one for MB_HLIM, and tear down MBA and MB_HLIM controls together on class removal. Handle RDT_RESOURCE_MB_HLIM in fflags_from_resource() so creating info/MB_HLIM at resctrl mount does not hit the unhandled-rid WARN. (forward ported from commit 04e2ea34385215db0e00716ad4553c15084382f6 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 53 ++++++++++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 2 ++ 2 files changed, 55 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index f689ade7b2d5f..cab22e924e986 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1198,6 +1198,13 @@ static void mpam_resctrl_pick_mba(void) candidate_class->level); res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; res->class = candidate_class; + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, + &candidate_class->props)) { + struct mpam_resctrl_res *mbh = + &mpam_resctrl_controls[RDT_RESOURCE_MB_HLIM]; + + mbh->class = candidate_class; + } } } @@ -1489,6 +1496,14 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->ctrl_scope = RESCTRL_L3_CACHE; } + break; + case RDT_RESOURCE_MB_HLIM: + if (!mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cprops)) + break; + r->alloc_capable = true; + r->schema_fmt = RESCTRL_SCHEMA_MB_HLIM; + r->ctrl_scope = RESCTRL_L3_CACHE; + r->name = "MB_HLIM"; break; case 
RDT_RESOURCE_MBA: r->schema_fmt = RESCTRL_SCHEMA_PERCENT; @@ -1640,6 +1655,33 @@ static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, return 0; } +/* MB_HLIM schemata read: 0/1 per domain for current closid. */ +static u32 mpam_read_mbw_max_hardlim(struct rdt_resource *r, struct rdt_ctrl_domain *dom, + u32 closid, enum resctrl_conf_type type) +{ + struct mpam_resctrl_dom *m_dom; + struct mpam_config *cfg; + u32 partid; + + if (!mpam_is_enabled() || r->rid != RDT_RESOURCE_MB_HLIM) + return 0; + + partid = resctrl_get_config_index(closid, type); + if (partid >= resctrl_arch_get_num_closid(r)) + return 0; + + m_dom = container_of(dom, struct mpam_resctrl_dom, resctrl_ctrl_dom); + if (!m_dom->ctrl_comp || !m_dom->ctrl_comp->cfg) + return 0; + + cfg = &m_dom->ctrl_comp->cfg[partid]; + if (!mpam_has_feature(mpam_feat_mbw_max, cfg) && + !mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cfg)) + return 0; + + return cfg->mbw_max_hardlim ? 1 : 0; +} + u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -1679,6 +1721,8 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, case RDT_RESOURCE_L3_MAX: configured_by = mpam_feat_cmax_cmax; break; + case RDT_RESOURCE_MB_HLIM: + return mpam_read_mbw_max_hardlim(r, d, closid, type); case RDT_RESOURCE_MBA: if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { configured_by = mpam_feat_mbw_max; @@ -1761,6 +1805,15 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, mpam_set_feature(mpam_feat_mbw_max, &cfg); break; } + return -EINVAL; + case RDT_RESOURCE_MB_HLIM: + if (mpam_has_feature(mpam_feat_mbw_max_hardlim_rw, cprops) && + mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max_hardlim = cfg_val != 0; + mpam_set_feature(mpam_feat_mbw_max_hardlim_rw, &cfg); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } fallthrough; default: return -EINVAL; diff --git a/fs/resctrl/rdtgroup.c 
b/fs/resctrl/rdtgroup.c index f29fb759b789e..cc637c9aaed98 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -2568,6 +2568,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) return RFTYPE_RES_MB; case RDT_RESOURCE_PERF_PKG: return RFTYPE_RES_PERF_PKG; + case RDT_RESOURCE_MB_HLIM: + return 0; } return 0; From 1d481bd718ca9e25429cd35b83d8c5452999b359 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:47 +0000 Subject: [PATCH 111/115] NVIDIA: VR: SAUCE: resctrl: add membw.mb_max_lim and arch_has_mb_max_lim Add mb_max_lim and arch_has_mb_max_lim to struct resctrl_membw with kernel-doc: MPAM MAX_LIM encoding (MPAMF_MBW_IDR), invalid elsewhere unless arch sets arch_has_mb_max_lim. (cherry picked from commit 7939f1e41019d139cc11f2b93d6e8bd7ef4663d1 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- include/linux/resctrl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 46a6f2c16deb4..2287b7a3ee342 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -251,11 +251,15 @@ enum membw_throttle_mode { * @min_bw: Minimum memory bandwidth percentage user can request * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated + * @mb_max_lim: MPAM MAX_LIM encoding (MPAMF_MBW_IDR); invalid elsewhere + * @arch_has_mb_max_lim:True if mb_max_lim is supported */ struct resctrl_membw { u32 min_bw; u32 max_bw; u32 bw_gran; + u8 mb_max_lim; + bool arch_has_mb_max_lim; }; /** From 6fcf89fe046bc46db63358acd76544c33087740b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:47 +0000 Subject: [PATCH 112/115] NVIDIA: VR: SAUCE: resctrl/mpam: populate MBA mb_max_lim from MPAM probe When mpam_feat_mbw_max is present, set membw.mb_max_lim from cprops->mbw_max_lim and membw.arch_has_mb_max_lim so generic resctrl can expose max_lim. 
(forward ported from commit 05b9bc18310115182b70ec063229f20d601dc32c https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `drivers/resctrl/mpam_resctrl.c`; ] Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index cab22e924e986..06bc97a8d8a8d 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -1514,6 +1514,11 @@ static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) r->membw.max_bw = MAX_MBA_BW; r->membw.bw_gran = get_mba_granularity(cprops); + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + r->membw.mb_max_lim = cprops->mbw_max_lim; + r->membw.arch_has_mb_max_lim = true; + } + r->name = "MB"; r->alloc_capable = true; break; From cb558bd74005f6c55b1a6a4f4c52bde74d3a7e83 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:47 +0000 Subject: [PATCH 113/115] NVIDIA: VR: SAUCE: resctrl: add MBA max_lim sysfs and visibility from init Add rdt_mb_max_lim_show() and the max_lim entry in res_common_files[]. Add mb_max_lim_init() to call resctrl_file_fflags_init("max_lim", ...) when arch_has_mb_max_lim, and invoke it from resctrl_init() after io_alloc_init(). 
(forward ported from commit be13cad9cd1530470c9c7d96cb3665f7da8e1873 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) [fenghuay: - Resolve minor conflicts in `fs/resctrl/rdtgroup.c`; ] Signed-off-by: Fenghua Yu --- fs/resctrl/rdtgroup.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index cc637c9aaed98..f761a4fc948b1 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1298,6 +1298,17 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int rdt_mb_max_lim_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%d\n", r->membw.mb_max_lim); + + return 0; +} + static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -2126,6 +2137,12 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, }, + { + .name = "max_lim", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mb_max_lim_show, + }, /* * Platform specific which (if any) capabilities are provided by * thread_throttle_mode. Defer "fflags" initialization to platform @@ -2374,6 +2391,17 @@ static void io_alloc_init(void) } } +/* The resctrl file "max_lim" is added using MB resource if visible. 
*/ +static void mb_max_lim_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + if (!r->membw.arch_has_mb_max_lim) + return; + + resctrl_file_fflags_init("max_lim", RFTYPE_CTRL_INFO | RFTYPE_RES_MB); +} + void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; @@ -4841,6 +4869,8 @@ int resctrl_init(void) io_alloc_init(); + mb_max_lim_init(); + ret = resctrl_mon_init(); if (ret) return ret; From 65d7ed7fac3aa302a930064df6d6bf1312d5173a Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 6 Apr 2026 01:34:48 +0000 Subject: [PATCH 114/115] NVIDIA: VR: SAUCE: Documentation: resctrl: document max_lim and MB_HLIM for MPAM MBA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the MBA max_lim sysfs file, MB_HLIM schemata (0/1 per domain), and how they relate to MPAM MBW_MAX, HARDLIM, and MPAMF_MBW_IDR.MAX_LIM. Add schema_format for mb_hlim under the MB allocation info directory. max_lim is exposed as a single decimal integer (MPAMF_MBW_IDR.MAX_LIM [1:0], 0–3), matching rdt_mb_max_lim_show(). MB_HLIM appears when the probe treats HARDLIM as read/write, which this series ties to max_lim reading zero (see mpam_props_sync_mbw_max_hardlim_rw()). (cherry picked from commit 93e1b6a2b3729e1e2db818036829c498378b4f16 https://github.com/NVIDIA/NV-Kernels 24.04_linux-nvidia-6.17-next) Signed-off-by: Fenghua Yu --- Documentation/filesystems/resctrl.rst | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index e9ff59c2e57e8..26739383c0156 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -247,6 +247,19 @@ with respect to allocation: non-linear. This field is purely informational only. +"max_lim": + Read-only. 
On ARM MPAM systems where MBA exposes MBW_MAX, this + file contains a single decimal integer: the + ``MPAMF_MBW_IDR.MAX_LIM`` field [1:0] (values ``0``–``3``) as probed for the MBA resource. + The file appears only when the platform supports MBA MBW_MAX and + the MAX_LIM value is available; otherwise it is not listed. + + The Arm MPAM architecture defines the meaning of each MAX_LIM + encoding. In this kernel, when ``max_lim`` reads ``0``, the + driver treats the ``HARDLIM`` bit of ``MPAMCFG_MBW_MAX`` as + read/write and an optional ``MB_HLIM`` line may appear in + ``schemata``. When ``max_lim`` is nonzero, ``MB_HLIM`` is omitted. + "thread_throttle_mode": Indicator on Intel systems of how tasks running on threads of a physical core are throttled in cases where they @@ -963,6 +976,27 @@ Memory bandwidth domain is L3 cache. MB:=bw_MiBps0;=bw_MiBps1;... +MBW maximum hard limit (ARM MPAM) +--------------------------------- +On some ARM systems, resctrl memory bandwidth allocation uses MPAM +maximum bandwidth (MBW_MAX). When ``max_lim`` reads ``0`` (see ``max_lim`` +under the ``MB`` allocation ``info`` directory), an additional schemata +line selects the ``HARDLIM`` bit for ``MPAMCFG_MBW_MAX`` independently of +the numeric limit on the ``MB`` line. + +The line uses the same cache/domain indices as ``MB``. Each value must +be ``0`` or ``1``: ``0`` clears HARDLIM (soft-limit behaviour for the +max), ``1`` sets HARDLIM (hard limit). When ``max_lim`` is nonzero or +``MB_HLIM`` is not supported for the platform, the line is omitted from +``schemata``. + +Format: +:: + + MB_HLIM:=0|1;=0|1;... + +The corresponding ``schema_format`` entry under ``info`` is ``mb_hlim``. + Slow Memory Bandwidth Allocation (SMBA) --------------------------------------- AMD hardware supports Slow Memory Bandwidth Allocation (SMBA). 
From 475f355d0408698d9f632ba3b9a5dfb1f91e928b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 13 May 2026 23:30:03 +0000 Subject: [PATCH 115/115] NVIDIA: SAUCE: Fix mbm_L3_assign and mon_local_bytes Add local bytes counter in mpam_resctrl_counters[] to fix missing mbm_local_bytes monitoring on Grace. Add mon->assigned_counters check to enable mbm_L3_assignments config file on Grace. Signed-off-by: Fenghua Yu --- drivers/resctrl/mpam_resctrl.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c index 06bc97a8d8a8d..d30efff32bc7b 100644 --- a/drivers/resctrl/mpam_resctrl.c +++ b/drivers/resctrl/mpam_resctrl.c @@ -46,7 +46,7 @@ static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; * to those supported by MPAM. * Class pointer may be NULL. */ -#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID +#define MPAM_MAX_EVENT QOS_L3_MBM_LOCAL_EVENT_ID static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; #define for_each_mpam_resctrl_mon(mon, eventid) \ @@ -180,21 +180,6 @@ static void resctrl_reset_task_closids(void) read_unlock(&tasklist_lock); } -static struct mpam_resctrl_mon *mpam_resctrl_mon_from_res(struct mpam_resctrl_res *res) -{ - struct mpam_resctrl_mon *mon; - enum resctrl_event_id eventid; - - if (!res->class) - return NULL; - - for_each_mpam_resctrl_mon(mon, eventid) { - if (mon->class == res->class) - return mon; - } - return NULL; -} - static struct mpam_resctrl_res *mpam_resctrl_res_from_mon(struct mpam_resctrl_mon *mon) { struct mpam_resctrl_res *res; @@ -1425,15 +1410,22 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain * bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { struct mpam_resctrl_res *res; - struct mpam_resctrl_mon *mon; + enum resctrl_event_id evt; res = container_of(r, struct mpam_resctrl_res, resctrl_res); - mon = 
mpam_resctrl_mon_from_res(res); - if (!mon) - return false; + /* OCCUP shares the L3 class but has no MBWU assigned_counters. */ + for (evt = QOS_L3_MBM_TOTAL_EVENT_ID; evt <= QOS_L3_MBM_LOCAL_EVENT_ID; + evt++) { + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evt]; - return mon->assigned_counters ? true : false; + if (!mon->assigned_counters) + continue; + if (mpam_resctrl_res_from_mon(mon) == res) + return true; + } + + return false; } int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)