|
| 1 | +From 711b034c65bd0bec9c8c1c3862e61d7677e2e8fc Mon Sep 17 00:00:00 2001 |
| 2 | +From: Vertex X7-53 <vertex@glassway.net> |
| 3 | +Date: Sun, 17 Aug 2025 01:34:00 +0100 |
| 4 | +Subject: [PATCH] xen/pciback: Improve runtime power management |
| 5 | + |
| 6 | +An important part of S0ix runtime power management is the control of PCI device D-states. |
| 7 | +Without both the device and any applicable PCI bridges in D3cold, the PMC will |
| 8 | +keep power applied to the bus, and in most cases this will prevent the CPU from reaching states lower than Package C2. |
| 9 | + |
| 10 | +The vast majority of devices depend on PME (Power Management Events) to |
| 11 | +wake from D3cold, so Linux will not attempt to put them into deeper |
| 12 | +sleep states if it detects the device does not support PME. |
| 13 | +PMEs can be delivered in a variety of ways, including interrupts |
| 14 | +on the pcieport, ACPI events, and the setting of the PME status bit in |
| 15 | +the PCI configuration space. Up until now, Xen has not supported the |
| 16 | +passthrough of PMEs to domains, and has masked the relevant PME bits in the configuration space. |
| 17 | + |
| 18 | +This first patch is a modification to the dom0 kernel, specifically pciback. |
| 19 | +We enable support for runtime PM in pciback, to allow the dom0 kernel |
| 20 | +to suspend upstream bridges. Then we allow domains to read PME capability registers. |
| 21 | +When dom0 receives a PME, it forwards this to pciback, and pciback then sets |
| 22 | +a special emulated flag on the device. This flag is cleared when the guest |
| 23 | +writes 1 to the PME status bit after handling the event. We also respond to requests |
| 24 | +from the guest to change the power state, and place the device in a matching runtime PM state |
| 25 | +in dom0, so that dom0 can opportunistically suspend any upstream PCIe ports. |
| 26 | +--- |
| 27 | + .../xen/xen-pciback/conf_space_capability.c | 131 ++++++++++++------ |
| 28 | + drivers/xen/xen-pciback/pci_stub.c | 54 ++++---- |
| 29 | + drivers/xen/xen-pciback/pciback.h | 2 + |
| 30 | + 3 files changed, 120 insertions(+), 67 deletions(-) |
| 31 | + |
| 32 | +diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c |
| 33 | +index cf568e899ee2..ce93964fefad 100644 |
| 34 | +--- a/drivers/xen/xen-pciback/conf_space_capability.c |
| 35 | ++++ b/drivers/xen/xen-pciback/conf_space_capability.c |
| 36 | +@@ -8,8 +8,12 @@ |
| 37 | + |
| 38 | + #include <linux/kernel.h> |
| 39 | + #include <linux/pci.h> |
| 40 | ++#include <linux/pm.h> |
| 41 | ++#include <linux/pm_runtime.h> |
| 42 | ++#include <linux/pm_wakeup.h> |
| 43 | + #include "pciback.h" |
| 44 | + #include "conf_space.h" |
| 45 | ++#include "../../pci/pci.h" |
| 46 | + |
| 47 | + static LIST_HEAD(capabilities); |
| 48 | + struct xen_pcibk_config_capability { |
| 49 | +@@ -91,39 +95,108 @@ static const struct config_field caplist_vpd[] = { |
| 50 | + {} |
| 51 | + }; |
| 52 | + |
| 53 | +-static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, |
| 54 | ++static int pm_ctrl_read(struct pci_dev *dev, int offset, u16 *value, |
| 55 | + void *data) |
| 56 | + { |
| 57 | +- int err; |
| 58 | + u16 real_value; |
| 59 | + |
| 60 | +- err = pci_read_config_word(dev, offset, &real_value); |
| 61 | +- if (err) |
| 62 | +- goto out; |
| 63 | ++ pm_runtime_barrier(&dev->dev); |
| 64 | + |
| 65 | +- *value = real_value & ~PCI_PM_CAP_PME_MASK; |
| 66 | ++ const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); |
| 67 | + |
| 68 | +-out: |
| 69 | +- return err; |
| 70 | ++ /* Driver domains have no ability to wake devices from D3cold on their own, as they have no access to ACPI. |
| 71 | ++ * As a substitute, we fake D3hot to the guest so the register read succeeds. When the guest sends us a wakeup command, |
| 72 | ++ * we'll carry out the necessary steps to wake the device from D3cold using runtime PM functions. |
| 73 | ++ */ |
| 74 | ++ pci_read_config_word(dev, offset, &real_value); |
| 75 | ++ if (PCI_POSSIBLE_ERROR(real_value)) |
| 76 | ++ /* No soft reset needed by the guest, because the host side will perform one on transition out of D3cold. */ |
| 77 | ++ real_value = PCI_D3hot | PCI_PM_CTRL_NO_SOFT_RESET; |
| 78 | ++ |
| 79 | ++ if (dev_data->pme_enabled) |
| 80 | ++ real_value |= PCI_PM_CTRL_PME_ENABLE; |
| 81 | ++ if (dev_data->pme_status) |
| 82 | ++ real_value |= (PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE); |
| 83 | ++ |
| 84 | ++ *value = real_value; |
| 85 | ++ |
| 86 | ++ return 0; |
| 87 | + } |
| 88 | + |
| 89 | +-/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. |
| 90 | +- * Can't allow driver domain to enable PMEs - they're shared */ |
| 91 | +-#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) |
| 92 | ++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. */ |
| 93 | +/* The guest doesn't actually get to write the PME_ENABLE bit; the host does this during PM suspend. */ |
| 94 | ++#define PM_OK_BITS PCI_PM_CTRL_DATA_SEL_MASK |
| 95 | + |
| 96 | + static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, |
| 97 | + void *data) |
| 98 | + { |
| 99 | + int err; |
| 100 | ++ int pm_err; |
| 101 | + u16 old_value; |
| 102 | + pci_power_t new_state; |
| 103 | ++ pci_power_t current_state; |
| 104 | ++ |
| 105 | ++ pm_runtime_barrier(&dev->dev); |
| 106 | ++ |
| 107 | ++ /* PME status is RW1CS */ |
| 108 | ++ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); |
| 109 | ++ if (new_value & PCI_PM_CTRL_PME_STATUS) { |
| 110 | ++ dev_data->pme_status = 0; |
| 111 | ++ } |
| 112 | ++ |
| 113 | ++ bool pme_request = new_value & PCI_PM_CTRL_PME_ENABLE; |
| 114 | ++ bool pme_changed = !!dev_data->pme_enabled != pme_request; |
| 115 | ++ if (pme_changed) |
| 116 | ++ dev_data->pme_enabled = pme_request; |
| 117 | ++ |
| 118 | ++ if (pme_changed && pme_request) { |
| 119 | ++ dev_dbg(&dev->dev, "PME commanded to enabled\n"); |
| 120 | ++ pm_runtime_put_noidle(&dev->dev); |
| 121 | ++ } |
| 122 | ++ |
| 123 | ++ new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); |
| 124 | ++ |
| 125 | ++ /* First, use pm ops to transition state */ |
| 126 | ++ if (dev->current_state != new_state) |
| 127 | ++ dev_dbg(&dev->dev, "transitioning power state from %x to %x\n", dev->current_state, new_state); |
| 128 | ++ |
| 129 | ++ bool runtime_pm = pm_runtime_enabled(&dev->dev); |
| 130 | ++ if (runtime_pm) { |
| 131 | ++ if (dev->dev.power.runtime_status == RPM_SUSPENDED && new_state < PCI_D3hot) { |
| 132 | ++ pm_err = pm_runtime_resume(&dev->dev); |
| 133 | ++ if (pm_err < 0) dev_err(&dev->dev, "failed to resume device: %d\n", pm_err); |
| 134 | ++ } else if (dev->dev.power.runtime_status == RPM_ACTIVE && new_state >= PCI_D3hot) { |
| 135 | ++ pm_err = pm_runtime_suspend(&dev->dev); |
| 136 | ++ if (pm_err < 0) dev_err(&dev->dev, "failed to suspend device: %d\n", pm_err); |
| 137 | ++ } |
| 138 | ++ } |
| 139 | ++ |
| 140 | ++ current_state = dev->current_state; |
| 141 | ++ if (current_state == PCI_D3cold) |
| 142 | ++ current_state = PCI_D3hot; |
| 143 | ++ |
| 144 | ++ /* Otherwise, set it manually */ |
| 145 | ++ if (!runtime_pm || current_state != new_state) { |
| 146 | ++ err = pci_set_power_state(dev, new_state); |
| 147 | ++ if (err) { |
| 148 | ++ dev_err(&dev->dev, "failed to manually set pci power state to %x: %d\n", new_state, err); |
| 149 | ++ err = PCIBIOS_SET_FAILED; |
| 150 | ++ goto out; |
| 151 | ++ } |
| 152 | ++ } |
| 153 | + |
| 154 | ++ if (pme_changed && !pme_request) { |
| 155 | ++ /* Prevent ACPI from enabling suspend during runtime PM logic |
| 156 | ++ * If we disable runtime power management, we must also wake up the device */ |
| 157 | ++ dev_dbg(&dev->dev, "PME commanded to disabled\n"); |
| 158 | ++ pm_runtime_resume_and_get(&dev->dev); |
| 159 | ++ } |
| 160 | ++ |
| 161 | ++ /* This must happen here, after pm_runtime_resume is called */ |
| 162 | + err = pci_read_config_word(dev, offset, &old_value); |
| 163 | + if (err) |
| 164 | + goto out; |
| 165 | + |
| 166 | +- new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); |
| 167 | +- |
| 168 | + new_value &= PM_OK_BITS; |
| 169 | + if ((old_value & PM_OK_BITS) != new_value) { |
| 170 | + new_value = (old_value & ~PM_OK_BITS) | new_value; |
| 171 | +@@ -132,48 +205,20 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, |
| 172 | + goto out; |
| 173 | + } |
| 174 | + |
| 175 | +- /* Let pci core handle the power management change */ |
| 176 | +- dev_dbg(&dev->dev, "set power state to %x\n", new_state); |
| 177 | +- err = pci_set_power_state(dev, new_state); |
| 178 | +- if (err) { |
| 179 | +- err = PCIBIOS_SET_FAILED; |
| 180 | +- goto out; |
| 181 | +- } |
| 182 | +- |
| 183 | + out: |
| 184 | + return err; |
| 185 | + } |
| 186 | + |
| 187 | +-/* Ensure PMEs are disabled */ |
| 188 | +-static void *pm_ctrl_init(struct pci_dev *dev, int offset) |
| 189 | +-{ |
| 190 | +- int err; |
| 191 | +- u16 value; |
| 192 | +- |
| 193 | +- err = pci_read_config_word(dev, offset, &value); |
| 194 | +- if (err) |
| 195 | +- goto out; |
| 196 | +- |
| 197 | +- if (value & PCI_PM_CTRL_PME_ENABLE) { |
| 198 | +- value &= ~PCI_PM_CTRL_PME_ENABLE; |
| 199 | +- err = pci_write_config_word(dev, offset, value); |
| 200 | +- } |
| 201 | +- |
| 202 | +-out: |
| 203 | +- return err ? ERR_PTR(err) : NULL; |
| 204 | +-} |
| 205 | +- |
| 206 | + static const struct config_field caplist_pm[] = { |
| 207 | + { |
| 208 | + .offset = PCI_PM_PMC, |
| 209 | + .size = 2, |
| 210 | +- .u.w.read = pm_caps_read, |
| 211 | ++ .u.w.read = xen_pcibk_read_config_word, |
| 212 | + }, |
| 213 | + { |
| 214 | + .offset = PCI_PM_CTRL, |
| 215 | + .size = 2, |
| 216 | +- .init = pm_ctrl_init, |
| 217 | +- .u.w.read = xen_pcibk_read_config_word, |
| 218 | ++ .u.w.read = pm_ctrl_read, |
| 219 | + .u.w.write = pm_ctrl_write, |
| 220 | + }, |
| 221 | + { |
| 222 | +diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c |
| 223 | +index 073b259747e9..e61279255a85 100644 |
| 224 | +--- a/drivers/xen/xen-pciback/pci_stub.c |
| 225 | ++++ b/drivers/xen/xen-pciback/pci_stub.c |
| 226 | +@@ -18,6 +18,11 @@ |
| 227 | + #include <linux/wait.h> |
| 228 | + #include <linux/sched.h> |
| 229 | + #include <linux/atomic.h> |
| 230 | ++#include <linux/device.h> |
| 231 | ++#include <linux/pci.h> |
| 232 | ++#include <linux/pm.h> |
| 233 | ++#include <linux/pm_runtime.h> |
| 234 | ++#include <linux/pm_wakeup.h> |
| 235 | + #include <xen/events.h> |
| 236 | + #include <xen/pci.h> |
| 237 | + #include <xen/xen.h> |
| 238 | +@@ -153,6 +158,7 @@ static void pcistub_device_release(struct kref *kref) |
| 239 | + |
| 240 | + kfree(dev_data); |
| 241 | + pci_set_drvdata(dev, NULL); |
| 242 | ++ atomic_add_unless(&dev->dev.power.usage_count, 1, 1); |
| 243 | + |
| 244 | + /* Clean-up the device */ |
| 245 | + xen_pcibk_config_free_dyn_fields(dev); |
| 246 | +@@ -494,6 +500,7 @@ static int pcistub_init_device(struct pcistub_device *psdev) |
| 247 | + xen_pcibk_reset_device(dev); |
| 248 | + |
| 249 | + pci_set_dev_assigned(dev); |
| 250 | ++ |
| 251 | + return 0; |
| 252 | + |
| 253 | + config_release: |
| 254 | +@@ -1042,33 +1049,30 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) |
| 255 | + return; |
| 256 | + } |
| 257 | + |
| 258 | +-static int xen_pcibk_suspend_noirq(struct device *dev) { |
| 259 | +- // Imitate pci_pm_suspend_noirq but with per-device opt-in and force |
| 260 | +- // option. |
| 261 | ++static int xen_pcibk_prepare(struct device *dev) { |
| 262 | ++ // Clear PME bit |
| 263 | + struct pci_dev *pci_dev = to_pci_dev(dev); |
| 264 | + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev); |
| 265 | ++ dev_data->pme_status = 0; |
| 266 | + |
| 267 | +- pci_save_state(pci_dev); |
| 268 | ++ return 1; |
| 269 | ++} |
| 270 | + |
| 271 | +- if (dev_data->pm_suspend) { |
| 272 | +- if (pci_dev->skip_bus_pm || !pci_power_manageable(pci_dev)) { |
| 273 | +- if (!dev_data->pm_suspend_force) { |
| 274 | +- pci_info(pci_dev, "Skipping device suspend\n"); |
| 275 | +- return 0; |
| 276 | +- } else { |
| 277 | +- pci_info(pci_dev, "Forcing device suspend\n"); |
| 278 | +- } |
| 279 | +- } |
| 280 | +- int err = pci_prepare_to_sleep(pci_dev); |
| 281 | +- if (err) { |
| 282 | +- pci_err(pci_dev, "Suspending device failed: %i\n", err); |
| 283 | +- } else { |
| 284 | +- pci_info(pci_dev, "Device suspended. It's now in %s\n", |
| 285 | +- pci_power_name(pci_dev->current_state)); |
| 286 | +- } |
| 287 | +- } else { |
| 288 | +- pci_info(pci_dev, "Backend-side device suspend not enabled\n"); |
| 289 | +- } |
| 290 | ++/* Prevent resuspending the device until the PME is handled by the guest */ |
| 291 | ++static int xen_pcibk_pm_runtime_idle(struct device *dev) |
| 292 | ++{ |
| 293 | ++ struct pci_dev *pci_dev = to_pci_dev(dev); |
| 294 | ++ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev); |
| 295 | ++ return dev_data->pme_status ? -EBUSY : 0; |
| 296 | ++} |
| 297 | ++ |
| 298 | ++static int xen_pcibk_pm_runtime_resume(struct device *dev) |
| 299 | ++{ |
| 300 | ++ /* PME bit is always asserted on wakeup, regardless of whether the device supports it or not |
| 301 | ++ * This is a non-issue, since guest kernel logic will just wake up the device if it isn't already awake */ |
| 302 | ++ struct pci_dev *pci_dev = to_pci_dev(dev); |
| 303 | ++ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev); |
| 304 | ++ dev_data->pme_status = 1; |
| 305 | + |
| 306 | + return 0; |
| 307 | + } |
| 308 | +@@ -1082,7 +1086,9 @@ static const struct pci_error_handlers xen_pcibk_error_handler = { |
| 309 | + }; |
| 310 | + |
| 311 | + static const struct dev_pm_ops xen_pcibk_pm_ops = { |
| 312 | +- .suspend_noirq = xen_pcibk_suspend_noirq, |
| 313 | ++ .prepare = xen_pcibk_prepare, |
| 314 | ++ .runtime_idle = xen_pcibk_pm_runtime_idle, |
| 315 | ++ .runtime_resume = xen_pcibk_pm_runtime_resume, |
| 316 | + }; |
| 317 | + |
| 318 | + /* |
| 319 | +diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h |
| 320 | +index cf6df6964664..724f5f977231 100644 |
| 321 | +--- a/drivers/xen/xen-pciback/pciback.h |
| 322 | ++++ b/drivers/xen/xen-pciback/pciback.h |
| 323 | +@@ -56,6 +56,8 @@ struct xen_pcibk_dev_data { |
| 324 | + unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ |
| 325 | + unsigned int ack_intr:1; /* .. and ACK-ing */ |
| 326 | + unsigned long handled; |
| 327 | ++ unsigned int pme_enabled:1; |
| 328 | ++ unsigned int pme_status:1; |
| 329 | + unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */ |
| 330 | + char irq_name[]; /* xen-pcibk[000:04:00.0] */ |
| 331 | + }; |
| 332 | +-- |
| 333 | +2.49.0 |
| 334 | + |
0 commit comments