Skip to content

Commit 823c708

Browse files
Add patch for runtime PM and PMEs
1 parent ece7129 commit 823c708

2 files changed

Lines changed: 335 additions & 0 deletions

File tree

kernel.spec.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ Patch32: 0001-Revert-e1000e-change-k1-configuration-on-MTP-and-lat.patch
151151
Patch61: xen-events-Add-wakeup-support-to-xen-pirq.patch
152152
Patch62: xen-pm-use-suspend.patch
153153
Patch63: xen-pciback-pm-suspend.patch
154+
Patch64: xen-pciback-pm-runtime.patch
154155

155156
%description
156157
Qubes Dom0 kernel.

xen-pciback-pm-runtime.patch

Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
From 711b034c65bd0bec9c8c1c3862e61d7677e2e8fc Mon Sep 17 00:00:00 2001
2+
From: Vertex X7-53 <vertex@glassway.net>
3+
Date: Sun, 17 Aug 2025 01:34:00 +0100
4+
Subject: [PATCH] xen/pciback: Improve runtime power management
5+
6+
An important part of S0ix runtime power management is the control of PCI device D-states.
7+
Without both the device and any applicable PCI bridges in D3cold, the PMC will
8+
keep power applied to the bus, and in most cases this will prevent the CPU from reaching package C-states deeper than Package C2.
9+
10+
The vast majority of devices depend on PME (Power Management Events) to
11+
wake from D3cold, so Linux will not attempt to put them into deeper
12+
sleep states if it detects the device does not support PME.
13+
PMEs can be delivered in a variety of different ways, which include interrupts
14+
on the pcieport, ACPI events, and the setting of the PME status register in
15+
the PCI configuration space. Up until now, Xen has not supported the
16+
passthrough of PMEs to domains, and masks the relevant PME bits in the configuration space.
17+
18+
This first patch is a modification to the dom0 kernel, specifically pciback.
19+
We enable support for runtime PM in pciback, to allow the dom0 kernel
20+
to suspend upstream bridges. Then we allow domains to read PME capability registers.
21+
When dom0 receives a PME, it forwards this to pciback, and pciback then sets
22+
a special emulated flag on the device. This flag is cleared when the guest
23+
writes 1 to the PME status bit (it is write-1-to-clear), after handling the event. We also respond to requests
24+
from the guest to change the power state and place pciback in a PM state
25+
in dom0 depending on this, in order for dom0 to opportunistically suspend any upstream pciports.
26+
---
27+
.../xen/xen-pciback/conf_space_capability.c | 131 ++++++++++++------
28+
drivers/xen/xen-pciback/pci_stub.c | 54 ++++----
29+
drivers/xen/xen-pciback/pciback.h | 2 +
30+
3 files changed, 120 insertions(+), 67 deletions(-)
31+
32+
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
33+
index cf568e899ee2..ce93964fefad 100644
34+
--- a/drivers/xen/xen-pciback/conf_space_capability.c
35+
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
36+
@@ -8,8 +8,12 @@
37+
38+
#include <linux/kernel.h>
39+
#include <linux/pci.h>
40+
+#include <linux/pm.h>
41+
+#include <linux/pm_runtime.h>
42+
+#include <linux/pm_wakeup.h>
43+
#include "pciback.h"
44+
#include "conf_space.h"
45+
+#include "../../pci/pci.h"
46+
47+
static LIST_HEAD(capabilities);
48+
struct xen_pcibk_config_capability {
49+
@@ -91,39 +95,108 @@ static const struct config_field caplist_vpd[] = {
50+
{}
51+
};
52+
53+
-static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
54+
+static int pm_ctrl_read(struct pci_dev *dev, int offset, u16 *value,
55+
void *data)
56+
{
57+
- int err;
58+
u16 real_value;
59+
60+
- err = pci_read_config_word(dev, offset, &real_value);
61+
- if (err)
62+
- goto out;
63+
+ pm_runtime_barrier(&dev->dev);
64+
65+
- *value = real_value & ~PCI_PM_CAP_PME_MASK;
66+
+ const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
67+
68+
-out:
69+
- return err;
70+
+ /* Driver domains have no ability to wake devices from D3cold on their own, as they have no access to ACPI.
71+
+ * As a substitute, we fake D3hot to the guest so the register read succeeds. When the guest sends us a wakeup command,
72+
+ * we'll carry out the necessary steps to wake the device from D3cold using runtime PM functions.
73+
+ */
74+
+ pci_read_config_word(dev, offset, &real_value);
75+
+ if (PCI_POSSIBLE_ERROR(real_value))
76+
+ /* No soft reset needed by the guest, because the host side will perform one on transition out of D3cold. */
77+
+ real_value = PCI_D3hot | PCI_PM_CTRL_NO_SOFT_RESET;
78+
+
79+
+ if (dev_data->pme_enabled)
80+
+ real_value |= PCI_PM_CTRL_PME_ENABLE;
81+
+ if (dev_data->pme_status)
82+
+ real_value |= (PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE);
83+
+
84+
+ *value = real_value;
85+
+
86+
+ return 0;
87+
}
88+
89+
-/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
90+
- * Can't allow driver domain to enable PMEs - they're shared */
91+
-#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
92+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. */
93+
+/* The guest doesn't actually get to write to the PME_ENABLE register, the host does this in pm suspend */
94+
+#define PM_OK_BITS PCI_PM_CTRL_DATA_SEL_MASK
95+
96+
static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
97+
void *data)
98+
{
99+
int err;
100+
+ int pm_err;
101+
u16 old_value;
102+
pci_power_t new_state;
103+
+ pci_power_t current_state;
104+
+
105+
+ pm_runtime_barrier(&dev->dev);
106+
+
107+
+ /* PME status is RW1CS */
108+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
109+
+ if (new_value & PCI_PM_CTRL_PME_STATUS) {
110+
+ dev_data->pme_status = 0;
111+
+ }
112+
+
113+
+ bool pme_request = new_value & PCI_PM_CTRL_PME_ENABLE;
114+
+ bool pme_changed = !!dev_data->pme_enabled != pme_request;
115+
+ if (pme_changed)
116+
+ dev_data->pme_enabled = pme_request;
117+
+
118+
+ if (pme_changed && pme_request) {
119+
+ dev_dbg(&dev->dev, "PME commanded to enabled\n");
120+
+ pm_runtime_put_noidle(&dev->dev);
121+
+ }
122+
+
123+
+ new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
124+
+
125+
+ /* First, use pm ops to transition state */
126+
+ if (dev->current_state != new_state)
127+
+ dev_dbg(&dev->dev, "transitioning power state from %x to %x\n", dev->current_state, new_state);
128+
+
129+
+ bool runtime_pm = pm_runtime_enabled(&dev->dev);
130+
+ if (runtime_pm) {
131+
+ if (dev->dev.power.runtime_status == RPM_SUSPENDED && new_state < PCI_D3hot) {
132+
+ pm_err = pm_runtime_resume(&dev->dev);
133+
+ if (pm_err < 0) dev_err(&dev->dev, "failed to resume device: %d\n", pm_err);
134+
+ } else if (dev->dev.power.runtime_status == RPM_ACTIVE && new_state >= PCI_D3hot) {
135+
+ pm_err = pm_runtime_suspend(&dev->dev);
136+
+ if (pm_err < 0) dev_err(&dev->dev, "failed to suspend device: %d\n", pm_err);
137+
+ }
138+
+ }
139+
+
140+
+ current_state = dev->current_state;
141+
+ if (current_state == PCI_D3cold)
142+
+ current_state = PCI_D3hot;
143+
+
144+
+ /* Otherwise, set it manually */
145+
+ if (!runtime_pm || current_state != new_state) {
146+
+ err = pci_set_power_state(dev, new_state);
147+
+ if (err) {
148+
+ dev_err(&dev->dev, "failed to manually set pci power state to %x: %d\n", new_state, err);
149+
+ err = PCIBIOS_SET_FAILED;
150+
+ goto out;
151+
+ }
152+
+ }
153+
154+
+ if (pme_changed && !pme_request) {
155+
+ /* Prevent ACPI from enabling suspend during runtime PM logic
156+
+ * If we disable runtime power management, we must also wake up the device */
157+
+ dev_dbg(&dev->dev, "PME commanded to disabled\n");
158+
+ pm_runtime_resume_and_get(&dev->dev);
159+
+ }
160+
+
161+
+ /* This must happen here, after pm_runtime_resume is called */
162+
err = pci_read_config_word(dev, offset, &old_value);
163+
if (err)
164+
goto out;
165+
166+
- new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
167+
-
168+
new_value &= PM_OK_BITS;
169+
if ((old_value & PM_OK_BITS) != new_value) {
170+
new_value = (old_value & ~PM_OK_BITS) | new_value;
171+
@@ -132,48 +205,20 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
172+
goto out;
173+
}
174+
175+
- /* Let pci core handle the power management change */
176+
- dev_dbg(&dev->dev, "set power state to %x\n", new_state);
177+
- err = pci_set_power_state(dev, new_state);
178+
- if (err) {
179+
- err = PCIBIOS_SET_FAILED;
180+
- goto out;
181+
- }
182+
-
183+
out:
184+
return err;
185+
}
186+
187+
-/* Ensure PMEs are disabled */
188+
-static void *pm_ctrl_init(struct pci_dev *dev, int offset)
189+
-{
190+
- int err;
191+
- u16 value;
192+
-
193+
- err = pci_read_config_word(dev, offset, &value);
194+
- if (err)
195+
- goto out;
196+
-
197+
- if (value & PCI_PM_CTRL_PME_ENABLE) {
198+
- value &= ~PCI_PM_CTRL_PME_ENABLE;
199+
- err = pci_write_config_word(dev, offset, value);
200+
- }
201+
-
202+
-out:
203+
- return err ? ERR_PTR(err) : NULL;
204+
-}
205+
-
206+
static const struct config_field caplist_pm[] = {
207+
{
208+
.offset = PCI_PM_PMC,
209+
.size = 2,
210+
- .u.w.read = pm_caps_read,
211+
+ .u.w.read = xen_pcibk_read_config_word,
212+
},
213+
{
214+
.offset = PCI_PM_CTRL,
215+
.size = 2,
216+
- .init = pm_ctrl_init,
217+
- .u.w.read = xen_pcibk_read_config_word,
218+
+ .u.w.read = pm_ctrl_read,
219+
.u.w.write = pm_ctrl_write,
220+
},
221+
{
222+
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
223+
index 073b259747e9..e61279255a85 100644
224+
--- a/drivers/xen/xen-pciback/pci_stub.c
225+
+++ b/drivers/xen/xen-pciback/pci_stub.c
226+
@@ -18,6 +18,11 @@
227+
#include <linux/wait.h>
228+
#include <linux/sched.h>
229+
#include <linux/atomic.h>
230+
+#include <linux/device.h>
231+
+#include <linux/pci.h>
232+
+#include <linux/pm.h>
233+
+#include <linux/pm_runtime.h>
234+
+#include <linux/pm_wakeup.h>
235+
#include <xen/events.h>
236+
#include <xen/pci.h>
237+
#include <xen/xen.h>
238+
@@ -153,6 +158,7 @@ static void pcistub_device_release(struct kref *kref)
239+
240+
kfree(dev_data);
241+
pci_set_drvdata(dev, NULL);
242+
+ atomic_add_unless(&dev->dev.power.usage_count, 1, 1);
243+
244+
/* Clean-up the device */
245+
xen_pcibk_config_free_dyn_fields(dev);
246+
@@ -494,6 +500,7 @@ static int pcistub_init_device(struct pcistub_device *psdev)
247+
xen_pcibk_reset_device(dev);
248+
249+
pci_set_dev_assigned(dev);
250+
+
251+
return 0;
252+
253+
config_release:
254+
@@ -1042,33 +1049,30 @@ static void xen_pcibk_error_resume(struct pci_dev *dev)
255+
return;
256+
}
257+
258+
-static int xen_pcibk_suspend_noirq(struct device *dev) {
259+
- // Imitate pci_pm_suspend_noirq but with per-device opt-in and force
260+
- // option.
261+
+static int xen_pcibk_prepare(struct device *dev) {
262+
+ // Clear PME bit
263+
struct pci_dev *pci_dev = to_pci_dev(dev);
264+
struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
265+
+ dev_data->pme_status = 0;
266+
267+
- pci_save_state(pci_dev);
268+
+ return 1;
269+
+}
270+
271+
- if (dev_data->pm_suspend) {
272+
- if (pci_dev->skip_bus_pm || !pci_power_manageable(pci_dev)) {
273+
- if (!dev_data->pm_suspend_force) {
274+
- pci_info(pci_dev, "Skipping device suspend\n");
275+
- return 0;
276+
- } else {
277+
- pci_info(pci_dev, "Forcing device suspend\n");
278+
- }
279+
- }
280+
- int err = pci_prepare_to_sleep(pci_dev);
281+
- if (err) {
282+
- pci_err(pci_dev, "Suspending device failed: %i\n", err);
283+
- } else {
284+
- pci_info(pci_dev, "Device suspended. It's now in %s\n",
285+
- pci_power_name(pci_dev->current_state));
286+
- }
287+
- } else {
288+
- pci_info(pci_dev, "Backend-side device suspend not enabled\n");
289+
- }
290+
+/* Prevent resuspending the device until the PME is handled by the guest */
291+
+static int xen_pcibk_pm_runtime_idle(struct device *dev)
292+
+{
293+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
294+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
295+
+ return dev_data->pme_status ? -EBUSY : 0;
296+
+}
297+
+
298+
+static int xen_pcibk_pm_runtime_resume(struct device *dev)
299+
+{
300+
+ /* PME bit is always asserted on wakeup, regardless of whether the device supports it or not
301+
+ * This is a non-issue, since guest kernel logic will just wake up the device if it isn't already awake */
302+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
303+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
304+
+ dev_data->pme_status = 1;
305+
306+
return 0;
307+
}
308+
@@ -1082,7 +1086,9 @@ static const struct pci_error_handlers xen_pcibk_error_handler = {
309+
};
310+
311+
static const struct dev_pm_ops xen_pcibk_pm_ops = {
312+
- .suspend_noirq = xen_pcibk_suspend_noirq,
313+
+ .prepare = xen_pcibk_prepare,
314+
+ .runtime_idle = xen_pcibk_pm_runtime_idle,
315+
+ .runtime_resume = xen_pcibk_pm_runtime_resume,
316+
};
317+
318+
/*
319+
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
320+
index cf6df6964664..724f5f977231 100644
321+
--- a/drivers/xen/xen-pciback/pciback.h
322+
+++ b/drivers/xen/xen-pciback/pciback.h
323+
@@ -56,6 +56,8 @@ struct xen_pcibk_dev_data {
324+
unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
325+
unsigned int ack_intr:1; /* .. and ACK-ing */
326+
unsigned long handled;
327+
+ unsigned int pme_enabled:1;
328+
+ unsigned int pme_status:1;
329+
unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
330+
char irq_name[]; /* xen-pcibk[000:04:00.0] */
331+
};
332+
--
333+
2.49.0
334+

0 commit comments

Comments
 (0)