From 38c6eb3eed5281d6a68e0ef4a154d9394fa4a60d Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:43 +0800 Subject: [PATCH 01/51] crypto: hisilicon - qm updates BAR configuration On new platforms greater than QM_HW_V3, the configuration region for the live migration function of the accelerator device is no longer placed in the VF, but is instead placed in the PF. Therefore, the configuration region of the live migration function needs to be opened when the QM driver is loaded. When the QM driver is uninstalled, the driver needs to clear this configuration. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251030015744.131771-2-liulongfang@huawei.com Signed-off-by: Alex Williamson (cherry picked from commit 4868d2d52df6f724b01531843805a3b1322e2dd9) Signed-off-by: Jiandi An --- drivers/crypto/hisilicon/qm.c | 27 +++++++++++++++++++++++++++ include/linux/hisi_acc_qm.h | 3 +++ 2 files changed, 30 insertions(+) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index dae2e4c36e53a..cba2146b357d6 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3005,11 +3005,36 @@ static void qm_put_pci_res(struct hisi_qm *qm) pci_release_mem_regions(pdev); } +static void hisi_mig_region_clear(struct hisi_qm *qm) +{ + u32 val; + + /* Clear migration region set of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val &= ~QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + +static void hisi_mig_region_enable(struct hisi_qm *qm) +{ + u32 val; + + /* Select migration region of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val |= QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + static void hisi_qm_pci_uninit(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; 
pci_free_irq_vectors(pdev); + hisi_mig_region_clear(qm); qm_put_pci_res(qm); pci_disable_device(pdev); } @@ -5696,6 +5721,7 @@ int hisi_qm_init(struct hisi_qm *qm) goto err_free_qm_memory; qm_cmd_init(qm); + hisi_mig_region_enable(qm); return 0; @@ -5834,6 +5860,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) } qm_cmd_init(qm); + hisi_mig_region_enable(qm); hisi_qm_dev_err_init(qm); /* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */ writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG); diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 0c4c84b8c3be9..182fd7e70f92a 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -99,6 +99,9 @@ #define QM_DEV_ALG_MAX_LEN 256 +#define QM_MIG_REGION_SEL 0x100198 +#define QM_MIG_REGION_EN BIT(0) + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ From 0c7d3823241025d101fefb114cbe8df080bc289d Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:44 +0800 Subject: [PATCH 02/51] hisi_acc_vfio_pci: adapt to new migration configuration On new platforms greater than QM_HW_V3, the migration region has been relocated from the VF to the PF. The VF's own configuration space is restored to the complete 64KB, and there is no need to divide the size of the BAR configuration space equally. The driver should be modified accordingly to adapt to the new hardware device. On the older hardware platform QM_HW_V3, the live migration configuration region is placed in the latter 32K portion of the VF's BAR2 configuration space. On the new hardware platform QM_HW_V4, the live migration configuration region also exists in the same 32K area immediately following the VF's BAR2, just like on QM_HW_V3. However, access to this region is now controlled by hardware. Additionally, a copy of the live migration configuration region is present in the PF's BAR2 configuration space. 
On the new hardware platform QM_HW_V4, when an older version of the driver is loaded, it behaves like QM_HW_V3 and uses the configuration region in the VF, ensuring that the live migration function continues to work normally. When the new version of the driver is loaded, it directly uses the configuration region in the PF. Meanwhile, hardware configuration disables the live migration configuration region in the VF's BAR2: reads return all 0xF values, and writes are silently ignored. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20251030015744.131771-3-liulongfang@huawei.com Signed-off-by: Alex Williamson (cherry picked from commit 2131c1517f3004da208b7f1a3b06b8119172e194) Signed-off-by: Jiandi An --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 130 +++++++++++++----- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 23 +++- 2 files changed, 114 insertions(+), 39 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fde33f54e99ec..498cb7d1c9e50 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) return 0; } +static void qm_xqc_reg_offsets(struct hisi_qm *qm, + u32 *eqc_addr, u32 *aeqc_addr) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + container_of(qm, struct hisi_acc_vf_core_device, vf_qm); + + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) { + *eqc_addr = QM_EQC_VF_DW0; + *aeqc_addr = QM_AEQC_VF_DW0; + } else { + *eqc_addr = QM_EQC_PF_DW0; + *aeqc_addr = QM_AEQC_PF_DW0; + } +} + static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + 
qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_AEQC_DW\n"); return ret; @@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; /* Check VF state */ @@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_AEQC_DW\n"); return ret; @@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct pci_dev *vf_dev = vdev->pdev; + u32 val; - /* - * ACC VF dev BAR2 region consists of both functional register space - * and migration control register space. For migration to work, we - * need access to both. Hence, we map the entire BAR2 region here. - * But unnecessarily exposing the migration BAR region to the Guest - * has the potential to prevent/corrupt the Guest migration. 
Hence, - * we restrict access to the migration control space from - * Guest(Please see mmap/ioctl/read/write override functions). - * - * Please note that it is OK to expose the entire VF BAR if migration - * is not supported or required as this cannot affect the ACC PF - * configurations. - * - * Also the HiSilicon ACC VF devices supported by this driver on - * HiSilicon hardware platforms are integrated end point devices - * and the platform lacks the capability to perform any PCIe P2P - * between these devices. - */ + val = readl(pf_qm->io_base + QM_MIG_REGION_SEL); + if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN)) + hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL; + else + hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL; - vf_qm->io_base = - ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), - pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); - if (!vf_qm->io_base) - return -EIO; + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) { + /* + * On hardware platforms greater than QM_HW_V3, the migration function + * register is placed in the BAR2 configuration region of the PF, + * and each VF device occupies 8KB of configuration space. + */ + vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET + + hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE; + } else { + /* + * ACC VF dev BAR2 region consists of both functional register space + * and migration control register space. For migration to work, we + * need access to both. Hence, we map the entire BAR2 region here. + * But unnecessarily exposing the migration BAR region to the Guest + * has the potential to prevent/corrupt the Guest migration. Hence, + * we restrict access to the migration control space from + * Guest(Please see mmap/ioctl/read/write override functions). + * + * Please note that it is OK to expose the entire VF BAR if migration + * is not supported or required as this cannot affect the ACC PF + * configurations. 
+ * + * Also the HiSilicon ACC VF devices supported by this driver on + * HiSilicon hardware platforms are integrated end point devices + * and the platform lacks the capability to perform any PCIe P2P + * between these devices. + */ + vf_qm->io_base = + ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), + pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); + if (!vf_qm->io_base) + return -EIO; + } vf_qm->fun_type = QM_HW_VF; + vf_qm->ver = pf_qm->ver; vf_qm->pdev = vf_dev; mutex_init(&vf_qm->mailbox_lock); @@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev) return !IS_ERR(pf_qm) ? pf_qm : NULL; } +static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev, + unsigned int index) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + hisi_acc_drvdata(vdev->pdev); + + /* + * On the old HW_ACC_MIG_VF_CTRL mode device, the ACC VF device + * BAR2 region encompasses both functional register space + * and migration control register space. + * only the functional region should be report to Guest. + */ + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + return (pci_resource_len(vdev->pdev, index) >> 1); + /* + * On the new HW device, the migration control register + * has been moved to the PF device BAR2 region. + * The VF device BAR2 is entirely functional register space. 
+ */ + return pci_resource_len(vdev->pdev, index); +} + static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, size_t count, loff_t *ppos, size_t *new_count) @@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, if (index == VFIO_PCI_BAR2_REGION_INDEX) { loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); /* Check if access is for migration control region */ if (pos >= end) return -EINVAL; @@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev, index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); if (index == VFIO_PCI_BAR2_REGION_INDEX) { u64 req_len, pgoff, req_start; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); req_len = vma->vm_end - vma->vm_start; pgoff = vma->vm_pgoff & ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); @@ -1330,7 +1391,6 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; unsigned long minsz; @@ -1345,12 +1405,7 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - /* - * ACC VF dev BAR2 region consists of both functional - * register space and migration control register space. - * Report only the functional region to Guest. 
- */ - info.size = pci_resource_len(pdev, info.index) / 2; + info.size = hisi_acc_get_resource_len(vdev, info.index); info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | @@ -1521,7 +1576,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; - iounmap(vf_qm->io_base); + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + iounmap(vf_qm->io_base); mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 91002ceeebc18..cd55eba64dfb2 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -50,8 +50,10 @@ #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 -#define QM_EQC_DW0 0X8000 -#define QM_AEQC_DW0 0X8020 +#define QM_EQC_VF_DW0 0X8000 +#define QM_AEQC_VF_DW0 0X8020 +#define QM_EQC_PF_DW0 0x1c00 +#define QM_AEQC_PF_DW0 0x1c20 #define ACC_DRV_MAJOR_VER 1 #define ACC_DRV_MINOR_VER 0 @@ -59,6 +61,22 @@ #define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC #define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE +#define QM_MIG_REGION_OFFSET 0x180000 +#define QM_MIG_REGION_SIZE 0x2000 + +/** + * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the latter 32KB of the VF's BAR2. + * The Guest is only provided with the first 32KB of the VF's BAR2. + * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the PF's BAR2, and the entire 64KB + * of the VF's BAR2 is allocated to the Guest. 
+ */ +enum hw_drv_mode { + HW_ACC_MIG_VF_CTRL = 0, + HW_ACC_MIG_PF_CTRL, +}; + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + enum hw_drv_mode drv_mode; /* * vf_qm_state represents the QM_VF_STATE register value. * It is set by Guest driver for the ACC VF dev indicating From 449e051b54c225d39dbe2a19ba4d94604991f452 Mon Sep 17 00:00:00 2001 From: Morduan Zang Date: Thu, 14 Aug 2025 19:03:58 +0800 Subject: [PATCH 03/51] vfio/nvgrace-gpu: fix grammatical error The word "as" in the comment should be replaced with "is", and there is an extra space in the comment. Signed-off-by: Morduan Zang Reviewed-by: Ankit Agrawal Link: https://lore.kernel.org/r/54E1ED6C5A2682C8+20250814110358.285412-1-zhangdandan@uniontech.com Signed-off-by: Alex Williamson (cherry picked from commit 767b1ed8b980498978c77dc89497602ae3421af5) Signed-off-by: Jiandi An --- drivers/vfio/pci/nvgrace-gpu/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a9c54ff8498ab..17a64b836da48 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -544,7 +544,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); /* * The region memory size may not be power-of-2 aligned. - * Given that the memory as a BAR and may not be + * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. 
*/ info.size = memregion->bar_size; From 897cefa739f7d142c3166a064e2edf06dcbe1d34 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:17 -0400 Subject: [PATCH 04/51] vfio: Provide a get_region_info op Instead of hooking the general ioctl op, have the core code directly decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it. This is intended to allow mechanical changes to the drivers to pull their VFIO_DEVICE_GET_REGION_INFO into a function. Later patches will improve the function signature to consolidate more code. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 113557b0406818a8a5df3479b0a89125d2b2a04c) Signed-off-by: Jiandi An --- drivers/vfio/pci/vfio_pci_core.c | 9 ++++++--- drivers/vfio/vfio_main.c | 7 +++++++ include/linux/vfio.h | 2 ++ include/linux/vfio_pci_core.h | 2 ++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9339920793a7c..41a0beab4eacf 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1024,9 +1024,11 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ?
-EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; @@ -1160,6 +1162,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1486,7 +1489,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); + return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 715368076a1fe..9a9decea5c992 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1295,7 +1295,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + if (!device->ops->get_region_info) + goto ioctl_fallback; + ret = device->ops->get_region_info(device, uptr); + break; + default: +ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee5..be5fcf8432e8d 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -132,6 +132,8 @@ struct vfio_device_ops { size_t count, loff_t 
*size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); + int (*get_region_info)(struct vfio_device *vdev, + struct vfio_region_info __user *arg); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 46ab6075ab75a..00a007eedbe85 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -128,6 +128,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, From 6b97c1b33bef36145a5d380a00194020fdbe639c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:18 -0400 Subject: [PATCH 05/51] vfio/hisi: Convert to the get_region_info op Change the function signature of hisi_acc_vfio_pci_ioctl() and re-indent it. 
Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (backported from commit e238f147d517681e5dcb3dc9ab149211d66b00a1) [jan: resolve minor conflict in hisi_acc_vfio_pci_ioctl()] Signed-off-by: Jiandi An --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 45 +++++++++---------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 498cb7d1c9e50..590aa47dce759 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1385,37 +1385,33 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, return vfio_pci_core_read(core_vdev, buf, new_count, ppos); } -static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { - if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct vfio_region_info info; + unsigned long minsz; - minsz = offsetofend(struct vfio_region_info, offset); + minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + if (info.index != VFIO_PCI_BAR2_REGION_INDEX) + return 
vfio_pci_ioctl_get_region_info(core_vdev, arg); - info.size = hisi_acc_get_resource_len(vdev, info.index); + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP; + info.size = hisi_acc_get_resource_len(vdev, info.index); - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } - } - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1613,7 +1609,8 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, - .ioctl = hisi_acc_vfio_pci_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, From fad0d0d38ca4235e8ed94052e7af576600b922db Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:19 -0400 Subject: [PATCH 06/51] vfio/virtio: Convert to the get_region_info op Remove virtiovf_vfio_pci_core_ioctl() and change the signature of virtiovf_pci_ioctl_get_region_info(). 
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit c044eefa47864fb436254cb330e8d90cb6a3a870) Signed-off-by: Jiandi An --- drivers/vfio/pci/virtio/common.h | 4 +--- drivers/vfio/pci/virtio/legacy_io.c | 20 ++++---------------- drivers/vfio/pci/virtio/main.c | 3 ++- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index c7d7e27af386e..a10f2d92cb623 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -109,10 +109,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); + struct vfio_region_info __user *arg); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 832af5ba267c4..d735d5c4bd777 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,15 +281,14 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) + struct vfio_region_info __user *arg) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; struct vfio_region_info info 
= {}; - if (copy_from_user(&info, uarg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -301,20 +300,9 @@ int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, info.size = virtvdev->bar0_virtual_buf_size; info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, arg); } } diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 8084f3e36a9f7..92b525e52abe4 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -108,7 +108,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, - .ioctl = virtiovf_vfio_pci_core_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, From 702622746ce417ccf5d724f716ae2a0ef854e14a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:20 -0400 Subject: [PATCH 07/51] vfio/nvgrace: Convert to the get_region_info op Change the signature of nvgrace_gpu_ioctl_get_region_info() Reviewed-by: Kevin Tian Reviewed-by: Ankit Agrawal Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com 
Signed-off-by: Alex Williamson (cherry picked from commit 5ac7206474777dff56f79f1d6bc9973e988f7587) Signed-off-by: Jiandi An --- drivers/vfio/pci/nvgrace-gpu/main.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 17a64b836da48..94bb167c99648 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -489,9 +489,9 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static long +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned long arg) + struct vfio_region_info __user *arg) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, @@ -504,7 +504,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, u32 size; int ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -516,8 +516,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, */ memregion = nvgrace_gpu_memregion(info.index, nvdev); if (!memregion) - return vfio_pci_core_ioctl(core_vdev, - VFIO_DEVICE_GET_REGION_INFO, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, arg); size = struct_size(sparse, areas, 1); @@ -569,16 +568,13 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, } kfree(caps.buf); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); case VFIO_DEVICE_IOEVENTFD: return -ENOTTY; case VFIO_DEVICE_RESET: @@ -1002,6 +998,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, + .get_region_info = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, From e54b8e086acd0987073d7cfff2e4cf215fbaed1a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:21 -0400 Subject: [PATCH 08/51] vfio/pci: Fill in the missing get_region_info ops Now that every variant driver provides a get_region_info op remove the ioctl based dispatch from vfio_pci_core_ioctl(). Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit f3fddb71dd50ed31ae474238551e8623a1bc16db) Signed-off-by: Jiandi An --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + drivers/vfio/pci/mlx5/main.c | 1 + drivers/vfio/pci/nvgrace-gpu/main.c | 1 + drivers/vfio/pci/pds/vfio_dev.c | 1 + drivers/vfio/pci/qat/main.c | 1 + drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_core.c | 2 -- drivers/vfio/pci/virtio/main.c | 2 ++ 8 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 590aa47dce759..6565b6d57fece 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1630,6 +1630,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = 
hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ec47e736a8e5..b7f941f8047ea 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 94bb167c99648..77efa99597cfb 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -1019,6 +1019,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f3ccb0008f675..1946bc75d99b4 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 
a19b68043eb2e..8452d9c1d11d3 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2f..2d9122efc10ba 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,6 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 41a0beab4eacf..fe031c53cc25d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1488,8 +1488,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 92b525e52abe4..d68096bc52521 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + 
.get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -131,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, From 4df20815cb64d0ec749cd1af4fc214e2784e987a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:22 -0400 Subject: [PATCH 09/51] vfio/mtty: Provide a get_region_info op Move it out of mtty_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 0787755271096e6c48019f44aea6ccc33f93bf41) Signed-off-by: Jiandi An --- samples/vfio-mdev/mtty.c | 53 ++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 59eefe2fed10c..b27f9b93471bc 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1785,6 +1785,34 @@ static int mtty_get_device_info(struct vfio_device_info *dev_info) return 0; } +static int mtty_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct vfio_region_info info; + void *cap_type = NULL; + u16 cap_type_id = 0; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); + if 
(ret) + return ret; + + if (copy_to_user(arg, &info, minsz)) + return -EFAULT; + return 0; +} + static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -1817,30 +1845,6 @@ static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, return 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - u16 cap_type_id = 0; - void *cap_type = NULL; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, - &cap_type); - if (ret) - return ret; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { @@ -1949,6 +1953,7 @@ static const struct vfio_device_ops mtty_dev_ops = { .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, + .get_region_info = mtty_ioctl_get_region_info, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, From 0fbfd736592c64da19d694889f43a36993c53f9e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:23 -0400 Subject: [PATCH 10/51] vfio/mdpy: Provide a get_region_info op Move it out of mdpy_ioctl() and re-indent it. 
Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit cf16acc0af09a12d46e972f42186d94931325ea8) Signed-off-by: Jiandi An --- samples/vfio-mdev/mdpy.c | 53 ++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 8104831ae125b..0c65ed2217386 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -512,6 +512,34 @@ static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, return 0; } +static int mdpy_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct vfio_region_info info; + void *cap_type = NULL; + u16 cap_type_id = 0; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); + if (ret) + return ret; + + if (copy_to_user(arg, &info, minsz)) + return -EFAULT; + return 0; +} + static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -544,30 +572,6 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, return 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - u16 cap_type_id = 0; - void *cap_type = NULL; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, - &cap_type); - if (ret) - return ret; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return 
-EFAULT; - - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { @@ -665,6 +669,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { .read = mdpy_read, .write = mdpy_write, .ioctl = mdpy_ioctl, + .get_region_info = mdpy_ioctl_get_region_info, .mmap = mdpy_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From 554dca9a1de1a5c7d25d2732885d142fe898dead Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:24 -0400 Subject: [PATCH 11/51] vfio/mbochs: Provide a get_region_info op Move it out of mbochs_ioctl() and re-indent it. Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 8339fccda83772ff1b035f848d1f992410c61c57) Signed-off-by: Jiandi An --- samples/vfio-mdev/mbochs.c | 56 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 18623ba666e33..7f889b31fa2ce 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1185,13 +1185,42 @@ static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) return dma_buf_fd(dmabuf->buf, 0); } +static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct vfio_region_info_ext info; + unsigned long minsz, outsz; + int ret; + + minsz = offsetofend(typeof(info), base.offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + outsz = info.base.argsz; + if (outsz < minsz) + return -EINVAL; + if (outsz > sizeof(info)) + return -EINVAL; + + ret = mbochs_get_region_info(mdev_state, &info); + if (ret) + return ret; + + if (copy_to_user(arg, &info, outsz)) + return -EFAULT; + return 0; +} + static long 
mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); int ret = 0; - unsigned long minsz, outsz; + unsigned long minsz; switch (cmd) { case VFIO_DEVICE_GET_INFO: @@ -1215,30 +1244,6 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, return 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info_ext info; - - minsz = offsetofend(typeof(info), base.offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - outsz = info.base.argsz; - if (outsz < minsz) - return -EINVAL; - if (outsz > sizeof(info)) - return -EINVAL; - - ret = mbochs_get_region_info(mdev_state, &info); - if (ret) - return ret; - - if (copy_to_user((void __user *)arg, &info, outsz)) - return -EFAULT; - - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { @@ -1376,6 +1381,7 @@ static const struct vfio_device_ops mbochs_dev_ops = { .read = mbochs_read, .write = mbochs_write, .ioctl = mbochs_ioctl, + .get_region_info = mbochs_ioctl_get_region_info, .mmap = mbochs_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From 073f13c17982c7e78d329e61621e17b7ef7b1ef1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:25 -0400 Subject: [PATCH 12/51] vfio/platform: Provide a get_region_info op Move it out of vfio_platform_ioctl() and re-indent it. Add it to all platform drivers. 
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Reviewed-by: Eric Auger Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit d4635df279f57de1dd301b864724a930551028b2) Signed-off-by: Jiandi An --- drivers/vfio/platform/vfio_amba.c | 1 + drivers/vfio/platform/vfio_platform.c | 1 + drivers/vfio/platform/vfio_platform_common.c | 50 +++++++++++-------- drivers/vfio/platform/vfio_platform_private.h | 2 + 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index ff8ff8480968c..a234ac292be3b 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -113,6 +113,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 512533501eb7f..0e85c914b6510 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3bf1043cd7957..3ebd50fb78fbb 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -272,6 +272,34 @@ int 
vfio_platform_open_device(struct vfio_device *core_vdev) } EXPORT_SYMBOL_GPL(vfio_platform_open_device); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + struct vfio_region_info info; + unsigned long minsz; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= vdev->num_regions) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); + long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -300,28 +328,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 8d8fab5168490..a6008320e77ba 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -85,6 +85,8 @@ int vfio_platform_open_device(struct vfio_device *core_vdev); void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); From 8ba94bf6a94e31769c0d5c71a7c238b968eb7d33 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:26 -0400 Subject: [PATCH 13/51] vfio/fsl: Provide a get_region_info op Move it out of vfio_fsl_mc_ioctl() and re-indent it. 
Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/10-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 6cdae5d0c326f0ec8f0f50571df0374f2f0b8815) Signed-off-by: Jiandi An --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 56 ++++++++++++++++++------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index f65d91c01f2ec..718c938f63a02 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,6 +117,37 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } +static int +vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = vdev->mc_dev; + struct vfio_region_info info; + unsigned long minsz; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= mc_dev->obj_desc.region_count) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + if (copy_to_user(arg, &info, minsz)) + return -EFAULT; + return 0; +} + static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -149,30 +180,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -587,6 +594,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, + .get_region_info = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, From 619333df0ce80e4334261826420bc8cdd73975e9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:27 -0400 Subject: [PATCH 14/51] vfio/cdx: Provide a get_region_info op Change the signature of vfio_cdx_ioctl_get_region_info() and hook it to the op. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit b9827eff6b4aab8203a1aa720ae3a6d3731f1a4d) Signed-off-by: Jiandi An --- drivers/vfio/cdx/main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 5dd5f5ad76865..506d849139d35 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -129,9 +129,11 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; } -static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, +static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, struct vfio_region_info __user *arg) { + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); struct vfio_region_info info; @@ -219,8 +221,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, switch (cmd) { case VFIO_DEVICE_GET_INFO: return vfio_cdx_ioctl_get_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_cdx_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_GET_IRQ_INFO: return vfio_cdx_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_SET_IRQS: @@ -284,6 +284,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .get_region_info = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, From 76b5171d117de8a2778210b5395dc30e2b124825 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:28 -0400 Subject: [PATCH 15/51] vfio/ccw: Provide a get_region_info op Move it out of vfio_ccw_mdev_ioctl() and re-indent it. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/12-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 61b3f7b5a72905e45f5d7769472edea8d80b56ef) Signed-off-by: Jiandi An --- drivers/s390/cio/vfio_ccw_ops.c | 44 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index ea532a8a4a0c2..6d46e0bc76df1 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -504,6 +504,31 @@ void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private) private->region = NULL; } +static int +vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info __user *arg) +{ + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); + struct vfio_region_info info; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + ret = vfio_ccw_mdev_get_region_info(private, &info, arg); + if (ret) + return ret; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) @@ -532,24 +557,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = vfio_ccw_mdev_get_region_info(private, &info, arg); - if (ret) - return ret; - - return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -627,6 +634,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .read = vfio_ccw_mdev_read, .write = vfio_ccw_mdev_write, .ioctl = vfio_ccw_mdev_ioctl, + .get_region_info = vfio_ccw_mdev_ioctl_get_region_info, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, From 6c250ce18f9e27527b0badd93bee52c74668e94c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:29 -0400 Subject: [PATCH 16/51] vfio/gvt: Provide a get_region_info op Move it out of intel_vgpu_ioctl() and re-indent it. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/13-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit e664067b603570164794facf06686434a12cb74d) Signed-off-by: Jiandi An --- drivers/gpu/drm/i915/gvt/kvmgt.c | 295 +++++++++++++++---------------- 1 file changed, 147 insertions(+), 148 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 69830a5c49d3f..1feb2a28ca5fd 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1140,185 +1140,183 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags, return func(vgpu, index, start, count, flags, data); } -static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, - unsigned long arg) +static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, + struct vfio_region_info __user *arg) { + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + struct vfio_region_info info; unsigned long minsz; + int nr_areas = 1; + int cap_type_id; + unsigned int i; + int ret; - gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd); - - if (cmd == 
VFIO_DEVICE_GET_INFO) { - struct vfio_device_info info; + minsz = offsetofend(struct vfio_region_info, offset); - minsz = offsetofend(struct vfio_device_info, num_irqs); + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; - if (info.argsz < minsz) - return -EINVAL; + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->gvt->device_info.cfg_space_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->cfg_space.bar[info.index].size; + if (!info.size) { + info.flags = 0; + break; + } - info.flags = VFIO_DEVICE_FLAGS_PCI; - info.flags |= VFIO_DEVICE_FLAGS_RESET; - info.num_regions = VFIO_PCI_NUM_REGIONS + - vgpu->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info.size = gvt_aperture_sz(vgpu->gvt); + + sparse = kzalloc(struct_size(sparse, areas, nr_areas), + GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->areas[0].offset = + PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].size = vgpu_aperture_sz(vgpu); + break; - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; + case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned int i; - int ret; - struct vfio_region_info_cap_sparse_mmap *sparse = NULL; - int nr_areas = 1; - int cap_type_id; + gvt_dbg_core("get region info bar:%d\n", info.index); + break; - minsz = offsetofend(struct vfio_region_info, offset); + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0; + info.flags = 0; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + gvt_dbg_core("get region info index:%d\n", info.index); + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; - if (info.argsz < minsz) + if (info.index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) return -EINVAL; + info.index = array_index_nospec( + info.index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->gvt->device_info.cfg_space_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->cfg_space.bar[info.index].size; - if (!info.size) { - info.flags = 0; - break; - } - - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR1_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - break; - case VFIO_PCI_BAR2_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_CAPS | - 
VFIO_REGION_INFO_FLAG_MMAP | - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - info.size = gvt_aperture_sz(vgpu->gvt); - - sparse = kzalloc(struct_size(sparse, areas, nr_areas), - GFP_KERNEL); - if (!sparse) - return -ENOMEM; - - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->nr_areas = nr_areas; - cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->areas[0].offset = - PAGE_ALIGN(vgpu_aperture_offset(vgpu)); - sparse->areas[0].size = vgpu_aperture_sz(vgpu); - break; + i = info.index - VFIO_PCI_NUM_REGIONS; - case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vgpu->region[i].size; + info.flags = vgpu->region[i].flags; - gvt_dbg_core("get region info bar:%d\n", info.index); - break; + cap_type.type = vgpu->region[i].type; + cap_type.subtype = vgpu->region[i].subtype; - case VFIO_PCI_ROM_REGION_INDEX: - case VFIO_PCI_VGA_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + } + } - gvt_dbg_core("get region info index:%d\n", info.index); + if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = vfio_info_add_capability( + &caps, &sparse->header, + struct_size(sparse, areas, sparse->nr_areas)); + if (ret) { + kfree(sparse); + return ret; + } break; default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; - - if (info.index >= VFIO_PCI_NUM_REGIONS + - vgpu->num_regions) - return -EINVAL; - info.index = - array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vgpu->num_regions); - - i = info.index - VFIO_PCI_NUM_REGIONS; - - info.offset = 
- VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->region[i].size; - info.flags = vgpu->region[i].flags; - - cap_type.type = vgpu->region[i].type; - cap_type.subtype = vgpu->region[i].subtype; - - ret = vfio_info_add_capability(&caps, - &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - } + kfree(sparse); + return -EINVAL; } + } - if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { - switch (cap_type_id) { - case VFIO_REGION_INFO_CAP_SPARSE_MMAP: - ret = vfio_info_add_capability(&caps, - &sparse->header, - struct_size(sparse, areas, - sparse->nr_areas)); - if (ret) { - kfree(sparse); - return ret; - } - break; - default: + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); kfree(sparse); - return -EINVAL; + return -EFAULT; } + info.cap_offset = sizeof(info); } - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - kfree(sparse); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } + kfree(caps.buf); + } - kfree(caps.buf); - } + kfree(sparse); + return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; +} + +static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, + unsigned long arg) +{ + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + unsigned long minsz; + + gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd); + + if (cmd == VFIO_DEVICE_GET_INFO) { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_PCI; + info.flags |= VFIO_DEVICE_FLAGS_RESET; + info.num_regions = VFIO_PCI_NUM_REGIONS + + vgpu->num_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS; - kfree(sparse); return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -1475,6 +1473,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .write = intel_vgpu_write, .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, + .get_region_info = intel_vgpu_ioctl_get_region_info, .dma_unmap = intel_vgpu_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From e7da10685f7ff06501fbc2d1ff50094a5638c76d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:30 -0400 Subject: [PATCH 17/51] vfio: Require drivers to implement get_region_info Remove the fallback through the ioctl callback, no drivers use this now. 
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/14-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit f97859503859fd217d27fc4496f294955e197eaf) Signed-off-by: Jiandi An --- drivers/vfio/vfio_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9a9decea5c992..42c082a6af57f 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1296,13 +1296,13 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (!device->ops->get_region_info) - goto ioctl_fallback; - ret = device->ops->get_region_info(device, uptr); + if (unlikely(!device->ops->get_region_info)) + ret = -EINVAL; + else + ret = device->ops->get_region_info(device, uptr); break; default: -ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else From 7dd77b841190ab30eee55b5b4f4bdb881c1dffc9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:31 -0400 Subject: [PATCH 18/51] vfio: Add get_region_info_caps op This op does the copy to/from user for the info and can return back a cap chain through a vfio_info_cap * result. 
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd) Signed-off-by: Jiandi An --- drivers/vfio/vfio_main.c | 56 +++++++++++++++++++++++++++++++++++++--- include/linux/vfio.h | 4 +++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 42c082a6af57f..5aa229a3dee5d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1258,6 +1258,57 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + if (device->ops->get_region_info_caps) { + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, + caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)) { + ret = -EFAULT; + goto out_free; + } + } else if (device->ops->get_region_info) { + ret = device->ops->get_region_info(device, arg); + if (ret) + return ret; + } else { + return -EINVAL; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -1296,10 +1347,7 @@ 
static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (unlikely(!device->ops->get_region_info)) - ret = -EINVAL; - else - ret = device->ops->get_region_info(device, uptr); + ret = vfio_get_region_info(device, uptr); break; default: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be5fcf8432e8d..6311ddc837701 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -21,6 +21,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; struct iommufd_access; +struct vfio_info_cap; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -134,6 +135,9 @@ struct vfio_device_ops { unsigned long arg); int (*get_region_info)(struct vfio_device *vdev, struct vfio_region_info __user *arg); + int (*get_region_info_caps)(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); From 29e1217fd909584af23617e290e92788c0383743 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:32 -0400 Subject: [PATCH 19/51] vfio/mbochs: Convert mbochs to use vfio_info_add_capability() This driver open codes the cap chain manipulations. Instead use vfio_info_add_capability() and the get_region_info_caps() op. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/16-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 45f9fa18109d9dece3604bbbc810d526e424b536) Signed-off-by: Jiandi An --- samples/vfio-mdev/mbochs.c | 75 ++++++++++++-------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 7f889b31fa2ce..64ea19253ee3a 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -143,11 +143,6 @@ static struct mdev_parent mbochs_parent; static atomic_t mbochs_avail_mbytes; static const struct vfio_device_ops mbochs_dev_ops; -struct vfio_region_info_ext { - struct vfio_region_info base; - struct vfio_region_info_cap_type type; -}; - struct mbochs_mode { u32 drm_format; u32 bytepp; @@ -1033,10 +1028,12 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) return 0; } -static int mbochs_get_region_info(struct mdev_state *mdev_state, - struct vfio_region_info_ext *ext) +static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { - struct vfio_region_info *region_info = &ext->base; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (region_info->index >= MBOCHS_NUM_REGIONS) return -EINVAL; @@ -1061,20 +1058,23 @@ static int mbochs_get_region_info(struct mdev_state *mdev_state, region_info->flags = (VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE); break; - case MBOCHS_EDID_REGION_INDEX: - ext->base.argsz = sizeof(*ext); - ext->base.offset = MBOCHS_EDID_OFFSET; - ext->base.size = MBOCHS_EDID_SIZE; - ext->base.flags = (VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_CAPS); - ext->base.cap_offset = offsetof(typeof(*ext), type); - ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE; - ext->type.header.version = 1; - 
ext->type.header.next = 0; - ext->type.type = VFIO_REGION_TYPE_GFX; - ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID; - break; + case MBOCHS_EDID_REGION_INDEX: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1, + .type = VFIO_REGION_TYPE_GFX, + .subtype = VFIO_REGION_SUBTYPE_GFX_EDID, + }; + + region_info->offset = MBOCHS_EDID_OFFSET; + region_info->size = MBOCHS_EDID_SIZE; + region_info->flags = (VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_CAPS); + + return vfio_info_add_capability(caps, &cap_type.header, + sizeof(cap_type)); + } default: region_info->size = 0; region_info->offset = 0; @@ -1185,35 +1185,6 @@ static int mbochs_get_gfx_dmabuf(struct mdev_state *mdev_state, u32 id) return dma_buf_fd(dmabuf->buf, 0); } -static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct mdev_state *mdev_state = - container_of(vdev, struct mdev_state, vdev); - struct vfio_region_info_ext info; - unsigned long minsz, outsz; - int ret; - - minsz = offsetofend(typeof(info), base.offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - outsz = info.base.argsz; - if (outsz < minsz) - return -EINVAL; - if (outsz > sizeof(info)) - return -EINVAL; - - ret = mbochs_get_region_info(mdev_state, &info); - if (ret) - return ret; - - if (copy_to_user(arg, &info, outsz)) - return -EFAULT; - return 0; -} - static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -1381,7 +1352,7 @@ static const struct vfio_device_ops mbochs_dev_ops = { .read = mbochs_read, .write = mbochs_write, .ioctl = mbochs_ioctl, - .get_region_info = mbochs_ioctl_get_region_info, + .get_region_info_caps = mbochs_ioctl_get_region_info, .mmap = mbochs_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From 0282af066b106fa7c63bc0d9181b8404da6dfd85 Mon Sep 17 
00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:33 -0400 Subject: [PATCH 20/51] vfio/gvt: Convert to get_region_info_caps Remove the duplicate code and change info to a pointer. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/17-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 93165757c023940dc5e32a3b64a750fb6767bd8a) Signed-off-by: Jiandi An --- drivers/gpu/drm/i915/gvt/kvmgt.c | 113 ++++++++++++------------------- 1 file changed, 42 insertions(+), 71 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 1feb2a28ca5fd..96d23717684f7 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1141,56 +1141,46 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags, } static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); - struct vfio_region_info info; - unsigned long minsz; int nr_areas = 1; int cap_type_id; unsigned int i; int ret; - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->gvt->device_info.cfg_space_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->gvt->device_info.cfg_space_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case 
VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->cfg_space.bar[info.index].size; - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->cfg_space.bar[info->index].size; + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR1_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; break; case VFIO_PCI_BAR2_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_CAPS | - VFIO_REGION_INFO_FLAG_MMAP | - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - info.size = gvt_aperture_sz(vgpu->gvt); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info->size = gvt_aperture_sz(vgpu->gvt); sparse = kzalloc(struct_size(sparse, areas, nr_areas), GFP_KERNEL); @@ -1207,20 +1197,20 @@ static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, break; case VFIO_PCI_BAR3_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; - gvt_dbg_core("get region info bar:%d\n", info.index); + gvt_dbg_core("get region info bar:%d\n", info->index); break; case VFIO_PCI_ROM_REGION_INDEX: case VFIO_PCI_VGA_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; - gvt_dbg_core("get region info index:%d\n", info.index); + gvt_dbg_core("get region info index:%d\n", info->index); break; default: { struct vfio_region_info_cap_type cap_type = { @@ -1228,32 +1218,32 @@ static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->region[i].size; - info.flags = vgpu->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->region[i].size; + info->flags = vgpu->region[i].flags; cap_type.type = vgpu->region[i].type; cap_type.subtype = vgpu->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; } } - if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { switch (cap_type_id) { case VFIO_REGION_INFO_CAP_SPARSE_MMAP: ret = 
vfio_info_add_capability( - &caps, &sparse->header, + caps, &sparse->header, struct_size(sparse, areas, sparse->nr_areas)); if (ret) { kfree(sparse); @@ -1266,27 +1256,8 @@ static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, } } - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + sizeof(info), - caps.buf, caps.size)) { - kfree(caps.buf); - kfree(sparse); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - - kfree(caps.buf); - } - kfree(sparse); - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, @@ -1473,7 +1444,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .write = intel_vgpu_write, .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, - .get_region_info = intel_vgpu_ioctl_get_region_info, + .get_region_info_caps = intel_vgpu_ioctl_get_region_info, .dma_unmap = intel_vgpu_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, From bc1c993e783d2b5c8c135ffdfee0ea191370c8d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:34 -0400 Subject: [PATCH 21/51] vfio/ccw: Convert to get_region_info_caps Remove the duplicate code and flatten the call chain. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/18-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 973af0c40eaf5cf201d22f1cf6b50587d0138703) Signed-off-by: Jiandi An --- drivers/s390/cio/vfio_ccw_ops.c | 55 +++++---------------------------- 1 file changed, 7 insertions(+), 48 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 6d46e0bc76df1..a596f6013019c 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -313,10 +313,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private, return 0; } -static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, - struct vfio_region_info *info, - unsigned long arg) +static int vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); int i; switch (info->index) { @@ -328,7 +330,6 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, return 0; default: /* all other regions are handled via capability chain */ { - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_type cap_type = { .header.id = VFIO_REGION_INFO_CAP_TYPE, .header.version = 1 }; @@ -351,27 +352,10 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, cap_type.type = private->region[i].type; cap_type.subtype = private->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; - - info->flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info->argsz < sizeof(*info) + caps.size) { - info->argsz = sizeof(*info) + caps.size; - info->cap_offset = 0; - } else { - 
vfio_info_cap_shift(&caps, sizeof(*info)); - if (copy_to_user((void __user *)arg + sizeof(*info), - caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info->cap_offset = sizeof(*info); - } - - kfree(caps.buf); - } } return 0; @@ -504,31 +488,6 @@ void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private) private->region = NULL; } -static int -vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct vfio_ccw_private *private = - container_of(vdev, struct vfio_ccw_private, vdev); - struct vfio_region_info info; - unsigned long minsz; - int ret; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = vfio_ccw_mdev_get_region_info(private, &info, arg); - if (ret) - return ret; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; -} - static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) @@ -634,7 +593,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .read = vfio_ccw_mdev_read, .write = vfio_ccw_mdev_write, .ioctl = vfio_ccw_mdev_ioctl, - .get_region_info = vfio_ccw_mdev_ioctl_get_region_info, + .get_region_info_caps = vfio_ccw_mdev_ioctl_get_region_info, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, From 2bf5a2cbb154e6f1d354811acfcd213f6dce38fa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:35 -0400 Subject: [PATCH 22/51] vfio/pci: Convert all PCI drivers to get_region_info_caps Since the core function signature changes it has to flow up to all drivers. 
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Brett Creeley Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1) Signed-off-by: Jiandi An --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 30 ++--- drivers/vfio/pci/mlx5/main.c | 2 +- drivers/vfio/pci/nvgrace-gpu/main.c | 51 ++------- drivers/vfio/pci/pds/vfio_dev.c | 2 +- drivers/vfio/pci/qat/main.c | 2 +- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 103 +++++++----------- drivers/vfio/pci/virtio/common.h | 3 +- drivers/vfio/pci/virtio/legacy_io.c | 26 ++--- drivers/vfio/pci/virtio/main.c | 6 +- include/linux/vfio_pci_core.h | 3 +- 11 files changed, 80 insertions(+), 150 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 6565b6d57fece..6f880c247d61c 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1386,32 +1386,22 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, } static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - if (info.index != VFIO_PCI_BAR2_REGION_INDEX) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + info->size = 
hisi_acc_get_resource_len(vdev, info->index); - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - - info.size = hisi_acc_get_resource_len(vdev, info.index); - - info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1610,7 +1600,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = hisi_acc_vfio_ioctl_get_region, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, @@ -1630,7 +1620,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b7f941f8047ea..9c5970411d07a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,7 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c 
b/drivers/vfio/pci/nvgrace-gpu/main.c index 77efa99597cfb..69dd32632ee6f 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -489,34 +489,25 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static int -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -535,40 +526,22 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. 
*/ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, @@ -998,7 +971,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, - .get_region_info = nvgrace_gpu_ioctl_get_region_info, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -1019,7 +992,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index 1946bc75d99b4..be103c74e9695 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,7 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = 
vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 8452d9c1d11d3..8fbdf7c6d666e 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,7 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d9122efc10ba..a3e49d42c771b 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,7 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index fe031c53cc25d..59866f5436b46 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1025,43 +1025,35 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, } int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = 
vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1073,9 +1065,9 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1085,16 +1077,17 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = 
pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. */ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1103,10 +1096,10 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1115,52 +1108,34 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, 
&cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index a10f2d92cb623..cb3d5e57d3a3e 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -110,7 +110,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index d735d5c4bd777..1ed349a556291 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,29 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct 
virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_region_info info = {}; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_ioctl_get_region_info(core_vdev, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d68096bc52521..d2e5cbca13c85 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,7 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -110,7 +110,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = virtiovf_pci_ioctl_get_region_info, + .get_region_info_caps = 
virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -132,7 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 00a007eedbe85..2a9c289ef5f29 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -129,7 +129,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, From c0ad388ba74196510f22ab75ddcd0d0e26239751 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:36 -0400 Subject: [PATCH 23/51] vfio/platform: Convert to get_region_info_caps Remove the duplicate code and change info to a pointer. caps are not used. 
Reviewed-by: Kevin Tian Reviewed-by: Mostafa Saleh Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 182c62861ba5a2301c7178484d2f424aaae4283b) Signed-off-by: Jiandi An --- drivers/vfio/platform/vfio_amba.c | 2 +- drivers/vfio/platform/vfio_platform.c | 2 +- drivers/vfio/platform/vfio_platform_common.c | 24 ++++++------------- drivers/vfio/platform/vfio_platform_private.h | 3 ++- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index a234ac292be3b..a28be7d11ce03 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -113,7 +113,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, - .get_region_info = vfio_platform_ioctl_get_region_info, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 0e85c914b6510..a4d3ace3e02dd 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,7 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, - .get_region_info = vfio_platform_ioctl_get_region_info, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3ebd50fb78fbb..c2990b7e900fa 100644 --- 
a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -273,30 +273,20 @@ int vfio_platform_open_device(struct vfio_device *core_vdev) EXPORT_SYMBOL_GPL(vfio_platform_open_device); int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) + if (info->index >= vdev->num_regions) return -EINVAL; /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; + info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index a6008320e77ba..05084212a76eb 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -86,7 +86,8 @@ void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); From 21085759fbcd63b0cc97bd1c3b6301d037d6fa41 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:37 -0400 Subject: [PATCH 24/51] vfio: Move the remaining drivers to get_region_info_caps Remove the duplicate code and change info to a pointer. caps are not used. 
Reviewed-by: Kevin Tian Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/21-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit dc10734610e29bb6464498e933fc0d622227153d) Signed-off-by: Jiandi An --- drivers/vfio/cdx/main.c | 24 +++++++------------ drivers/vfio/fsl-mc/vfio_fsl_mc.c | 29 +++++++---------------- samples/vfio-mdev/mdpy.c | 39 ++++++------------------------- samples/vfio-mdev/mtty.c | 38 +++++------------------------- 4 files changed, 29 insertions(+), 101 deletions(-) diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 506d849139d35..253031b86b60a 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -130,29 +130,21 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, } static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_cdx_device *vdev = container_of(core_vdev, struct vfio_cdx_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); - struct vfio_region_info info; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= cdx_dev->res_count) + if (info->index >= cdx_dev->res_count) return -EINVAL; /* map offset to the physical address */ - info.offset = vfio_cdx_index_to_offset(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; + info->offset = vfio_cdx_index_to_offset(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, @@ -284,7 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, - .get_region_info = vfio_cdx_ioctl_get_region_info, + .get_region_info_caps = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 718c938f63a02..3985613e6830b 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,34 +117,21 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } -static int -vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_fsl_mc_device *vdev = container_of(core_vdev, struct vfio_fsl_mc_device, vdev); struct fsl_mc_device *mc_dev = vdev->mc_dev; - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) + if (info->index >= mc_dev->obj_desc.region_count) return -EINVAL; /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user(arg, &info, minsz)) - return -EFAULT; + info->offset = 
VFIO_FSL_MC_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; return 0; } @@ -594,7 +581,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, - .get_region_info = vfio_fsl_mc_ioctl_get_region_info, + .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 0c65ed2217386..0759bd68edca0 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -435,10 +435,13 @@ static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma) return remap_vmalloc_range(vma, mdev_state->memblk, 0); } -static int mdpy_get_region_info(struct mdev_state *mdev_state, - struct vfio_region_info *region_info, - u16 *cap_type_id, void **cap_type) +static int mdpy_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + if (region_info->index >= VFIO_PCI_NUM_REGIONS && region_info->index != MDPY_DISPLAY_REGION) return -EINVAL; @@ -512,34 +515,6 @@ static int mdpy_query_gfx_plane(struct mdev_state *mdev_state, return 0; } -static int mdpy_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct mdev_state *mdev_state = - container_of(vdev, struct mdev_state, vdev); - struct vfio_region_info info; - void *cap_type = NULL; - u16 cap_type_id = 0; - unsigned long minsz; - int ret; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); - if (ret) - return ret; - - if 
(copy_to_user(arg, &info, minsz)) - return -EFAULT; - return 0; -} - static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { @@ -669,7 +644,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { .read = mdpy_read, .write = mdpy_write, .ioctl = mdpy_ioctl, - .get_region_info = mdpy_ioctl_get_region_info, + .get_region_info_caps = mdpy_ioctl_get_region_info, .mmap = mdpy_mmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index b27f9b93471bc..3e029d0cba1ea 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1717,10 +1717,12 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, return ret; } -static int mtty_get_region_info(struct mdev_state *mdev_state, - struct vfio_region_info *region_info, - u16 *cap_type_id, void **cap_type) +static int mtty_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); unsigned int size = 0; u32 bar_index; @@ -1785,34 +1787,6 @@ static int mtty_get_device_info(struct vfio_device_info *dev_info) return 0; } -static int mtty_ioctl_get_region_info(struct vfio_device *vdev, - struct vfio_region_info __user *arg) -{ - struct mdev_state *mdev_state = - container_of(vdev, struct mdev_state, vdev); - struct vfio_region_info info; - void *cap_type = NULL; - u16 cap_type_id = 0; - unsigned long minsz; - int ret; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = mtty_get_region_info(mdev_state, &info, &cap_type_id, &cap_type); - if (ret) - return ret; - - if (copy_to_user(arg, &info, minsz)) - return -EFAULT; - return 0; -} - static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd, 
unsigned long arg) { @@ -1953,7 +1927,7 @@ static const struct vfio_device_ops mtty_dev_ops = { .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, - .get_region_info = mtty_ioctl_get_region_info, + .get_region_info_caps = mtty_ioctl_get_region_info, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, From 54d50bbc611193284cb71592e7562e02890927d5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:38 -0400 Subject: [PATCH 25/51] vfio: Remove the get_region_info op No driver uses it now, all are using get_region_info_caps(). Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc) Signed-off-by: Jiandi An --- drivers/vfio/vfio_main.c | 50 ++++++++++++++++++---------------------- include/linux/vfio.h | 2 -- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 5aa229a3dee5d..f5c8939905252 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1266,42 +1266,36 @@ static long vfio_get_region_info(struct vfio_device *device, struct vfio_info_cap caps = {}; int ret; + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (device->ops->get_region_info_caps) { - ret = device->ops->get_region_info_caps(device, &info, &caps); - if (ret) - goto out_free; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, - caps.size)) { - ret = -EFAULT; - goto out_free; - 
} - info.cap_offset = sizeof(info); + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } - if (copy_to_user(arg, &info, minsz)) { - ret = -EFAULT; - goto out_free; - } - } else if (device->ops->get_region_info) { - ret = device->ops->get_region_info(device, arg); - if (ret) - return ret; - } else { - return -EINVAL; + if (copy_to_user(arg, &info, minsz)) { + ret = -EFAULT; + goto out_free; } out_free: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6311ddc837701..8e1ddb48b9b54 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -133,8 +133,6 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*get_region_info)(struct vfio_device *vdev, - struct vfio_region_info __user *arg); int (*get_region_info_caps)(struct vfio_device *vdev, struct vfio_region_info *info, struct vfio_info_cap *caps); From fd317b86093e9aa9e3793248f6c51d227e536d34 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:08:58 +0530 Subject: [PATCH 26/51] NVIDIA: VR: SAUCE: cxl: Add cxl_get_hdm_info() for HDM decoder metadata cxl_probe_component_regs() finds the HDM decoder block during device probe and caches its location, but does not record the decoder count and does not expose the result outside drivers/cxl/. vfio-cxl needs the decoder count and the byte offset and size of the HDM block without re-running the probe sequence. 
Record decoder_cnt in rmap->count when parsing the HDM capability in cxl_probe_component_regs(), extend struct cxl_reg_map with a count member, and add cxl_get_hdm_info() to return offset, size, and count from the cached map. Export under the CXL namespace; stub to -EOPNOTSUPP when CONFIG_CXL_BUS is off. Co-developed-by: Zhi Wang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 29 +++++++++++++++++++++++++++++ drivers/cxl/core/regs.c | 1 + include/cxl/cxl.h | 16 ++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 497d99b8908d0..ed60951c80887 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -454,6 +454,35 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, } EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, "CXL"); +/** + * cxl_get_hdm_info - Get HDM decoder register block location and count + * @cxlds: CXL device state (must have component regs enumerated via + * cxl_probe_component_regs()) + * @count: number of HDM decoders in the block (from HDM Capability bits [3:0]) + * @offset: byte offset of HDM decoder block within the component register BAR + * @size: size in bytes of the HDM decoder block + * + * Return: 0 on success. -ENODEV if the HDM decoder block is not present. 
+ */ +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ + struct cxl_reg_map *hdm = &cxlds->reg_map.component_map.hdm_decoder; + + if (WARN_ON(!count || !offset || !size)) + return -EINVAL; + + if (!hdm->valid) + return -ENODEV; + + *count = hdm->count; + *offset = hdm->offset; + *size = hdm->size; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_hdm_info, "CXL"); + #define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff #define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 #define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 20c2d9fbcfe7d..e828df0629d02 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -85,6 +85,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, decoder_cnt = cxl_hdm_decoder_count(hdr); length = 0x20 * decoder_cnt + 0x10; rmap = &map->hdm_decoder; + rmap->count = decoder_cnt; break; } case CXL_CM_CAP_CAP_ID_RAS: diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 7d0b09ff57681..4910fe52b4c97 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -134,6 +134,7 @@ struct cxl_reg_map { int id; unsigned long offset; unsigned long size; + u8 count; }; struct cxl_component_reg_map { @@ -338,4 +339,19 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, struct cxl_endpoint_decoder **cxled, int ways); + +#ifdef CONFIG_CXL_BUS + +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size); + +#else + +static inline +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ return -EOPNOTSUPP; } + +#endif /* CONFIG_CXL_BUS */ + #endif /* __CXL_CXL_H__ */ From e02c1b7ac02a72d006cdbb3cac4d38b76f291f64 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:08:59 +0530 Subject: [PATCH 27/51] NVIDIA: VR: 
SAUCE: cxl: Declare cxl_find_regblock and cxl_probe_component_regs in public header vfio-cxl lives outside drivers/cxl/ but still needs to locate the component register block and fill cxl_component_reg_map. Those prototypes were stuck in the internal drivers/cxl/cxl.h. Move the declarations to include/cxl/cxl.h next to the other vfio-facing hooks, with stubs when CXL bus support is disabled. Drop the duplicate prototypes from the private header. Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Move cxl_probe_component_regs() to include/cxl/pci.h instead of include/cxl/cxl.h to align with existing Srirangan/Alejandro convention; skip cxl_find_regblock() move as it is already in include/cxl/pci.h; add struct cxl_component_reg_map forward declaration] Signed-off-by: Jiandi An --- drivers/cxl/cxl.h | 2 -- include/cxl/pci.h | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index f84910ba7fa2b..772cea6932109 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -145,8 +145,6 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48) #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 -void cxl_probe_component_regs(struct device *dev, void __iomem *base, - struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, struct cxl_device_reg_map *map); int cxl_map_device_regs(const struct cxl_register_map *map, diff --git a/include/cxl/pci.h b/include/cxl/pci.h index edbf980c283f1..0b92aedbfbff9 100644 --- a/include/cxl/pci.h +++ b/include/cxl/pci.h @@ -16,10 +16,13 @@ enum cxl_regloc_type { struct pci_dev; struct cxl_register_map; +struct cxl_component_reg_map; int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct 
cxl_register_map *map); +void cxl_probe_component_regs(struct device *dev, void __iomem *base, + struct cxl_component_reg_map *map); int cxl_setup_regs(struct cxl_register_map *map); #endif From 199d5d2f2ca408f74664c35f23283842b787fb68 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:00 +0530 Subject: [PATCH 28/51] NVIDIA: VR: SAUCE: cxl: Move component/HDM register defines to uapi/cxl/cxl_regs.h VFIO and other code outside the CXL core needs the same offset/mask constants the core uses for the component register block and HDM decoders. Pull them into a new include/uapi/cxl/cxl_regs.h (GPL-2.0 WITH Linux-syscall-note) and include it from include/cxl/cxl.h. Use the uapi-friendly __GENMASK helpers where needed. Section comments in the new file reference CXL spec r4.0 numbering. For UAPI change, replaced the SZ_64K with actual size as the macro will not be available for userspace programs. Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Remove defines from include/cxl/cxl.h instead of drivers/cxl/cxl.h as they were already moved there by Srirangan's SAUCE commit] Signed-off-by: Jiandi An --- include/cxl/cxl.h | 43 +--------------------------- include/uapi/cxl/cxl_regs.h | 57 +++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 42 deletions(-) create mode 100644 include/uapi/cxl/cxl_regs.h diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 4910fe52b4c97..915f99b56bd08 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * enum cxl_devtype - delineate type-2 from a generic type-3 device @@ -76,48 +77,6 @@ struct cxl_regs { #define CXL_CM_CAP_CAP_ID_HDM 0x5 #define CXL_CM_CAP_CAP_HDM_VERSION 1 -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */ -#define CXL_CM_OFFSET 
0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) -#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - /* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ #define CXL_DECODER_MIN_GRANULARITY 256 #define CXL_DECODER_MAX_ENCODED_IG 6 diff --git 
a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h new file mode 100644 index 0000000000000..1a48a3805f52d --- /dev/null +++ b/include/uapi/cxl/cxl_regs.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * CXL Standard defines + * + * Hardware register offsets and bit-field masks for the CXL Component + * Register block, as defined by the CXL Specification r4.0. + */ + +#ifndef _UAPI_CXL_REGS_H_ +#define _UAPI_CXL_REGS_H_ + +#include /* _BITUL(), _BITULL() */ +#include /* __GENMASK() */ + +/* CXL 4.0 8.2.3 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE 0x00010000 + +/* CXL 4.0 8.2.4 CXL.cache and CXL.mem Registers*/ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK __GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK __GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK __GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK __GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK __GENMASK(31, 20) + +/* HDM decoders CXL 4.0 8.2.4.20 CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK __GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 _BITUL(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 _BITUL(9) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY _BITUL(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY _BITUL(12) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE _BITUL(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) 
+#define CXL_HDM_DECODER0_CTRL_IG_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK __GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK _BITUL(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT _BITUL(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED _BITUL(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR _BITUL(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY _BITUL(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +#endif /* _UAPI_CXL_REGS_H_ */ From d0fde9879972b3c304eb78ebc284da575ce2b549 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:01 +0530 Subject: [PATCH 29/51] NVIDIA: VR: SAUCE: cxl: Split cxl_await_range_active() from media-ready wait Before accessing CXL device memory after reset/power-on, the driver must ensure media is ready. Not every CXL device implements the CXL Memory Device register group (many Type-2 devices do not). cxl_await_media_ready() reads cxlds->regs.memdev. Access to the memory device registers on a Type-2 device may result in kernel panic. Split the HDM DVSEC range-active poll out of cxl_await_media_ready() into a new function, cxl_await_range_active(). Type-2 devices often lack the CXLMDEV status register, so they need the range check without the memdev read. cxl_await_media_ready() now calls cxl_await_range_active() for the DVSEC poll, then reads the memory device status as before. 
Co-developed-by: Zhi Wang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap Reviewed-by: Dave Jiang (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Add cxl_await_range_active() declaration to include/cxl/pci.h unconditionally instead of include/cxl/cxl.h with CONFIG_CXL_BUS guards, consistent with existing convention] Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 35 ++++++++++++++++++++++++++++++----- include/cxl/pci.h | 2 ++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index ed60951c80887..873d86341b54c 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -147,16 +147,24 @@ static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) return 0; } -/* - * Wait up to @media_ready_timeout for the device to report memory - * active. +/** + * cxl_await_range_active - Wait for all HDM DVSEC memory ranges to be active + * @cxlds: CXL device state (DVSEC and HDM count must be valid) + * + * For each HDM decoder range reported in the CXL DVSEC capability, waits for + * the range to report MEM INFO VALID (up to 1s per range), then MEM ACTIVE + * (up to media_ready_timeout seconds per range, default 60s). Used by + * cxl_await_media_ready() and by callers that only need range readiness + * without checking the memory device status register. + * + * Return: 0 if all ranges become valid and active, -ETIMEDOUT if a timeout + * occurs, or a negative errno from config read on failure. 
*/ -int cxl_await_media_ready(struct cxl_dev_state *cxlds) +int cxl_await_range_active(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); int d = cxlds->cxl_dvsec; int rc, i, hdm_count; - u64 md_status; u16 cap; rc = pci_read_config_word(pdev, @@ -177,6 +185,23 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) return rc; } + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_await_range_active, "CXL"); + +/* + * Wait up to @media_ready_timeout for the device to report memory + * active. + */ +int cxl_await_media_ready(struct cxl_dev_state *cxlds) +{ + u64 md_status; + int rc; + + rc = cxl_await_range_active(cxlds); + if (rc) + return rc; + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); if (!CXLMDEV_READY(md_status)) return -EIO; diff --git a/include/cxl/pci.h b/include/cxl/pci.h index 0b92aedbfbff9..46fd1612dd2fe 100644 --- a/include/cxl/pci.h +++ b/include/cxl/pci.h @@ -17,6 +17,7 @@ enum cxl_regloc_type { struct pci_dev; struct cxl_register_map; struct cxl_component_reg_map; +struct cxl_dev_state; int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); @@ -24,5 +25,6 @@ int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); +int cxl_await_range_active(struct cxl_dev_state *cxlds); int cxl_setup_regs(struct cxl_register_map *map); #endif From d3141453f48b7ea7189db6c2310a856126f326be Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:02 +0530 Subject: [PATCH 30/51] NVIDIA: VR: SAUCE: cxl: Record BIR and BAR offset in cxl_register_map The Register Locator DVSEC (CXL 4.0 8.1.9) describes register blocks by BAR index (BIR) and offset within the BAR. 
CXL core currently only stores the resolved HPA (resource + offset) in struct cxl_register_map, so callers that need to use pci_iomap() or report the BAR to userspace must reverse-engineer the BAR from the HPA. Add bar_index and bar_offset to struct cxl_register_map and fill them in cxl_decode_regblock() when the regblock is BAR-backed (BIR 0-5). Add cxl_regblock_get_bar_info() so callers (e.g. vfio-cxl) can get BAR index and offset directly and use pci_iomap() instead of ioremap(HPA); it returns -EINVAL if the map is not BAR-backed. Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Add cxl_regblock_get_bar_info() declaration to include/cxl/pci.h unconditionally instead of include/cxl/cxl.h with CONFIG_CXL_BUS guards, consistent with existing convention] Signed-off-by: Jiandi An --- drivers/cxl/core/regs.c | 29 +++++++++++++++++++++++++++ include/cxl/cxl.h | 9 +++++++++ include/cxl/pci.h | 2 ++ 3 files changed, 40 insertions(+) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index e828df0629d02..43661e51230a2 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -288,9 +288,37 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, map->reg_type = reg_type; map->resource = pci_resource_start(pdev, bar) + offset; map->max_size = pci_resource_len(pdev, bar) - offset; + map->bar_index = bar; + map->bar_offset = offset; return true; } +/** + * cxl_regblock_get_bar_info() - Get BAR index and offset for a BAR-backed + * regblock + * @map: Register map from cxl_find_regblock() or cxl_find_regblock_instance() + * @bar_index: Output BAR index (0-5). Optional, may be NULL. + * @bar_offset: Output offset within the BAR. Optional, may be NULL. 
+ * + * When the register block was found via the Register Locator DVSEC and + * lives in a PCI BAR (BIR 0-5), this returns the BAR index and the offset + * within that BAR. + * + * Return: 0 if the regblock is BAR-backed (bar_index <= 5), -EINVAL otherwise. + */ +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, + resource_size_t *bar_offset) +{ + if (!map || map->bar_index == 0xff) + return -EINVAL; + if (bar_index) + *bar_index = map->bar_index; + if (bar_offset) + *bar_offset = map->bar_offset; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_regblock_get_bar_info, "CXL"); + /* * __cxl_find_regblock_instance() - Locate a register block or count instances by type / index * Use CXL_INSTANCES_COUNT for @index if counting instances. @@ -309,6 +337,7 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty *map = (struct cxl_register_map) { .host = &pdev->dev, + .bar_index = 0xFF, .resource = CXL_RESOURCE_NONE, }; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 915f99b56bd08..687329b18ae62 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -118,9 +118,16 @@ struct cxl_pmu_reg_map { * @resource: physical resource base of the register block * @max_size: maximum mapping size to perform register search * @reg_type: see enum cxl_regloc_type + * @bar_index: PCI BAR index (0-5) when regblock is BAR-backed; 0xFF otherwise + * @bar_offset: offset within the BAR; only valid when bar_index <= 5 * @component_map: cxl_reg_map for component registers * @device_map: cxl_reg_maps for device registers * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + * + * When the register block is described by the Register Locator DVSEC with + * a BAR Indicator (BIR 0-5), bar_index and bar_offset are set so callers can + * use pci_iomap(pdev, bar_index, size) and base + bar_offset instead of + * ioremap(resource). 
*/ struct cxl_register_map { struct device *host; @@ -128,6 +135,8 @@ struct cxl_register_map { resource_size_t resource; resource_size_t max_size; u8 reg_type; + u8 bar_index; + resource_size_t bar_offset; union { struct cxl_component_reg_map component_map; struct cxl_device_reg_map device_map; diff --git a/include/cxl/pci.h b/include/cxl/pci.h index 46fd1612dd2fe..e5c018da0e1f5 100644 --- a/include/cxl/pci.h +++ b/include/cxl/pci.h @@ -26,5 +26,7 @@ int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map); int cxl_await_range_active(struct cxl_dev_state *cxlds); +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, + resource_size_t *bar_offset); int cxl_setup_regs(struct cxl_register_map *map); #endif From 05c1da9d786a365424ef305f12fb1de47f7a1da5 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:03 +0530 Subject: [PATCH 31/51] NVIDIA: VR: SAUCE: vfio: UAPI for CXL-capable PCI device assignment Vendor GPUs and accelerators can expose CXL.mem (HDM-D or HDM-DB) without using PCI class code 0x0502. VMMs need a stable way to learn DPA sizing, firmware commit state, and where the extra VFIO regions live. Add VFIO_DEVICE_FLAGS_CXL (bit 9) and VFIO_DEVICE_INFO_CAP_CXL (cap ID 6). The capability struct carries: hdm_regs_bar_index PCI BAR containing the component register block hdm_regs_offset byte offset within that BAR to the CXL.mem area (comp_reg_offset + CXL_CM_OFFSET) dpa_region_index VFIO region index for the DPA window comp_regs_region_index VFIO region index for the emulated COMP_REGS HDM decoder count and the HDM block offset within COMP_REGS are intentionally absent; both are derivable from the CXL Capability Array at COMP_REGS offset 0. Locate cap ID 0x5 (HDM) and read bits[31:20] of its entry for the byte offset. 
Then read bits[3:0] of the HDM Decoder Capability register for the count: count = (field == 0) ? 1 : field * 2. Two flags accompany the capability: VFIO_CXL_CAP_FIRMWARE_COMMITTED A decoder covering @dpa_size bytes was programmed and committed by platform firmware before device open. The VMM can use the DPA region immediately without re-committing. VFIO_CXL_CAP_CACHE_CAPABLE The device is HDM-DB (CXL.mem + CXL.cache). HDM-DB requires a Write-Back Invalidation sequence before FLR to flush dirty cache lines; HDM-D (CXL.mem only) does not. QEMU uses this flag to schedule WBI and to report Back-Invalidation capability accurately in the virtual CXL topology. Mirrors the Cache_Capable bit from the CXL DVSEC Capability register. Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- include/uapi/linux/vfio.h | 86 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009baf..8394efb153e75 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -214,6 +214,16 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ +/* + * Vendor-specific CXL device with CXL.mem capability (HDM-D or HDM-DB + * decoder, PCI class code != PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 + * accelerators and non-class-code Type-3 variants. When set, + * VFIO_DEVICE_FLAGS_PCI is also set (same device is a PCI device). The + * capability chain (VFIO_DEVICE_FLAGS_CAPS) contains VFIO_DEVICE_INFO_CAP_CXL + * describing HDM decoders, region indices, decoder layout, and CXL-specific + * options. 
+ */ +#define VFIO_DEVICE_FLAGS_CXL (1 << 9) /* Device supports CXL */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ @@ -256,6 +266,70 @@ struct vfio_device_info_cap_pci_atomic_comp { __u32 reserved; }; +/* + * VFIO_DEVICE_INFO_CAP_CXL - CXL Type-2 device capability + * + * Present in the device info capability chain when VFIO_DEVICE_FLAGS_CXL + * is set. Describes Host Managed Device Memory (HDM) layout and CXL + * memory options so that userspace (e.g. QEMU) can expose the CXL region + * and component registers correctly to the guest. + * + * The HDM decoder count and HDM decoder block offset within the COMP_REGS + * region are derivable from the COMP_REGS region itself. + * + * To find the HDM decoder block offset (hdm_decoder_offset), traverse the CXL + * Capability Array starting at COMP_REGS region offset 0: + * - Dword 0 bits[31:24] (CXL_CM_CAP_HDR_ARRAY_SIZE_MASK): number of + * capability entries. + * - Each subsequent dword at offset (cap * 4): bits[15:0] = cap ID + * (CXL_CM_CAP_HDR_ID_MASK), bits[31:20] = byte offset from COMP_REGS + * start to that capability's register block (CXL_CM_CAP_PTR_MASK). + * - Locate the entry with cap ID == CXL_CM_CAP_CAP_ID_HDM (0x5); the + * extracted bits[31:20] value is directly the byte offset + * hdm_decoder_offset (no further scaling required). + * + * To find the HDM decoder count, pread the HDM Decoder Capability register + * at hdm_decoder_offset + CXL_HDM_DECODER_CAP_OFFSET within the + * COMP_REGS region; bits[3:0] (CXL_HDM_DECODER_COUNT_MASK) encode the count + * using the formula: count = (field == 0) ? 1 : field * 2. 
+ */ +#define VFIO_DEVICE_INFO_CAP_CXL 6 +struct vfio_device_info_cap_cxl { + struct vfio_info_cap_header header; + __u8 hdm_regs_bar_index; /* PCI BAR containing HDM registers */ + __u8 reserved[3]; + __u32 flags; +/* Decoder was committed by host firmware/BIOS */ +#define VFIO_CXL_CAP_FIRMWARE_COMMITTED (1 << 0) +/* + * Device implements an HDM-DB decoder (CXL.cache + CXL.mem). Reflects + * the Cache_Capable bit (bit 0) in the CXL DVSEC Capability register. + * + * When clear: HDM-D decoder (CXL.mem only, no CXL.cache). FLR does not + * require a Write-Back Invalidation (WBI) sequence; the device holds no + * coherent copies of host memory. + * + * When set: HDM-DB decoder (CXL 3.0+). The kernel driver does not + * perform Write-Back Invalidation (WBI) automatically. The VMM must + * issue a WBI sequence before asserting FLR to flush dirty device cache + * lines and prevent coherency violations, and should advertise + * Back-Invalidation support in the virtual CXL topology. + */ +#define VFIO_CXL_CAP_CACHE_CAPABLE (1 << 1) + /* + * Byte offset within the BAR to the CXL.mem register area start + * (= comp_reg_offset + CXL_CM_OFFSET). This is where the CXL + * Capability Array Header lives. + */ + __u64 hdm_regs_offset; + /* + * Region indices for the two CXL VFIO device regions. + * Avoids forcing userspace to scan all regions by type/subtype. + */ + __u32 dpa_region_index; /* VFIO_REGION_SUBTYPE_CXL */ + __u32 comp_regs_region_index; /* VFIO_REGION_SUBTYPE_CXL_COMP_REGS */ +}; + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) @@ -369,6 +443,18 @@ struct vfio_region_info_cap_type { */ #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) +/* 1e98 vendor PCI sub-types (CXL Consortium) */ +/* + * CXL memory region. Use with region type + * (PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE). 
+ * DPA memory region (fault+zap mmap) + */ +#define VFIO_REGION_SUBTYPE_CXL (1) +/* + * HDM decoder register emulation region (read/write only, no mmap). + */ +#define VFIO_REGION_SUBTYPE_CXL_COMP_REGS (2) + /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) From de3e1a60a99529b88ac26bc826f0110c9076e9e0 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:04 +0530 Subject: [PATCH 32/51] NVIDIA: VR: SAUCE: vfio/pci: Add CXL state to vfio_pci_core_device Add CXL-specific state to vfio_pci_core_device structure to support CXL Type-2 device passthrough. The new vfio_pci_cxl_state structure embeds CXL core objects: - struct cxl_dev_state: CXL device state (from CXL core) - struct cxl_memdev: CXL memory device - struct cxl_region: CXL region object - Root and endpoint decoders Key design point: The CXL state pointer is NULL for non-CXL devices, allowing vfio-pci-core to handle both CXL and standard PCI devices with minimal overhead. This will follow the approach where vfio-pci-core itself gains CXL awareness, rather than requiring a separate variant driver. 
Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Resolve context mismatch in vfio_pci_core.h; add #include <cxl/pci.h> to vfio_cxl_priv.h for cxl_find_regblock/cxl_probe_component_regs declarations] Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_priv.h | 29 ++++++++++++++++++++++++++++ include/linux/vfio_pci_core.h | 2 ++ 2 files changed, 31 insertions(+) create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_priv.h diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h new file mode 100644 index 0000000000000..0ea1d8ddbd492 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common infrastructure for CXL Type-2 device variant drivers + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef __LINUX_VFIO_CXL_PRIV_H +#define __LINUX_VFIO_CXL_PRIV_H + +#include <linux/types.h> +#include <cxl/cxl.h> +#include <cxl/pci.h> + +/* CXL device state embedded in vfio_pci_core_device */ +struct vfio_pci_cxl_state { + struct cxl_dev_state cxlds; + struct cxl_memdev *cxlmd; + struct cxl_root_decoder *cxlrd; + struct cxl_endpoint_decoder *cxled; + resource_size_t hdm_reg_offset; + size_t hdm_reg_size; + resource_size_t comp_reg_offset; + size_t comp_reg_size; + u8 hdm_count; + u8 comp_reg_bar; +}; + +#endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2a9c289ef5f29..2f0fb18ec1ff3 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -27,6 +27,7 @@ struct vfio_pci_core_device; struct vfio_pci_region; +struct vfio_pci_cxl_state; struct vfio_pci_eventfd { struct eventfd_ctx *ctx; @@ -96,6 +97,7 @@ struct vfio_pci_core_device { struct mutex ioeventfds_lock; struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; + struct vfio_pci_cxl_state *cxl; struct list_head sriov_pfs_item; struct 
vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; From cb87876e8e1d732e4aa1c630fdf9f41f8a7794d9 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:05 +0530 Subject: [PATCH 33/51] NVIDIA: VR: SAUCE: vfio/pci: Add CONFIG_VFIO_CXL_CORE and stub CXL hooks Introduce the Kconfig option CONFIG_VFIO_CXL_CORE and the necessary build rules to compile CXL.mem passthrough infrastructure for vendor-specific CXL devices into the vfio-pci-core module. The new option depends on VFIO_PCI_CORE, CXL_BUS and CXL_MEM. Wire up the detection and cleanup entry-point stubs in vfio_pci_core_register_device() and vfio_pci_core_unregister_device() so that subsequent patches can fill in the CXL-specific logic without touching the vfio-pci-core flow again. The vfio_cxl_core.c file added here is an empty skeleton; the actual CXL detection and initialisation code is introduced in the following patch to keep this build-system patch reviewable on its own. Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Resolve context mismatches in Kconfig, Makefile, and vfio_pci_priv.h due to missing upstream xe/dmabuf support in NV-Kernels base] Signed-off-by: Jiandi An --- drivers/vfio/pci/Kconfig | 2 ++ drivers/vfio/pci/Makefile | 1 + drivers/vfio/pci/cxl/Kconfig | 9 ++++++ drivers/vfio/pci/cxl/vfio_cxl_core.c | 41 ++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 4 +++ drivers/vfio/pci/vfio_pci_priv.h | 14 ++++++++++ 6 files changed, 71 insertions(+) create mode 100644 drivers/vfio/pci/cxl/Kconfig create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_core.c diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f546652..878f95b4d7923 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -65,6 +65,8 @@ source "drivers/vfio/pci/virtio/Kconfig" source "drivers/vfio/pci/nvgrace-gpu/Kconfig" +source "drivers/vfio/pci/cxl/Kconfig" + source 
"drivers/vfio/pci/qat/Kconfig" endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c8..ae15a743c73ad 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/Kconfig b/drivers/vfio/pci/cxl/Kconfig new file mode 100644 index 0000000000000..fad53300fecfb --- /dev/null +++ b/drivers/vfio/pci/cxl/Kconfig @@ -0,0 +1,9 @@ +config VFIO_CXL_CORE + bool "VFIO CXL core" + depends on VFIO_PCI_CORE && CXL_BUS && CXL_MEM + help + Extends vfio-pci-core with CXL.mem passthrough for vendor-specific + CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or HDM-DB + decoders without the standard CXL memory expander class code + (PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 accelerators and + non-class-code Type-3 variants (e.g. compressed memory devices). diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c new file mode 100644 index 0000000000000..d12afec82ecdb --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO CXL Core - CXL.mem passthrough for vendor-specific CXL devices + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + * + * This module extends vfio-pci-core to pass through CXL.mem regions for + * vendor-specific CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or + * HDM-DB decoders but do not report the standard CXL memory expander class + * code (PCI_CLASS_MEMORY_CXL, 0x0502). This covers both CXL Type-2 + * accelerators (with CXL.cache) and non-class-code Type-3 variants (e.g. 
+ * compressed memory devices) which cannot be paravirtualized by the host + * CXL subsystem and require direct DPA region access from the guest. + */ + +#include +#include +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +/** + * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific + * CXL.mem device + * @vdev: VFIO PCI device + * + * Called from vfio_pci_core_register_device(). Detects CXL DVSEC capability + * and initializes CXL features. On failure vdev->cxl remains NULL and the + * device operates as a standard PCI device. + */ +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) +{ +} + +void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) +{ +} + +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 59866f5436b46..1f293b3835294 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2168,6 +2168,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (ret) goto out_vf; + vfio_pci_cxl_detect_and_init(vdev); + vfio_pci_probe_power_state(vdev); /* @@ -2211,6 +2213,8 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vf_uninit(vdev); vfio_pci_vga_uninit(vdev); + vfio_pci_cxl_cleanup(vdev); + if (!disable_idle_d3) pm_runtime_get_noresume(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index c3eb839a3c705..69211b7346be2 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -110,4 +110,18 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); +void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); + +#else + +static inline void +vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device 
*vdev) { } +static inline void +vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } + +#endif /* CONFIG_VFIO_CXL_CORE */ + #endif From 84fbfbcead316e4a634190d797d947d63b974dba Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:06 +0530 Subject: [PATCH 34/51] NVIDIA: VR: SAUCE: vfio/cxl: Detect CXL DVSEC and probe HDM block Detect a vendor-specific CXL device at vfio-pci bind time and probe its HDM decoder register block. vfio_cxl_create_device_state() allocates per-device state via devm and reads MEM_CAPABLE and CACHE_CAPABLE from the CXL DVSEC. vfio_cxl_setup_regs() locates the component register block, temporarily maps it, calls cxl_probe_component_regs() to find the HDM block, then releases the mapping. vfio_pci_cxl_detect_and_init() chains these two steps. If either fails, vdev->cxl stays NULL and the device falls back to plain vfio-pci. Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 217 +++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 12 ++ 2 files changed, 229 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index d12afec82ecdb..b1c7603590b54 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -21,6 +21,158 @@ #include "../vfio_pci_priv.h" #include "vfio_cxl_priv.h" +/* + * vfio_cxl_create_device_state - Allocate and validate CXL device state + * + * Returns a pointer to the allocated vfio_pci_cxl_state on success, or + * ERR_PTR on failure. The allocation uses devm; the caller must call + * devm_kfree(&pdev->dev, cxl) on any subsequent setup failure to release + * the resource before device unbind. Using devm_kfree() to undo a devm + * allocation early is explicitly supported by the devres API. 
+ * + * The caller assigns vdev->cxl only after all setup steps succeed, preventing + * partially-initialised state from being visible through vdev->cxl on any + * failure path. + */ +static struct vfio_pci_cxl_state * +vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) +{ + struct vfio_pci_cxl_state *cxl; + u16 cap_word; + u32 hdr1; + + /* Freed automatically when pdev->dev is released. */ + cxl = devm_cxl_dev_state_create(&pdev->dev, + CXL_DEVTYPE_DEVMEM, + pdev->dev.id, dvsec, + struct vfio_pci_cxl_state, + cxlds, false); + if (!cxl) + return ERR_PTR(-ENOMEM); + + pci_read_config_dword(pdev, dvsec + PCI_DVSEC_HEADER1, &hdr1); + cxl->dvsec_len = PCI_DVSEC_HEADER1_LEN(hdr1); + + pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET, + &cap_word); + + /* + * Only handle vendor devices (class != 0x0502) with Mem_Capable set. + * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI + * sequence is needed before FLR. + */ + if (!FIELD_GET(CXL_DVSEC_MEM_CAPABLE, cap_word) || + (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL) { + devm_kfree(&pdev->dev, cxl); + return ERR_PTR(-ENODEV); + } + + cxl->cache_capable = FIELD_GET(CXL_DVSEC_CACHE_CAPABLE, cap_word); + + return cxl; +} + +static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl) +{ + struct cxl_register_map *map = &cxl->cxlds.reg_map; + resource_size_t offset, bar_offset, size; + struct pci_dev *pdev = vdev->pdev; + void __iomem *base; + int ret; + u8 count; + u8 bar; + + if (WARN_ON_ONCE(!pci_is_enabled(pdev))) + return -EINVAL; + + /* Find component register block via Register Locator DVSEC */ + ret = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, map); + if (ret) + return ret; + + /* + * Request the region and map. This is a transient mapping + * used only to probe register capabilities; released immediately + * after cxl_probe_component_regs() returns. 
+ */ + if (!request_mem_region(map->resource, map->max_size, "vfio-cxl-probe")) + return -EBUSY; + + base = ioremap(map->resource, map->max_size); + if (!base) { + ret = -ENOMEM; + goto failed_release; + } + + /* Probe component register capabilities */ + cxl_probe_component_regs(&pdev->dev, base, &map->component_map); + + /* Check if HDM decoder was found */ + if (!map->component_map.hdm_decoder.valid) { + ret = -ENODEV; + goto failed_unmap; + } + + pci_dbg(pdev, "vfio_cxl: HDM decoder at offset=0x%lx, size=0x%lx\n", + map->component_map.hdm_decoder.offset, + map->component_map.hdm_decoder.size); + + /* Get HDM register info */ + ret = cxl_get_hdm_info(&cxl->cxlds, &count, &offset, &size); + if (ret) + goto failed_unmap; + + if (!count || !size) { + ret = -ENODEV; + goto failed_unmap; + } + + cxl->hdm_count = count; + /* + * cxl_get_hdm_info() returns rmap->offset = CXL_CM_OFFSET + + * (see cxl_probe_component_regs() which does base += CXL_CM_OFFSET before + * reading caps and stores CXL_CM_OFFSET + cap_ptr as the offset). + * Subtract CXL_CM_OFFSET so hdm_reg_offset is relative to the CXL.mem + * register area start, which is where comp_reg_virt[0] is anchored. + * The physical BAR address for hdm_iobase is recovered by adding + * CXL_CM_OFFSET back in vfio_cxl_setup_virt_regs(). + */ + cxl->hdm_reg_offset = offset - CXL_CM_OFFSET; + cxl->hdm_reg_size = size; + + ret = cxl_regblock_get_bar_info(map, &bar, &bar_offset); + if (ret) + goto failed_unmap; + + cxl->comp_reg_bar = bar; + cxl->comp_reg_offset = bar_offset; + cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + iounmap(base); + release_mem_region(map->resource, map->max_size); + + return 0; + +failed_unmap: + iounmap(base); +failed_release: + release_mem_region(map->resource, map->max_size); + + return ret; +} + +/* + * Free CXL state early on probe failure. 
devm_kfree() on a live devres + * allocation removes it from the list immediately, so the normal devres + * teardown at unbind time won't double-free it. + */ +static void vfio_cxl_dev_state_free(struct pci_dev *pdev, + struct vfio_pci_cxl_state *cxl) +{ + devm_kfree(&pdev->dev, cxl); +} + /** * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific * CXL.mem device @@ -32,10 +184,75 @@ */ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { + struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_cxl_state *cxl; + u16 dvsec; + int ret; + + if (!pcie_is_cxl(pdev)) + return; + + dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return; + + /* + * CXL DVSEC found: any failure from here is a hard probe error on + * a confirmed CXL-capable device, not a silent non-CXL fallback. + * Warn the operator so misconfiguration is visible. + */ + cxl = vfio_cxl_create_device_state(pdev, dvsec); + if (IS_ERR(cxl)) { + if (PTR_ERR(cxl) != -ENODEV) + pci_warn(pdev, + "vfio-cxl: CXL device state allocation failed: %ld\n", + PTR_ERR(cxl)); + return; + } + + /* + * Required for ioremap of the component register block and + * calls to cxl_probe_component_regs(). + */ + ret = pci_enable_device_mem(pdev); + if (ret) { + pci_warn(pdev, + "vfio-cxl: pci_enable_device_mem failed: %d\n", ret); + goto free_cxl; + } + + ret = vfio_cxl_setup_regs(vdev, cxl); + if (ret) { + pci_warn(pdev, + "vfio-cxl: HDM register probing failed: %d\n", ret); + pci_disable_device(pdev); + goto free_cxl; + } + + pci_disable_device(pdev); + + /* + * Register probing succeeded. Assign vdev->cxl now so that + * all subsequent helpers can access state via vdev->cxl. + * All failure paths below clear vdev->cxl before calling + * vfio_cxl_dev_state_free(). 
+ */ + vdev->cxl = cxl; + + return; + +free_cxl: + vfio_cxl_dev_state_free(pdev, cxl); } void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl) + return; } MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 0ea1d8ddbd492..bb03f9363d98f 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -22,8 +22,20 @@ struct vfio_pci_cxl_state { size_t hdm_reg_size; resource_size_t comp_reg_offset; size_t comp_reg_size; + u16 dvsec_len; u8 hdm_count; u8 comp_reg_bar; + bool cache_capable; }; +/* + * CXL DVSEC for CXL Devices - register offsets within the DVSEC + * (CXL 4.0 8.1.3). + * Offsets are relative to the DVSEC capability base (cxl->dvsec). + */ +#define CXL_DVSEC_CAPABILITY_OFFSET 0xa +#define CXL_DVSEC_MEM_CAPABLE BIT(2) +/* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */ +#define CXL_DVSEC_CACHE_CAPABLE BIT(0) + #endif /* __LINUX_VFIO_CXL_PRIV_H */ From 0fbd7b2effd79b7ff8fa3e739900a99734a71289 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:07 +0530 Subject: [PATCH 35/51] NVIDIA: VR: SAUCE: vfio/pci: Export config access helpers Promote vfio_raw_config_write() and vfio_raw_config_read() to non-static so that the CXL DVSEC write handler in the next patch can call them. 
Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- drivers/vfio/pci/vfio_pci_config.c | 12 ++++++------ drivers/vfio/pci/vfio_pci_priv.h | 8 ++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 4abd4f2719958..41223ce4014b9 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -270,9 +270,9 @@ static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, } /* Raw access skips any kind of virtualization */ -static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 val) +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) { int ret; @@ -283,9 +283,9 @@ static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, return count; } -static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 *val) +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) { int ret; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 69211b7346be2..61e1a2fd5e5b5 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -37,6 +37,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); + +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct 
perm_bits *perm, + int offset, __le32 *val); + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); From ad3979839aba3255b00abdceff1ba21f57e19a00 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:08 +0530 Subject: [PATCH 36/51] NVIDIA: VR: SAUCE: vfio/cxl: Introduce HDM decoder register emulation framework Add HDM decoder register emulation for CXL devices assigned to a guest. New file vfio_cxl_emu.c allocates comp_reg_virt[] covering the full component register block (CXL_COMPONENT_REG_BLOCK_SIZE), snapshots it from MMIO after probe, and registers a VFIO device region (VFIO_REGION_SUBTYPE_CXL_COMP_REGS) with read/write ops but no mmap, so every access hits the emulated buffer and write dispatchers. vfio_cxl_setup_virt_regs() is called from the tail of vfio_cxl_setup_regs(); vfio_cxl_clean_virt_regs() runs on cleanup. HDM decoder register defines come from include/uapi/cxl/cxl_regs.h. Bits with no hardware equivalent stay in vfio_cxl_priv.h. hdm_decoder_n_ctrl_write() allows the guest to clear the LOCK bit. A firmware-committed decoder arrives with LOCK=1; the guest driver must clear it before reprogramming BASE and SIZE with the VM's GPA. Such a write clears the bit in the shadow while preserving all other fields. 
Co-developed-by: Zhi Wang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Resolve Makefile context mismatch due to missing upstream dmabuf support in NV-Kernels base] Signed-off-by: Jiandi An --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/cxl/vfio_cxl_core.c | 5 + drivers/vfio/pci/cxl/vfio_cxl_emu.c | 433 +++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 47 +++ include/uapi/cxl/cxl_regs.h | 5 + 5 files changed, 491 insertions(+), 1 deletion(-) create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_emu.c diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index ae15a743c73ad..0a2f8daafe424 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o -vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index b1c7603590b54..0b9e4419cd475 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -149,8 +149,11 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, cxl->comp_reg_offset = bar_offset; cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE; + ret = vfio_cxl_setup_virt_regs(vdev, cxl, base); iounmap(base); release_mem_region(map->resource, map->max_size); + if (ret) + return ret; return 0; @@ -253,6 +256,8 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) if (!cxl) return; + + vfio_cxl_clean_virt_regs(cxl); } MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c new file 
mode 100644 index 0000000000000..6fb02253e6312 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +/* + * comp_reg_virt[] shadow layout: + * Covers the full CXL.mem register area (starting at CXL_CM_OFFSET + * within the component register block). Index 0 is the CXL Capability + * Array Header; the HDM decoder block starts at index + * hdm_reg_offset / sizeof(__le32). + * + * Register layout within the HDM block (CXL spec 4.0 8.2.4.20 CXL HDM Decoder + * Capability Structure): + * 0x00: HDM Decoder Capability + * 0x04: HDM Decoder Global Control + * 0x08: (reserved) + * 0x0c: (reserved) + * For each decoder N (N=0..hdm_count-1), at base 0x10 + N*0x20: + * +0x00: BASE_LO + * +0x04: BASE_HI + * +0x08: SIZE_LO + * +0x0c: SIZE_HI + * +0x10: CTRL + * +0x14: TARGET_LIST_LO + * +0x18: TARGET_LIST_HI + * +0x1c: (reserved) + */ + +static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) +{ + /* + * hdm_off is a byte offset within the HDM decoder block. + * comp_reg_virt covers the CXL.mem register area starting at + * CXL_CM_OFFSET within the component register block. + * hdm_reg_offset is CXL.mem-relative, so adding hdm_reg_offset + * gives the correct index into comp_reg_virt[]. + */ + return &cxl->comp_reg_virt[(cxl->hdm_reg_offset + hdm_off) / + sizeof(__le32)]; +} + +static ssize_t virt_hdm_rev_reg_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + /* Discard writes on reserved registers. */ + return size; +} + +static ssize_t hdm_decoder_n_lo_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 new_val = le32_to_cpu(*val32); + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* Bits [27:0] are reserved. 
*/ + new_val &= ~CXL_HDM_DECODER_BASE_LO_RESERVED_MASK; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +static ssize_t hdm_decoder_global_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 size) +{ + u32 hdm_gcap; + u32 new_val = le32_to_cpu(*val32); + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* Bit [31:2] are reserved. */ + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK; + + /* Poison On Decode Error Enable (bit 0) is RO=0 if not supported. */ + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + if (!(hdm_gcap & CXL_HDM_DECODER_POISON_ON_DECODE_ERR)) + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT; + + *hdm_reg_ptr(vdev->cxl, CXL_HDM_DECODER_CTRL_OFFSET) = + cpu_to_le32(new_val); + + return size; +} + +/** + * hdm_decoder_n_ctrl_write - Write handler for HDM decoder CTRL register. + * @vdev: VFIO PCI core device + * @val32: New register value supplied by userspace (little-endian) + * @offset: Byte offset within the HDM block for this decoder's CTRL register + * @size: Access size in bytes; must equal CXL_REG_SIZE_DWORD + * + * The COMMIT bit (bit 9) is the key: setting it requests the hardware to + * lock the decoder. The emulated COMMITTED bit (bit 10) mirrors COMMIT + * immediately to allow QEMU's notify_change to detect the transition and + * map/unmap the DPA MemoryRegion in the guest address space. + * + * Note: the actual hardware HDM decoder programming (writing the real + * BASE/SIZE with host physical addresses) happens in the QEMU notify_change + * callback BEFORE this write reaches the hardware. This ordering is + * correct because vfio_region_write() calls notify_change() first. + * + * Return: @size on success, %-EINVAL if @size is not %CXL_REG_SIZE_DWORD. 
+ */ +static ssize_t hdm_decoder_n_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 hdm_gcap; + u32 ro_mask = CXL_HDM_DECODER_CTRL_RO_BITS_MASK; + u32 rev_mask = CXL_HDM_DECODER_CTRL_RESERVED_MASK; + u32 new_val = le32_to_cpu(*val32); + u32 cur_val; + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + cur_val = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, offset)); + if (cur_val & CXL_HDM_DECODER0_CTRL_LOCK) { + if (new_val & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + + /* LOCK_0 only: preserve all other bits, clear LOCK */ + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32( + cur_val & ~CXL_HDM_DECODER0_CTRL_LOCK); + return size; + } + + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + ro_mask |= CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO; + rev_mask |= CXL_HDM_DECODER_CTRL_DEVICE_RESERVED; + + if (!(hdm_gcap & CXL_HDM_DECODER_UIO_CAPABLE)) + rev_mask |= CXL_HDM_DECODER_CTRL_UIO_RESERVED; + + new_val &= ~rev_mask; + cur_val &= ro_mask; + new_val = (new_val & ~ro_mask) | cur_val; + + /* + * Mirror COMMIT to COMMITTED immediately in the emulated state. + */ + if (new_val & CXL_HDM_DECODER0_CTRL_COMMIT) + new_val |= CXL_HDM_DECODER0_CTRL_COMMITTED; + else + new_val &= ~CXL_HDM_DECODER0_CTRL_COMMITTED; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +/* + * Dispatch table for COMP_REGS region writes. Indexed by byte offset within + * the HDM decoder block. Returns the appropriate write handler. 
+ * + * Layout: + * 0x00 HDM Decoder Capability (RO) + * 0x04 HDM Global Control (RW with reserved masking) + * 0x08-0x0f (reserved) (ignored) + * Per decoder N, base = 0x10 + N*0x20: + * base+0x00 BASE_LO (RW, [27:0] reserved) + * base+0x04 BASE_HI (RW) + * base+0x08 SIZE_LO (RW, [27:0] reserved) + * base+0x0c SIZE_HI (RW) + * base+0x10 CTRL (RW, complex rules) + * base+0x14 TARGET_LIST_LO (ignored for Type-2) + * base+0x18 TARGET_LIST_HI (ignored for Type-2) + * base+0x1c (reserved) (ignored) + */ +static ssize_t comp_regs_dispatch_write(struct vfio_pci_core_device *vdev, + u32 off, const __le32 *val32, u32 size) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 dec_base, dec_off; + + /* HDM Decoder Capability (0x00): RO */ + if (off == CXL_HDM_DECODER_CAP_OFFSET) + return size; + + /* HDM Global Control (0x04) */ + if (off == CXL_HDM_DECODER_CTRL_OFFSET) + return hdm_decoder_global_ctrl_write(vdev, val32, size); + + /* + * Offsets 0x08-0x0f are reserved per CXL 4.0 Table 8-115. + * Per-decoder registers start at 0x10, stride 0x20 + */ + if (off < CXL_HDM_DECODER_FIRST_BLOCK_OFFSET) + return size; /* reserved gap */ + + dec_base = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET; + /* + * Reject accesses beyond the last implemented HDM decoder. + * Without this check an out-of-bounds offset would silently + * corrupt comp_reg_virt[] memory past the end of the allocation. 
+ */ + if ((off - dec_base) / CXL_HDM_DECODER_BLOCK_STRIDE >= cxl->hdm_count) + return size; + + dec_off = (off - dec_base) % CXL_HDM_DECODER_BLOCK_STRIDE; + + switch (dec_off) { + case CXL_HDM_DECODER_N_BASE_LOW_OFFSET: /* BASE_LO */ + case CXL_HDM_DECODER_N_SIZE_LOW_OFFSET: /* SIZE_LO */ + return hdm_decoder_n_lo_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_BASE_HIGH_OFFSET: /* BASE_HI */ + case CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET: /* SIZE_HI */ + { + /* Full 32-bit write, no reserved bits; frozen when COMMIT_LOCK set */ + u32 ctrl_off = off - dec_off + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + *hdm_reg_ptr(cxl, off) = *val32; + return size; + } + case CXL_HDM_DECODER_N_CTRL_OFFSET: /* CTRL */ + return hdm_decoder_n_ctrl_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET: + case CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET: + case CXL_HDM_DECODER_N_REV_OFFSET: + return virt_hdm_rev_reg_write(vdev, val32, off, size); + default: + return size; + } +} + +/* + * vfio_cxl_comp_regs_rw - regops rw handler for + * VFIO_REGION_SUBTYPE_CXL_COMP_REGS. + * + * Reads return the emulated HDM state (comp_reg_virt[]). + * Writes go through comp_regs_dispatch_write() for bit-field enforcement. + * Only 4-byte aligned 4-byte accesses are supported (hardware requirement). 
+ */ +static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t done = 0; + + if (!count) + return 0; + + /* Clamp to total region size: cap array prefix + HDM block */ + if (pos >= cxl->hdm_reg_offset + cxl->hdm_reg_size) + return -EINVAL; + count = min(count, + (size_t)(cxl->hdm_reg_offset + cxl->hdm_reg_size - pos)); + + while (done < count) { + u32 sz = count - done; + u32 off = pos + done; + __le32 v; + + /* Enforce exactly 4-byte, 4-byte-aligned accesses */ + if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) + return done ? (ssize_t)done : -EINVAL; + + if (iswrite) { + if (off < cxl->hdm_reg_offset) { + /* Cap array area is read-only; discard writes */ + done += sizeof(v); + continue; + } + if (copy_from_user(&v, buf + done, sizeof(v))) + return done ? (ssize_t)done : -EFAULT; + comp_regs_dispatch_write(vdev, + off - cxl->hdm_reg_offset, + &v, sizeof(v)); + } else { + /* Read from extended buffer _ covers cap array and HDM */ + v = cxl->comp_reg_virt[off / sizeof(__le32)]; + if (copy_to_user(buf + done, &v, sizeof(v))) + return done ? (ssize_t)done : -EFAULT; + } + done += sizeof(v); + } + + *ppos += done; + return done; +} + +static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + /* comp_reg_virt is freed in vfio_cxl_clean_virt_regs() */ +} + +static const struct vfio_pci_regops vfio_cxl_comp_regs_ops = { + .rw = vfio_cxl_comp_regs_rw, + .release = vfio_cxl_comp_regs_release, +}; + +/* + * vfio_cxl_setup_virt_regs - Allocate emulated HDM register state. + * + * Allocates comp_reg_virt as a compact __le32 array covering only + * hdm_reg_size bytes of HDM decoder registers. The initial values + * are read from hardware via the BAR ioremap established by the caller. 
+ * + * DVSEC state is accessed via vdev->vconfig (see the following patch). + */ +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base) +{ + size_t total_size, nregs, i; + + if (WARN_ON(!cxl->hdm_reg_size)) + return -EINVAL; + + total_size = cxl->hdm_reg_offset + cxl->hdm_reg_size; + + if (pci_resource_len(vdev->pdev, cxl->comp_reg_bar) < + cxl->comp_reg_offset + CXL_CM_OFFSET + total_size) + return -ENODEV; + + nregs = total_size / sizeof(__le32); + cxl->comp_reg_virt = kcalloc(nregs, sizeof(__le32), GFP_KERNEL); + if (!cxl->comp_reg_virt) + return -ENOMEM; + + /* + * Snapshot the CXL.mem register area from the caller's mapping. + * cap_base maps the component register block from comp_reg_offset. + * The CXL.mem registers start at CXL_CM_OFFSET (= 0x1000) within that + * block; reading from cap_base + CXL_CM_OFFSET ensures comp_reg_virt[0] + * holds the CXL Capability Array Header required by guest drivers. + */ + for (i = 0; i < nregs; i++) + cxl->comp_reg_virt[i] = + cpu_to_le32(readl(cap_base + CXL_CM_OFFSET + + i * sizeof(__le32))); + + /* + * Establish persistent mapping; kept alive until + * vfio_cxl_clean_virt_regs(). + */ + cxl->hdm_iobase = ioremap(pci_resource_start(vdev->pdev, + cxl->comp_reg_bar) + + cxl->comp_reg_offset + CXL_CM_OFFSET + + cxl->hdm_reg_offset, + cxl->hdm_reg_size); + if (!cxl->hdm_iobase) { + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; + return -ENOMEM; + } + + return 0; +} + +/* + * Called with memory_lock write side held (from vfio_cxl_reactivate_region). + * Uses the pre-established hdm_iobase, no ioremap() under the lock, + * which would deadlock on PREEMPT_RT where ioremap() can sleep. 
+ */ +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl) +{ + size_t i, nregs; + u32 n; + + if (!cxl || !cxl->comp_reg_virt || !cxl->hdm_iobase) + return; + + nregs = cxl->hdm_reg_size / sizeof(__le32); + + for (i = 0; i < nregs; i++) + *hdm_reg_ptr(cxl, i * sizeof(__le32)) = + cpu_to_le32(readl(cxl->hdm_iobase + + i * sizeof(__le32))); + + /* + * For firmware-committed decoders, clear COMMIT_LOCK (bit 8) and zero + * BASE in comp_reg_virt[] so QEMU can write the correct guest GPA via + * setup_locked_hdm() before guest DPA access begins. + * + * Check the COMMITTED bit (bit 10) directly from the freshly-snapshotted + * ctrl register rather than relying on cxl->precommitted. At probe time + * this function is called before cxl->precommitted is set (it is set + * after vfio_cxl_read_committed_decoder_size() succeeds), so using + * cxl->precommitted here would silently skip the LOCK clearing and leave + * the hardware HPA in comp_reg_virt[]. + */ + for (n = 0; n < cxl->hdm_count; n++) { + u32 ctrl_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 base_lo_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_LOW_OFFSET; + u32 base_hi_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_HIGH_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) { + *hdm_reg_ptr(cxl, ctrl_off) = + cpu_to_le32(ctrl & + ~CXL_HDM_DECODER0_CTRL_LOCK); + *hdm_reg_ptr(cxl, base_lo_off) = 0; + *hdm_reg_ptr(cxl, base_hi_off) = 0; + } + } +} + +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl) +{ + if (cxl->hdm_iobase) { + iounmap(cxl->hdm_iobase); + cxl->hdm_iobase = NULL; + } + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; +} diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h 
b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index bb03f9363d98f..242409144ba0b 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -22,12 +22,53 @@ struct vfio_pci_cxl_state { size_t hdm_reg_size; resource_size_t comp_reg_offset; size_t comp_reg_size; + __le32 *comp_reg_virt; + void __iomem *hdm_iobase; u16 dvsec_len; u8 hdm_count; u8 comp_reg_bar; bool cache_capable; }; +/* Register access sizes */ +#define CXL_REG_SIZE_WORD 2 +#define CXL_REG_SIZE_DWORD 4 + +/* HDM Decoder - register offsets (CXL 4.0 Table 8-115) */ +#define CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_FIRST_BLOCK_OFFSET 0x10 +#define CXL_HDM_DECODER_BLOCK_STRIDE 0x20 +#define CXL_HDM_DECODER_N_BASE_LOW_OFFSET 0x0 +#define CXL_HDM_DECODER_N_BASE_HIGH_OFFSET 0x4 +#define CXL_HDM_DECODER_N_SIZE_LOW_OFFSET 0x8 +#define CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET 0xc +#define CXL_HDM_DECODER_N_CTRL_OFFSET 0x10 +#define CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET 0x14 +#define CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET 0x18 +#define CXL_HDM_DECODER_N_REV_OFFSET 0x1c + +/* + * HDM Decoder N Control emulation masks. + * + * Single-bit hardware definitions are in as + * CXL_HDM_DECODER0_CTRL_* (bits 0-14) and CXL_HDM_DECODER_*_CAP. + * The masks below express emulation policy for a CXL.mem device. + */ +#define CXL_HDM_DECODER_CTRL_RO_BITS_MASK (BIT(10) | BIT(11)) +#define CXL_HDM_DECODER_CTRL_RESERVED_MASK (BIT(15) | GENMASK(31, 28)) +#define CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO BIT(12) +#define CXL_HDM_DECODER_CTRL_DEVICE_RESERVED (GENMASK(19, 16) | GENMASK(23, 20)) +#define CXL_HDM_DECODER_CTRL_UIO_RESERVED (BIT(14) | GENMASK(27, 24)) +/* + * bit 13 (BI) is RsvdP for devices without CXL.cache (Cache_Capable=0). + * HDM-D (CXL.mem only) decoders must not have BI set by the guest. 
+ */ +#define CXL_HDM_DECODER_CTRL_BI_RESERVED BIT(13) +#define CXL_HDM_DECODER_BASE_LO_RESERVED_MASK GENMASK(27, 0) + +#define CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK GENMASK(31, 2) +#define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0) + /* * CXL DVSEC for CXL Devices - register offsets within the DVSEC * (CXL 4.0 8.1.3). @@ -38,4 +79,10 @@ struct vfio_pci_cxl_state { /* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */ #define CXL_DVSEC_CACHE_CAPABLE BIT(0) +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base); +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl); +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); + #endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h index 1a48a3805f52d..b6fcae91d216b 100644 --- a/include/uapi/cxl/cxl_regs.h +++ b/include/uapi/cxl/cxl_regs.h @@ -33,8 +33,13 @@ #define CXL_HDM_DECODER_TARGET_COUNT_MASK __GENMASK(7, 4) #define CXL_HDM_DECODER_INTERLEAVE_11_8 _BITUL(8) #define CXL_HDM_DECODER_INTERLEAVE_14_12 _BITUL(9) +#define CXL_HDM_DECODER_POISON_ON_DECODE_ERR _BITUL(10) #define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY _BITUL(11) #define CXL_HDM_DECODER_INTERLEAVE_16_WAY _BITUL(12) +#define CXL_HDM_DECODER_UIO_CAPABLE _BITUL(13) +#define CXL_HDM_DECODER_UIO_COUNT_MASK __GENMASK(19, 16) +#define CXL_HDM_DECODER_MEMDATA_NXM _BITUL(20) +#define CXL_HDM_DECODER_COHERENCY_MODELS_MASK __GENMASK(22, 21) #define CXL_HDM_DECODER_CTRL_OFFSET 0x4 #define CXL_HDM_DECODER_ENABLE _BITUL(1) #define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) From d64c61c2ba91b248c4b980baeb229e66be1b84a1 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:09 +0530 Subject: [PATCH 37/51] NVIDIA: VR: SAUCE: vfio/cxl: Wait for HDM ranges and create memdev After HDM registers are mapped, call cxl_await_range_active() so we only proceed when DVSEC ranges 
report active without touching the memdev register group Type-2 may lack. Re-snapshot component regs (vfio_cxl_reinit_comp_regs) once MEM_ACTIVE so firmware final SIZE_HIGH etc. land in comp_reg_virt. Read committed decoder size from hardware, set capacity via cxl_set_capacity(), and devm_cxl_add_memdev(). Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 56 ++++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_emu.c | 42 +++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 4 ++ 3 files changed, 102 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 0b9e4419cd475..02755265d530b 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -165,6 +165,22 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, return ret; } +static int vfio_cxl_create_memdev(struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + int ret; + + ret = cxl_set_capacity(&cxl->cxlds, capacity); + if (ret) + return ret; + + cxl->cxlmd = devm_cxl_add_memdev(&cxl->cxlds, NULL); + if (IS_ERR(cxl->cxlmd)) + return PTR_ERR(cxl->cxlmd); + + return 0; +} + /* * Free CXL state early on probe failure. 
devm_kfree() on a live devres * allocation removes it from the list immediately, so the normal devres @@ -189,6 +205,7 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_cxl_state *cxl; + resource_size_t capacity = 0; u16 dvsec; int ret; @@ -234,8 +251,44 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) goto free_cxl; } + cxl->cxlds.media_ready = !cxl_await_range_active(&cxl->cxlds); + if (!cxl->cxlds.media_ready) { + pci_warn(pdev, "CXL media not ready\n"); + pci_disable_device(pdev); + goto regs_failed; + } + + /* + * Take the single authoritative HDM decoder snapshot now that + * MEM_ACTIVE is confirmed and BAR memory is still enabled. Using + * readl() per-dword ensures correct MMIO serialisation and captures + * the final firmware-written values for all fields including SIZE_HIGH, + * which firmware commits to the BAR at MEM_ACTIVE time. + */ + vfio_cxl_reinit_comp_regs(cxl); + pci_disable_device(pdev); + capacity = vfio_cxl_read_committed_decoder_size(vdev, cxl); + if (capacity == 0) { + /* + * TODO: Add handling for devices which do not have + * firmware pre-committed decoders + */ + pci_info(pdev, "Uncommitted region size must be configured via sysfs before bind\n"); + goto regs_failed; + } + + cxl->dpa_size = capacity; + + pci_dbg(pdev, "Device capacity: %llu MB\n", capacity >> 20); + + ret = vfio_cxl_create_memdev(cxl, capacity); + if (ret) { + pci_warn(pdev, "Failed to create memdev\n"); + goto regs_failed; + } + /* * Register probing succeeded. Assign vdev->cxl now so that * all subsequent helpers can access state via vdev->cxl. 
@@ -246,6 +299,9 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) return; +regs_failed: + vfio_cxl_clean_virt_regs(cxl); + free_cxl: vfio_cxl_dev_state_free(pdev, cxl); } diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 6fb02253e6312..11195e8c21d79 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -365,6 +365,48 @@ int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, return 0; } +/* + * vfio_cxl_read_committed_decoder_size - Extract committed DPA capacity from + * comp_reg_virt[]. + * + * Called from probe context after vfio_cxl_reinit_comp_regs() has taken the + * post-MEM_ACTIVE readl() snapshot and patched SIZE_HIGH/SIZE_LOW from DVSEC. + * comp_reg_virt[] is already correct at this point; no hardware access needed. + * + * Returns the committed DPA capacity in bytes, or 0 if the decoder is not + * committed. + */ +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl) +{ + struct pci_dev *pdev = vdev->pdev; + resource_size_t capacity; + u32 ctrl, sz_hi, sz_lo; + + if (WARN_ON(!cxl || !cxl->comp_reg_virt)) + return 0; + + ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_CTRL_OFFSET(0))); + sz_hi = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(0))); + sz_lo = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_LOW_OFFSET(0))); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) { + pci_dbg(pdev, + "vfio_cxl: decoder0 not committed: ctrl=0x%08x\n", + ctrl); + return 0; + } + + capacity = ((resource_size_t)sz_hi << 32) | (sz_lo & GENMASK(31, 28)); + + pci_dbg(pdev, + "vfio_cxl: decoder0 committed: sz_hi=0x%08x sz_lo=0x%08x capacity=0x%llx\n", + sz_hi, sz_lo, (unsigned long long)capacity); + + return capacity; +} + /* * Called with memory_lock write side held (from vfio_cxl_reactivate_region). 
* Uses the pre-established hdm_iobase, no ioremap() under the lock, diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 242409144ba0b..0fab271c96d68 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -23,6 +23,7 @@ struct vfio_pci_cxl_state { resource_size_t comp_reg_offset; size_t comp_reg_size; __le32 *comp_reg_virt; + size_t dpa_size; void __iomem *hdm_iobase; u16 dvsec_len; u8 hdm_count; @@ -84,5 +85,8 @@ int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, void __iomem *cap_base); void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl); void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl); #endif /* __LINUX_VFIO_CXL_PRIV_H */ From fb580ac046d5762ee37cee3e0714b03c2ffc22d5 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:10 +0530 Subject: [PATCH 38/51] NVIDIA: VR: SAUCE: vfio/cxl: CXL region management support Region Management makes use of APIs provided by CXL_CORE as below: CREATE_REGION flow: 1. Validate request (size, decoder availability) 2. Allocate HPA via cxl_get_hpa_freespace() 3. Allocate DPA via cxl_request_dpa() 4. Create region via cxl_create_region() - commits HDM decoder 5. Get HPA range via cxl_get_region_range() DESTROY_REGION flow: 1. Detach decoder via cxl_decoder_detach() 2. Free DPA via cxl_dpa_free() 3. Release root decoder via cxl_put_root_decoder() Use DEFINE_FREE scope helpers so error paths unwind cleanly. 
Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 119 +++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 8 ++ 2 files changed, 127 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 02755265d530b..30b365b919034 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -21,6 +21,13 @@ #include "../vfio_pci_priv.h" #include "vfio_cxl_priv.h" +/* + * Scope-based cleanup wrappers for the CXL resource APIs + */ +DEFINE_FREE(cxl_put_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_put_root_decoder(_T)) +DEFINE_FREE(cxl_dpa_free, struct cxl_endpoint_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_dpa_free(_T)) +DEFINE_FREE(cxl_unregister_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) cxl_unregister_region(_T)) + /* * vfio_cxl_create_device_state - Allocate and validate CXL device state * @@ -165,6 +172,112 @@ static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev, return ret; } +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size) +{ + resource_size_t max_size; + + WARN_ON(cxl->precommitted); + + struct cxl_root_decoder *cxlrd __free(cxl_put_root_decoder) = + cxl_get_hpa_freespace(cxl->cxlmd, 1, + CXL_DECODER_F_RAM | CXL_DECODER_F_TYPE2, + &max_size); + if (IS_ERR(cxlrd)) + return PTR_ERR(cxlrd); + + /* Insufficient HPA space; cxlrd freed automatically by __free() */ + if (max_size < size) + return -ENOSPC; + + struct cxl_endpoint_decoder *cxled __free(cxl_dpa_free) = + cxl_request_dpa(cxl->cxlmd, CXL_PARTMODE_RAM, size); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + + struct cxl_region *region __free(cxl_unregister_region) = + cxl_create_region(cxlrd, &cxled, 1); + if (IS_ERR(region)) + return PTR_ERR(region); + + /* All operations succeeded; transfer 
ownership to cxl state */ + cxl->cxlrd = no_free_ptr(cxlrd); + cxl->cxled = no_free_ptr(cxled); + cxl->region = no_free_ptr(region); + + return 0; +} + +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl) +{ + if (!cxl->region) + return; + + cxl_unregister_region(cxl->region); + cxl->region = NULL; + + if (!cxl->precommitted) { + cxl_dpa_free(cxl->cxled); + cxl_put_root_decoder(cxl->cxlrd); + } + + cxl->cxled = NULL; + cxl->cxlrd = NULL; +} + +static int vfio_cxl_create_region_helper(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + struct pci_dev *pdev = vdev->pdev; + struct range range; + int ret; + + if (cxl->precommitted) { + struct cxl_endpoint_decoder *cxled; + struct cxl_region *region; + + cxled = cxl_get_committed_decoder(cxl->cxlmd, ®ion); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + cxl->cxled = cxled; + cxl->region = region; + } else { + ret = vfio_cxl_create_cxl_region(cxl, capacity); + if (ret) + return ret; + } + + if (!cxl->region) { + pci_err(pdev, "Failed to create CXL region\n"); + ret = -ENODEV; + goto failed; + } + + ret = cxl_get_region_range(cxl->region, &range); + if (ret) + goto failed; + + cxl->region_hpa = range.start; + cxl->region_size = range_len(&range); + + pci_dbg(pdev, "CXL region: HPA 0x%llx size %lu MB\n", + cxl->region_hpa, cxl->region_size >> 20); + + return 0; + +failed: + if (cxl->region) { + cxl_unregister_region(cxl->region); + cxl->region = NULL; + } + + cxl->cxled = NULL; + cxl->cxlrd = NULL; + + return ret; +} + static int vfio_cxl_create_memdev(struct vfio_pci_cxl_state *cxl, resource_size_t capacity) { @@ -279,6 +392,7 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) goto regs_failed; } + cxl->precommitted = true; cxl->dpa_size = capacity; pci_dbg(pdev, "Device capacity: %llu MB\n", capacity >> 20); @@ -289,6 +403,10 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) goto regs_failed; } + ret = 
vfio_cxl_create_region_helper(vdev, cxl, capacity); + if (ret) + goto regs_failed; + /* * Register probing succeeded. Assign vdev->cxl now so that * all subsequent helpers can access state via vdev->cxl. @@ -314,6 +432,7 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) return; vfio_cxl_clean_virt_regs(cxl); + vfio_cxl_destroy_cxl_region(cxl); } MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 0fab271c96d68..f036bea946522 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -18,6 +18,10 @@ struct vfio_pci_cxl_state { struct cxl_memdev *cxlmd; struct cxl_root_decoder *cxlrd; struct cxl_endpoint_decoder *cxled; + struct cxl_region *region; + resource_size_t region_hpa; + size_t region_size; + void *region_vaddr; resource_size_t hdm_reg_offset; size_t hdm_reg_size; resource_size_t comp_reg_offset; @@ -29,6 +33,7 @@ struct vfio_pci_cxl_state { u8 hdm_count; u8 comp_reg_bar; bool cache_capable; + bool precommitted; }; /* Register access sizes */ @@ -88,5 +93,8 @@ void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); resource_size_t vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, struct vfio_pci_cxl_state *cxl); +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size); +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl); #endif /* __LINUX_VFIO_CXL_PRIV_H */ From 05b9195202cc8bdb57b8d284f7d4714ad329a39f Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:11 +0530 Subject: [PATCH 39/51] NVIDIA: VR: SAUCE: vfio/cxl: DPA VFIO region with demand fault mmap and reset zap Wire the CXL DPA range up as a VFIO demand-paged region so QEMU can mmap guest device memory directly. Faults call vmf_insert_pfn() to insert one PFN at a time rather than mapping the full range upfront. 
CXL region lifecycle: - The CXL memory region is registered with VFIO layer during vfio_pci_open_device - mmap() establishes the VMA with vm_ops but inserts no PTEs - Each guest page fault calls vfio_cxl_region_page_fault() which inserts a single PFN under the memory_lock read side - On device reset, vfio_cxl_zap_region_locked() sets region_active=false and calls unmap_mapping_range() to invalidate all DPA PTEs atomically while holding memory_lock for writing - Faults racing with reset see region_active==false and return VM_FAULT_SIGBUS - vfio_cxl_reactivate_region() restores region_active after successful hardware reset Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so that FLR correctly invalidates DPA mappings and restores them on success. Co-developed-by: Zhi Wang Signed-off-by: Zhi Wang Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Resolve context mismatches in vfio_pci_core.c and vfio_pci_priv.h due to missing upstream dmabuf support in NV-Kernels base] Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 187 +++++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_emu.c | 2 +- drivers/vfio/pci/cxl/vfio_cxl_priv.h | 3 + drivers/vfio/pci/vfio_pci_core.c | 11 ++ drivers/vfio/pci/vfio_pci_priv.h | 6 + 5 files changed, 208 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 30b365b919034..19d3dc205f99e 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -435,4 +435,191 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) vfio_cxl_destroy_cxl_region(cxl); } +static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf) +{ + struct vfio_pci_region *region = vmf->vma->vm_private_data; + struct vfio_pci_cxl_state *cxl = region->data; + unsigned long pgoff; + unsigned long pfn; + + if (!READ_ONCE(cxl->region_active)) + 
return VM_FAULT_SIGBUS; + + pgoff = vmf->pgoff & + ((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (pgoff >= (cxl->region_size >> PAGE_SHIFT)) + return VM_FAULT_SIGBUS; + + pfn = PHYS_PFN(cxl->region_hpa) + pgoff; + + return vmf_insert_pfn(vmf->vma, vmf->address, pfn); +} + +static const struct vm_operations_struct vfio_cxl_region_vm_ops = { + .fault = vfio_cxl_region_vm_fault, +}; + +static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u64 req_len, pgoff, end; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return -EINVAL; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) && + (vma->vm_flags & VM_READ)) + return -EPERM; + + if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) && + (vma->vm_flags & VM_WRITE)) + return -EPERM; + + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || + check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) + return -EOVERFLOW; + + if (end > cxl->region_size) + return -EINVAL; + + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); + + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_DONTDUMP); + + vma->vm_ops = &vfio_cxl_region_vm_ops; + vma->vm_private_data = region; + + return 0; +} + +/* + * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs. + * + * Must be called with vdev->memory_lock held for writing. Sets + * region_active=false before zapping so any subsequent I/O to the region + * sees the inactive state and returns an error rather than accessing + * stale mappings. 
+ */ +void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl) + return; + + WRITE_ONCE(cxl->region_active, false); +} + +/* + * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset. + * + * Must be called with vdev->memory_lock held for writing. Re-reads the + * HDM decoder state from hardware (FLR cleared it) and sets region_active + * so that subsequent I/O to the region is permitted again. + */ +void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl) + return; + /* + * Re-initialise the emulated HDM comp_reg_virt[] from hardware. + * After FLR the decoder registers read as zero; mirror that in + * the emulated state so QEMU sees a clean slate. + */ + vfio_cxl_reinit_comp_regs(cxl); + + /* + * Only re-enable the DPA mmap if the hardware has actually + * re-committed decoder 0 after FLR. Read the COMMITTED bit from the + * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR + * hardware state, not stale pre-reset state. + * + * If COMMITTED is 0 (slow firmware re-commit path), leave + * region_active=false. Guest faults will return VM_FAULT_SIGBUS + * until the decoder is re-committed and the region is re-enabled. + */ + if (cxl->precommitted && cxl->comp_reg_virt) { + /* + * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset + * (now CXL.mem-relative) plus the within-HDM-block offset. 
+ */ + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(0))); + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + WRITE_ONCE(cxl->region_active, true); + } +} + +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_cxl_state *cxl = core_dev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count || pos >= cxl->region_size) + return 0; + + /* + * Guard against access after a failed reset (region_active=false) + * or a release race (region_vaddr=NULL). Either condition means + * the memremap'd window is no longer valid; touching it would produce + * a Synchronous External Abort. Return -EIO so the caller gets a + * clean error rather than a kernel oops. + */ + if (!READ_ONCE(cxl->region_active) || !cxl->region_vaddr) + return -EIO; + + count = min(count, (size_t)(cxl->region_size - pos)); + + if (iswrite) { + if (copy_from_user(cxl->region_vaddr + pos, buf, count)) + return -EFAULT; + } else { + if (copy_to_user(buf, cxl->region_vaddr + pos, count)) + return -EFAULT; + } + + return count; +} + +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_cxl_state *cxl = region->data; + + /* + * Deactivate the region before removing user mappings so that any + * fault handler racing the release returns VM_FAULT_SIGBUS rather + * than inserting a PFN into an unmapped region. 
+ */ + WRITE_ONCE(cxl->region_active, false); + + if (cxl->region_vaddr) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + } +} + +static const struct vfio_pci_regops vfio_cxl_regops = { + .rw = vfio_cxl_region_rw, + .mmap = vfio_cxl_region_mmap, + .release = vfio_cxl_region_release, +}; + MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 11195e8c21d79..781328a79b439 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -33,7 +33,7 @@ * +0x1c: (reserved) */ -static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) { /* * hdm_off is a byte offset within the HDM decoder block. diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index f036bea946522..6ad9e282c7f7e 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -34,6 +34,7 @@ struct vfio_pci_cxl_state { u8 comp_reg_bar; bool cache_capable; bool precommitted; + bool region_active; }; /* Register access sizes */ @@ -97,4 +98,6 @@ int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, resource_size_t size); void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl); +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off); + #endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1f293b3835294..9605f79f71956 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1222,6 +1222,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_zap_and_down_write_memory_lock(vdev); + /* Zap CXL DPA region PTEs before hardware reset clears HDM state */ + vfio_cxl_zap_region_locked(vdev); + /* * This function can be invoked while the power state is non-D0. 
If * pci_try_reset_function() has been called while the power state is @@ -1234,6 +1237,14 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_try_reset_function(vdev->pdev); + + /* + * Re-enable DPA region if reset succeeded; fault handler will + * re-insert PFNs on next access without requiring a new mmap. + */ + if (!ret) + vfio_cxl_reactivate_region(vdev); + up_write(&vdev->memory_lock); return ret; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 61e1a2fd5e5b5..5be3ab1499c72 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -122,6 +122,8 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); +void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); +void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); #else @@ -129,6 +131,10 @@ static inline void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { } static inline void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } #endif /* CONFIG_VFIO_CXL_CORE */ From 1447b99bccc3f51adcf714e47d3461e9586fa308 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:12 +0530 Subject: [PATCH 40/51] NVIDIA: VR: SAUCE: vfio/cxl: Virtualize CXL DVSEC config writes CXL devices have CXL DVSEC registers in the configuration space. Many of them affect the behaviors of the devices, e.g. enabling CXL.io/CXL.mem/CXL.cache. However, these configurations are owned by the host and a virtualization policy should be applied when handling the access from the guest. 
Introduce the emulation of CXL configuration space to handle the access of the virtual CXL configuration space from the guest. vfio-pci-core already allocates vdev->vconfig as the authoritative virtual config space shadow. Directly use vdev->vconfig: - DVSEC reads return data from vdev->vconfig (already populated by vfio_config_init() via vfio_ecap_init()) - DVSEC writes go through new CXL-aware write handlers that update vdev->vconfig in place - The writable DVSEC registers are marked virtual in vdev->pci_config_map Signed-off-by: Zhi Wang Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Resolve context mismatches in Makefile and vfio_pci_core.h due to missing upstream dmabuf/p2pdma forward declarations in NV-Kernels base] Signed-off-by: Jiandi An --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/cxl/vfio_cxl_config.c | 306 +++++++++++++++++++++++++ drivers/vfio/pci/cxl/vfio_cxl_core.c | 4 +- drivers/vfio/pci/cxl/vfio_cxl_priv.h | 43 +++- drivers/vfio/pci/vfio_pci_config.c | 46 +++- drivers/vfio/pci/vfio_pci_priv.h | 3 + include/linux/vfio_pci_core.h | 7 + include/uapi/cxl/cxl_regs.h | 98 ++++++++ 8 files changed, 498 insertions(+), 11 deletions(-) create mode 100644 drivers/vfio/pci/cxl/vfio_cxl_config.c diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 0a2f8daafe424..21178d9e34849 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o -vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o cxl/vfio_cxl_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c 
b/drivers/vfio/pci/cxl/vfio_cxl_config.c new file mode 100644 index 0000000000000..dee521118dd48 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CXL DVSEC configuration space emulation for vfio-pci. + * + * Integrates into the existing vfio-pci-core ecap_perms[] framework using + * vdev->vconfig as the sole shadow buffer for DVSEC registers. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +static inline u16 _cxlds_get_dvsec(struct vfio_pci_cxl_state *cxl) +{ + return (u16)cxl->cxlds.cxl_dvsec; +} + +/* Helpers to access vdev->vconfig at a DVSEC-relative offset */ +static inline u16 dvsec_virt_read16(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return le16_to_cpu(*(u16 *)(vdev->vconfig + dvsec + off)); +} + +static inline void dvsec_virt_write16(struct vfio_pci_core_device *vdev, + u16 off, u16 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + *(u16 *)(vdev->vconfig + dvsec + off) = cpu_to_le16(val); +} + +static inline u32 dvsec_virt_read32(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return le32_to_cpu(*(u32 *)(vdev->vconfig + dvsec + off)); +} + +static inline void dvsec_virt_write32(struct vfio_pci_core_device *vdev, + u16 off, u32 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + *(u32 *)(vdev->vconfig + dvsec + off) = cpu_to_le32(val); +} + +/* Individual DVSEC register write handlers */ + +static void cxl_dvsec_control_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 lock = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL_RESERVED_MASK; + + if (lock & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; /* register is locked after first write */ + + if 
(!(cap3 & CXL_DVSEC_CAP3_P2P_MEM_CAPABLE)) + rev_mask |= CXL_CTRL_P2P_REV_MASK; + + new_val &= ~rev_mask; + new_val |= CXL_DVSEC_CTRL_IO_ENABLE; /* IO_Enable always returns 1 */ + + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, new_val); +} + +static void cxl_dvsec_status_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS_OFFSET); + + /* + * VIRAL_STATUS (bit 14) is the only writable bit; all others are + * reserved and always zero. + */ + new_val = cur_val & ~(new_val & CXL_DVSEC_STATUS_VIRAL_STATUS); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS_OFFSET, new_val); +} + +static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_CONTROL2_OFFSET; + u16 cap2 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY2_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL2_RESERVED_MASK; + + if (!(cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY)) + rev_mask |= CXL_CTRL2_VOLATILE_HDM_REV_MASK; + if (!(cap2 & CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE)) + rev_mask |= CXL_CTRL2_MODIFIED_COMP_REV_MASK; + + new_val &= ~rev_mask; + + /* Cache WBI: forward to hardware. */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI) + pci_write_config_word(pdev, abs_off, + CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI); + + /* + * CXL Reset: not yet supported - do not forward to HW. 
+ * TODO: invoke CXL protocol reset via cxl subsystem + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) + pci_warn(pdev, "vfio-cxl: CXL reset requested but not yet supported\n"); + + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL2_OFFSET, + new_val & ~CXL_CTRL2_HW_BITS_MASK); +} + +static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_STATUS2_OFFSET; + + /* RW1CS: write 1 to clear, but only if the capability is supported */ + if ((cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY) && + (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) + pci_write_config_word(vdev->pdev, abs_off, + CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR); + /* STATUS2 is not mirrored in vconfig - reads go to hardware */ +} + +static void cxl_dvsec_lock_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + + /* Once the LOCK bit is set it can only be cleared by conventional reset */ + if (cur_val & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; + + new_val &= ~CXL_LOCK_RESERVED_MASK; + dvsec_virt_write16(vdev, CXL_DVSEC_LOCK_OFFSET, new_val); +} + +static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, + u16 dvsec_off, u32 new_val) +{ + new_val &= ~CXL_BASE_LO_RESERVED_MASK; + dvsec_virt_write32(vdev, dvsec_off, new_val); +} + +/** + * vfio_cxl_dvsec_readfn - Per-device DVSEC read handler for CXL capable devices. + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to read + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Output buffer for the read value (little-endian) + * + * Called via vfio_pci_dvsec_dispatch_read() for CXL devices. 
Returns shadow + * vconfig values for virtualized DVSEC registers (CONTROL, STATUS, CONTROL2, + * LOCK) so that userspace reads reflect emulated state rather than raw + * hardware. All other DVSEC bytes pass through to vfio_raw_config_read(). + * + * Return: @count on success, or negative error code from the fallback read. + */ +static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 dvsec_off; + + if (!cxl || (u16)pos < dvsec || + (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_raw_config_read(vdev, pos, count, perm, offset, val); + + dvsec_off = (u16)pos - dvsec; + + switch (dvsec_off) { + case CXL_DVSEC_CONTROL_OFFSET: + case CXL_DVSEC_STATUS_OFFSET: + case CXL_DVSEC_CONTROL2_OFFSET: + case CXL_DVSEC_LOCK_OFFSET: + /* Return shadow vconfig value for virtualized registers */ + memcpy(val, vdev->vconfig + pos, count); + return count; + default: + return vfio_raw_config_read(vdev, pos, count, + perm, offset, val); + } +} + +/** + * vfio_cxl_dvsec_writefn - ecap_perms write handler for PCI_EXT_CAP_ID_DVSEC. + * + * Installed once into ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn by + * vfio_pci_init_perm_bits() when CONFIG_VFIO_CXL_CORE=y. Applies to every + * device opened under vfio-pci; the vdev->cxl NULL check distinguishes CXL + * devices from non-CXL devices that happen to expose a DVSEC capability. + * + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to write + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Value to write (little-endian) + * + * Return: @count on success; non-CXL devices continue to + * vfio_raw_config_write() which also returns @count or negative error. 
+ */ +static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = (u16)pos; + u16 dvsec_off; + u16 wval16; + u32 wval32; + + if (!cxl || (u16)pos < dvsec || + (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_raw_config_write(vdev, pos, count, perm, + offset, val); + + pci_dbg(vdev->pdev, + "vfio_cxl: DVSEC write: abs=0x%04x dvsec_off=0x%04x count=%d raw_val=0x%08x\n", + abs_off, abs_off - dvsec, count, le32_to_cpu(val)); + + dvsec_off = abs_off - dvsec; + + /* Route to the appropriate per-register handler */ + switch (dvsec_off) { + case CXL_DVSEC_CONTROL_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_control_write(vdev, wval16); + break; + case CXL_DVSEC_STATUS_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_status_write(vdev, wval16); + break; + case CXL_DVSEC_CONTROL2_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_control2_write(vdev, wval16); + break; + case CXL_DVSEC_STATUS2_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_status2_write(vdev, wval16); + break; + case CXL_DVSEC_LOCK_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_lock_write(vdev, wval16); + break; + case CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET: + case CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET: + wval32 = le32_to_cpu(val); + dvsec_virt_write32(vdev, dvsec_off, wval32); + break; + case CXL_DVSEC_RANGE1_BASE_LOW_OFFSET: + case CXL_DVSEC_RANGE2_BASE_LOW_OFFSET: + wval32 = le32_to_cpu(val); + cxl_range_base_lo_write(vdev, dvsec_off, wval32); + break; + default: + /* RO registers: header, capability, range sizes - discard */ + break; + } + + return count; +} + +/** + * vfio_cxl_setup_dvsec_perms - Install per-device CXL DVSEC read/write hooks. + * @vdev: VFIO PCI core device + * + * Called once per device open after vfio_config_init() has seeded vdev->vconfig + * from hardware. 
Installs vfio_cxl_dvsec_readfn and vfio_cxl_dvsec_writefn + * as per-device DVSEC handlers so that the global ecap_perms[DVSEC] dispatcher + * routes reads and writes through CXL-aware emulation. + * + * Forces CXL.io IO_ENABLE in the CONTROL vconfig shadow at init time so the + * initial guest read returns the correct value before the first write. + */ +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) +{ + u16 ctrl = dvsec_virt_read16(vdev, CXL_DVSEC_CONTROL_OFFSET); + + vdev->dvsec_readfn = vfio_cxl_dvsec_readfn; + vdev->dvsec_writefn = vfio_cxl_dvsec_writefn; + + /* Force IO_ENABLE; cxl_dvsec_control_write() maintains this invariant. */ + ctrl |= CXL_DVSEC_CTRL_IO_ENABLE; + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, ctrl); +} +EXPORT_SYMBOL_GPL(vfio_cxl_setup_dvsec_perms); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 19d3dc205f99e..a3ff90b7a22c3 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -68,13 +68,13 @@ vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI * sequence is needed before FLR. */ - if (!FIELD_GET(CXL_DVSEC_MEM_CAPABLE, cap_word) || + if (!FIELD_GET(CXL_DVSEC_CAP_MEM_CAPABLE, cap_word) || (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL) { devm_kfree(&pdev->dev, cxl); return ERR_PTR(-ENODEV); } - cxl->cache_capable = FIELD_GET(CXL_DVSEC_CACHE_CAPABLE, cap_word); + cxl->cache_capable = FIELD_GET(CXL_DVSEC_CAP_CACHE_CAPABLE, cap_word); return cxl; } diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 6ad9e282c7f7e..522f95a27a22e 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -77,14 +77,43 @@ struct vfio_pci_cxl_state { #define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0) /* - * CXL DVSEC for CXL Devices - register offsets within the DVSEC - * (CXL 4.0 8.1.3). 
- * Offsets are relative to the DVSEC capability base (cxl->dvsec). + * DVSEC register offsets and per-bit hardware definitions are in + * as CXL_DVSEC_*. The masks below encode + * emulation policy: which bits to ignore, which to preserve separately + * from their raw hardware state. */ -#define CXL_DVSEC_CAPABILITY_OFFSET 0xa -#define CXL_DVSEC_MEM_CAPABLE BIT(2) -/* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */ -#define CXL_DVSEC_CACHE_CAPABLE BIT(0) +/* DVSEC Control (0x0C): bits 13 (RsvdP) and 15 (RsvdP) are always discarded */ +#define CXL_CTRL_RESERVED_MASK (BIT(13) | BIT(15)) +/* bit 12 (P2P_Mem_Enable) treated as reserved if Cap3.P2P_Mem_Capable=0 */ +#define CXL_CTRL_P2P_REV_MASK CXL_DVSEC_CTRL_P2P_MEM_ENABLE + +/* DVSEC Status (0x0E): bits 13:0 and 15 are RsvdZ */ +#define CXL_STATUS_RESERVED_MASK (GENMASK(13, 0) | BIT(15)) + +/* + * DVSEC Control2 (0x10) emulation masks. + * + * CXL_CTRL2_HW_BITS_MASK: bits 1 (Initiate_Cache_WBI) and 2 + * (Initiate_CXL_Reset) always read 0 from hardware _ they are write-only + * action triggers per CXL 4.0 _8.1.3.8 Table 8-8. Forward these to the + * device to trigger the hardware action; clear them from vconfig shadow so + * that subsequent guest reads return 0 as hardware requires. + * + * NOTE: bit 0 (Disable_Caching) and bit 3 (CXL_Reset_Mem_Clr_Enable) are + * ordinary RW fields _ they must be preserved in vconfig, not forwarded. 
+ */ +#define CXL_CTRL2_RESERVED_MASK GENMASK(15, 6) +#define CXL_CTRL2_HW_BITS_MASK (BIT(1) | BIT(2)) +/* bit 4 is RsvdP if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_CTRL2_VOLATILE_HDM_REV_MASK CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM +/* bit 5 is RsvdP if Cap2.Mod_Completion_Capable=0 */ +#define CXL_CTRL2_MODIFIED_COMP_REV_MASK CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE + +/* DVSEC Lock (0x14): bits 15:1 are RsvdP */ +#define CXL_LOCK_RESERVED_MASK GENMASK(15, 1) + +/* DVSEC Range Base Low: bits 27:0 are reserved per Tables 8-15/8-19 */ +#define CXL_BASE_LO_RESERVED_MASK CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, struct vfio_pci_cxl_state *cxl, diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 41223ce4014b9..a3abaae4c62d8 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1071,6 +1071,49 @@ static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) return 0; } +/* + * vfio_pci_dvsec_dispatch_read - per-device DVSEC read dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn at module init. + * Calls vdev->dvsec_readfn when a shadow-read handler has been registered + * (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices), otherwise + * continue to vfio_raw_config_read for hardware pass-through. + * + * This indirection allows per-device DVSEC reads from vconfig shadow + * without touching the global ecap_perms[] table. + */ +static int vfio_pci_dvsec_dispatch_read(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + if (vdev->dvsec_readfn) + return vdev->dvsec_readfn(vdev, pos, count, perm, offset, val); + return vfio_raw_config_read(vdev, pos, count, perm, offset, val); +} + +/* + * vfio_pci_dvsec_dispatch_write - per-device DVSEC write dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn at module init. 
+ * Calls vdev->dvsec_writefn when a handler has been registered for this + * device (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices), + * otherwise proceed to vfio_raw_config_write so that non-CXL devices + * with a DVSEC capability continue to pass writes to hardware. + * + * This indirection allows per-device DVSEC handlers to be registered + * without touching the global ecap_perms[] table. + */ +static int vfio_pci_dvsec_dispatch_write(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + if (vdev->dvsec_writefn) + return vdev->dvsec_writefn(vdev, pos, count, perm, offset, val); + return vfio_raw_config_write(vdev, pos, count, perm, offset, val); +} + /* * Initialize the shared permission tables */ @@ -1107,7 +1150,8 @@ int __init vfio_pci_init_perm_bits(void) ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; - ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_raw_config_write; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn = vfio_pci_dvsec_dispatch_read; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_pci_dvsec_dispatch_write; if (ret) vfio_pci_uninit_perm_bits(); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 5be3ab1499c72..8741dfe959657 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -124,6 +124,7 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); #else @@ -135,6 +136,8 @@ static inline void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { } static inline void 
vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } #endif /* CONFIG_VFIO_CXL_CORE */ diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2f0fb18ec1ff3..a8094244846da 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -28,6 +28,7 @@ struct vfio_pci_core_device; struct vfio_pci_region; struct vfio_pci_cxl_state; +struct perm_bits; struct vfio_pci_eventfd { struct eventfd_ctx *ctx; @@ -98,6 +99,12 @@ struct vfio_pci_core_device { struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; struct vfio_pci_cxl_state *cxl; + int (*dvsec_readfn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + int (*dvsec_writefn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; diff --git a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h index b6fcae91d216b..e9746e75e09ae 100644 --- a/include/uapi/cxl/cxl_regs.h +++ b/include/uapi/cxl/cxl_regs.h @@ -59,4 +59,102 @@ #define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) #define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) +/* + * CXL r4.0 8.1.3: DVSEC for CXL Devices + * + * Register offsets are relative to the DVSEC capability base address, + * as discovered via PCI_EXT_CAP_ID_DVSEC with DVSEC ID 0x0. + * All registers in this section are 16-bit wide. 
+ */ + +/* DVSEC register offsets */ +#define CXL_DVSEC_CAPABILITY_OFFSET 0x0a +#define CXL_DVSEC_CONTROL_OFFSET 0x0c +#define CXL_DVSEC_STATUS_OFFSET 0x0e +#define CXL_DVSEC_CONTROL2_OFFSET 0x10 +#define CXL_DVSEC_STATUS2_OFFSET 0x12 +#define CXL_DVSEC_LOCK_OFFSET 0x14 +#define CXL_DVSEC_CAPABILITY2_OFFSET 0x16 +#define CXL_DVSEC_RANGE1_SIZE_HIGH_OFFSET 0x18 +#define CXL_DVSEC_RANGE1_SIZE_LOW_OFFSET 0x1c +#define CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET 0x20 +#define CXL_DVSEC_RANGE1_BASE_LOW_OFFSET 0x24 +#define CXL_DVSEC_RANGE2_SIZE_HIGH_OFFSET 0x28 +#define CXL_DVSEC_RANGE2_SIZE_LOW_OFFSET 0x2c +#define CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET 0x30 +#define CXL_DVSEC_RANGE2_BASE_LOW_OFFSET 0x34 +#define CXL_DVSEC_CAPABILITY3_OFFSET 0x38 + +/* DVSEC Range Base Low registers: bits [27:0] are reserved */ +#define CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK __GENMASK(27, 0) + +/* CXL r4.0 8.1.3.1 Table 8-5 DVSEC CXL Capability (offset 0x0A) */ +#define CXL_DVSEC_CAP_CACHE_CAPABLE _BITUL(0) +#define CXL_DVSEC_CAP_IO_CAPABLE _BITUL(1) +#define CXL_DVSEC_CAP_MEM_CAPABLE _BITUL(2) +#define CXL_DVSEC_CAP_MEM_HW_INIT_MODE _BITUL(3) +#define CXL_DVSEC_CAP_HDM_COUNT_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP_CACHE_WBI_CAPABLE _BITUL(6) +#define CXL_DVSEC_CAP_CXL_RESET_CAPABLE _BITUL(7) +#define CXL_DVSEC_CAP_CXL_RESET_TIMEOUT_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CAP_CXL_RESET_MEM_CLR_CAPABLE _BITUL(11) +#define CXL_DVSEC_CAP_TSP_CAPABLE _BITUL(12) +#define CXL_DVSEC_CAP_MLD_CAPABLE _BITUL(13) +#define CXL_DVSEC_CAP_VIRAL_CAPABLE _BITUL(14) +#define CXL_DVSEC_CAP_PM_INIT_REPORTING_CAPABLE _BITUL(15) + +/* CXL r4.0 8.1.3.2 Table 8-6 DVSEC CXL Control (offset 0x0C) */ +#define CXL_DVSEC_CTRL_CACHE_ENABLE _BITUL(0) +#define CXL_DVSEC_CTRL_IO_ENABLE _BITUL(1) +#define CXL_DVSEC_CTRL_MEM_ENABLE _BITUL(2) +#define CXL_DVSEC_CTRL_CACHE_SF_COVERAGE_MASK __GENMASK(7, 3) +#define CXL_DVSEC_CTRL_CACHE_SF_GRANULARITY_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CTRL_CACHE_CLEAN_EVICTION _BITUL(11) +#define 
CXL_DVSEC_CTRL_P2P_MEM_ENABLE _BITUL(12) +/* bit 13: RsvdP */ +#define CXL_DVSEC_CTRL_VIRAL_ENABLE _BITUL(14) +/* bit 15: RsvdP */ + +/* CXL r4.0 8.1.3.3 Table 8-7 DVSEC CXL Status (offset 0x0E) */ +/* bits 13:0 = RsvdZ */ +#define CXL_DVSEC_STATUS_VIRAL_STATUS _BITUL(14) +/* bit 15 = RsvdZ */ + +/* CXL r4.0 8.1.3.4 Table 8-8 DVSEC CXL Control2 (offset 0x10) */ +#define CXL_DVSEC_CTRL2_DISABLE_CACHING _BITUL(0) +#define CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI _BITUL(1) +#define CXL_DVSEC_CTRL2_INITIATE_CXL_RESET _BITUL(2) +#define CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE _BITUL(3) +#define CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM _BITUL(4) +#define CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE _BITUL(5) +/* bits 15:6 = RsvdP */ + +/* CXL r4.0 8.1.3.5 Table 8-9 DVSEC CXL Status2 (offset 0x12) */ +#define CXL_DVSEC_STATUS2_CACHE_INVALID _BITUL(0) +#define CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE _BITUL(1) +#define CXL_DVSEC_STATUS2_CXL_RESET_ERROR _BITUL(2) +/* RW1CS; RsvdZ if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR _BITUL(3) +/* bits 14:4 = RsvdZ */ +#define CXL_DVSEC_STATUS2_PM_INIT_COMPLETION _BITUL(15) + +/* CXL r4.0 _8.1.3.6 Table 8-10 _ DVSEC CXL Lock (offset 0x14) */ +#define CXL_DVSEC_LOCK_CONFIG_LOCK _BITUL(0) +/* bits 15:1 = RsvdP */ + +/* CXL r4.0 8.1.3.7 Table 8-11 DVSEC CXL Capability2 (offset 0x16) */ +#define CXL_DVSEC_CAP2_CACHE_SIZE_UNIT_MASK __GENMASK(3, 0) +#define CXL_DVSEC_CAP2_FALLBACK_CAPABILITY_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE _BITUL(6) +#define CXL_DVSEC_CAP2_NO_CLEAN_WRITEBACK _BITUL(7) +#define CXL_DVSEC_CAP2_CACHE_SIZE_MASK __GENMASK(15, 8) + +/* CXL r4.0 8.1.3.14 Table 8-20 DVSEC CXL Capability3 (offset 0x38) */ +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_COLD_RESET _BITUL(0) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_WARM_RESET _BITUL(1) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_HOT_RESET _BITUL(2) +#define CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY _BITUL(3) +#define 
CXL_DVSEC_CAP3_P2P_MEM_CAPABLE _BITUL(4) +/* bits 15:5 = RsvdP */ + #endif /* _UAPI_CXL_REGS_H_ */ From 24dd6678d476013060f40fe0a59f1ad9ededd97a Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:13 +0530 Subject: [PATCH 41/51] NVIDIA: VR: SAUCE: vfio/cxl: Register regions with VFIO layer Register the DPA and component register region with VFIO layer. Region indices for both these regions are cached for quick lookup. vfio_cxl_register_cxl_region() - memremap(WB) the region HPA (treat CXL.mem as RAM, not MMIO) - Register VFIO_REGION_SUBTYPE_CXL - Records dpa_region_idx. vfio_cxl_register_comp_regs_region() - Registers VFIO_REGION_SUBTYPE_CXL_COMP_REGS with size hdm_reg_offset + hdm_reg_size - Records comp_reg_region_idx. Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 98 +++++++++++++++++++++++++++- drivers/vfio/pci/cxl/vfio_cxl_emu.c | 34 ++++++++++ drivers/vfio/pci/cxl/vfio_cxl_priv.h | 2 + drivers/vfio/pci/vfio_pci.c | 23 +++++++ drivers/vfio/pci/vfio_pci_priv.h | 11 ++++ 5 files changed, 167 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index a3ff90b7a22c3..b38a04301660a 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -75,6 +75,8 @@ vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) } cxl->cache_capable = FIELD_GET(CXL_DVSEC_CAP_CACHE_CAPABLE, cap_word); + cxl->dpa_region_idx = -1; + cxl->comp_reg_region_idx = -1; return cxl; } @@ -509,14 +511,19 @@ static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev, */ void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { + struct vfio_device *core_vdev = &vdev->vdev; struct vfio_pci_cxl_state *cxl = vdev->cxl; lockdep_assert_held_write(&vdev->memory_lock); - if (!cxl) + if (!cxl || cxl->dpa_region_idx < 0) 
return; WRITE_ONCE(cxl->region_active, false); + unmap_mapping_range(core_vdev->inode->i_mapping, + VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + + cxl->dpa_region_idx), + cxl->region_size, true); } /* @@ -601,6 +608,7 @@ static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev, static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region) { + struct vfio_device *core_vdev = &vdev->vdev; struct vfio_pci_cxl_state *cxl = region->data; /* @@ -610,6 +618,16 @@ static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, */ WRITE_ONCE(cxl->region_active, false); + /* + * Remove all user mappings of the DPA region while the device is + * still alive. + */ + if (cxl->dpa_region_idx >= 0) + unmap_mapping_range(core_vdev->inode->i_mapping, + VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + + cxl->dpa_region_idx), + cxl->region_size, true); + if (cxl->region_vaddr) { memunmap(cxl->region_vaddr); cxl->region_vaddr = NULL; @@ -622,4 +640,82 @@ static const struct vfio_pci_regops vfio_cxl_regops = { .release = vfio_cxl_region_release, }; +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 flags; + int ret; + + if (!cxl) + return -ENODEV; + + if (!cxl->region || cxl->region_vaddr) + return -ENODEV; + + /* + * CXL device memory is RAM, not MMIO. Use memremap() rather than + * ioremap_cache() so the correct memory-mapping API is used. + * The WB attribute matches the cache-coherent nature of CXL.mem. 
+ */ + cxl->region_vaddr = memremap(cxl->region_hpa, cxl->region_size, + MEMREMAP_WB); + if (!cxl->region_vaddr) + return -ENOMEM; + + flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + ret = vfio_pci_core_register_dev_region(vdev, + PCI_VENDOR_ID_CXL | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_CXL, + &vfio_cxl_regops, + cxl->region_size, flags, + cxl); + if (ret) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + return ret; + } + + /* + * Cache the vdev->region[] index before activating the region. + * vfio_pci_core_register_dev_region() placed the new entry at + * vdev->region[num_regions - 1] and incremented num_regions. + * vfio_cxl_zap_region_locked() uses this to avoid scanning + * vdev->region[] on every FLR. + */ + cxl->dpa_region_idx = vdev->num_regions - 1; + + vfio_cxl_reinit_comp_regs(cxl); + + WRITE_ONCE(cxl->region_active, true); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_cxl_register_cxl_region); + +/** + * vfio_cxl_unregister_cxl_region - Undo vfio_cxl_register_cxl_region() + * @vdev: VFIO PCI device + * + * Marks the DPA region inactive and resets dpa_region_idx. + * Does NOT touch CXL subsystem state (cxl->region, cxl->cxled, cxl->cxlrd). + * The caller must call vfio_cxl_destroy_cxl_region() separately to release + * those objects. 
+ */ +void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl || cxl->dpa_region_idx < 0) + return; + + WRITE_ONCE(cxl->region_active, false); + + cxl->dpa_region_idx = -1; +} +EXPORT_SYMBOL_GPL(vfio_cxl_unregister_cxl_region); + MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 781328a79b439..50d3718b101d7 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -473,3 +473,37 @@ void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl) kfree(cxl->comp_reg_virt); cxl->comp_reg_virt = NULL; } + +/* + * vfio_cxl_register_comp_regs_region - Register the COMP_REGS device region. + * + * Exposes the emulated HDM decoder register state as a VFIO device region + * with type VFIO_REGION_SUBTYPE_CXL_COMP_REGS. QEMU attaches a + * notify_change callback to this region to intercept HDM COMMIT writes + * and map the DPA MemoryRegion at the appropriate GPA. + * + * The region is read+write only (no mmap) to ensure all accesses pass + * through comp_regs_dispatch_write() for proper bit-field enforcement. 
+ */ +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + int ret; + + if (!cxl || !cxl->comp_reg_virt) + return -ENODEV; + + ret = vfio_pci_core_register_dev_region(vdev, + PCI_VENDOR_ID_CXL | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_CXL_COMP_REGS, + &vfio_cxl_comp_regs_ops, + cxl->hdm_reg_offset + + cxl->hdm_reg_size, flags, cxl); + if (!ret) + cxl->comp_reg_region_idx = vdev->num_regions - 1; + + return ret; +} +EXPORT_SYMBOL_GPL(vfio_cxl_register_comp_regs_region); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h index 522f95a27a22e..611ef793006c6 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h +++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h @@ -29,6 +29,8 @@ struct vfio_pci_cxl_state { __le32 *comp_reg_virt; size_t dpa_size; void __iomem *hdm_iobase; + int dpa_region_idx; + int comp_reg_region_idx; u16 dvsec_len; u8 hdm_count; u8 comp_reg_bar; diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index a3e49d42c771b..77e07afe34e8a 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -120,6 +120,29 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) } } + if (vdev->cxl) { + /* + * pci_config_map and vconfig are valid now (allocated by + * vfio_config_init() inside vfio_pci_core_enable() above). 
+ */ + vfio_cxl_setup_dvsec_perms(vdev); + + ret = vfio_cxl_register_cxl_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to setup CXL region\n"); + vfio_pci_core_disable(vdev); + return ret; + } + + ret = vfio_cxl_register_comp_regs_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to register COMP_REGS region\n"); + vfio_cxl_unregister_cxl_region(vdev); + vfio_pci_core_disable(vdev); + return ret; + } + } + vfio_pci_core_finish_enable(vdev); return 0; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 8741dfe959657..1886596b479eb 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -125,6 +125,9 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); +void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev); #else @@ -138,6 +141,14 @@ static inline void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } static inline void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ return 0; } +static inline void +vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) +{ return 0; } #endif /* CONFIG_VFIO_CXL_CORE */ From 534faac8aa63cec024aa1ef3ab8c8cb22148f476 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:14 +0530 Subject: [PATCH 42/51] NVIDIA: VR: SAUCE: vfio/pci: Advertise CXL cap and sparse component BAR to userspace Expose CXL device capability through the VFIO device info ioctl and 
give userspace access to the GPU/accelerator register windows in the component BAR while protecting the CXL component register block. vfio_cxl_get_info() fills VFIO_DEVICE_INFO_CAP_CXL with the HDM register BAR index and byte offset, commit flags, and VFIO region indices for the DPA and COMP_REGS regions. HDM decoder count and the HDM block offset within COMP_REGS are not populated; both are derivable from the CXL Capability Array in the COMP_REGS region itself. vfio_cxl_get_region_info() handles VFIO_DEVICE_GET_REGION_INFO for the component register BAR. It builds a sparse-mmap capability that advertises only the GPU/accelerator register windows, carving out the CXL component register block. Three physical layouts are handled: Topology A comp block at BAR end: one area [0, comp_reg_offset) Topology B comp block at BAR start: one area [comp_end, bar_len) Topology C comp block in the middle: two areas, one on each side vfio_cxl_mmap_overlaps_comp_regs() checks whether an mmap request overlaps [comp_reg_offset, comp_reg_offset + comp_reg_size). vfio_pci_core_mmap() calls it to reject access to the component register block while allowing mmap of the GPU register windows in the sparse capability. This replaces the earlier blanket rejection of any mmap on the component BAR index. Hook both helpers into vfio_pci_ioctl_get_info() and vfio_pci_ioctl_get_region_info() in vfio_pci_core.c. The component BAR cannot be claimed exclusively since the CXL subsystem holds persistent sub-range iomem claims during HDM decoder setup. pci_request_selected_regions() returns EBUSY; pass bars=0 to skip the request and map directly via pci_iomap(). Physical ownership is assured by driver binding. 
Signed-off-by: Zhi Wang Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 155 +++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_core.c | 31 +++++- drivers/vfio/pci/vfio_pci_priv.h | 24 +++++ drivers/vfio/pci/vfio_pci_rdwr.c | 16 ++- 4 files changed, 221 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index b38a04301660a..46430cbfa962e 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -21,6 +21,161 @@ #include "../vfio_pci_priv.h" #include "vfio_cxl_priv.h" +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ + return vdev->cxl->comp_reg_bar; +} + +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info_cap_sparse_mmap *sparse; + struct vfio_pci_cxl_state *cxl = vdev->cxl; + resource_size_t bar_len, comp_end; + u32 nr_areas, cap_size; + int ret; + + if (!cxl) + return -ENOTTY; + + if (!info) + return -ENOTTY; + + if (info->argsz < minsz) + return -EINVAL; + + if (info->index != cxl->comp_reg_bar) + return -ENOTTY; + + /* + * The device state is not fully initialised; + * fall through to the default BAR handler. + */ + if (!cxl->comp_reg_size) + return -ENOTTY; + + bar_len = pci_resource_len(vdev->pdev, info->index); + comp_end = cxl->comp_reg_offset + cxl->comp_reg_size; + + /* + * Advertise the GPU/accelerator register windows as mmappable by + * carving the CXL component register block out of the BAR. 
The + * number of sparse areas depends on where the block sits: + * + * [A] comp block at BAR end [gpu_regs | comp_regs]: + * comp_reg_offset > 0 && comp_end == bar_len + * = 1 area: [0, comp_reg_offset) + * + * [B] comp block at BAR start [comp_regs | gpu_regs]: + * comp_reg_offset == 0 && comp_end < bar_len + * = 1 area: [comp_end, bar_len) + * + * [C] comp block in middle [gpu_regs | comp_regs | gpu_regs]: + * comp_reg_offset > 0 && comp_end < bar_len + * = 2 areas: [0, comp_reg_offset) and [comp_end, bar_len) + */ + if (cxl->comp_reg_offset > 0 && comp_end < bar_len) + nr_areas = 2; + else + nr_areas = 1; + + cap_size = struct_size(sparse, areas, nr_areas); + sparse = kzalloc(cap_size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + + if (nr_areas == 2) { + /* [C]: window before and after comp block */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + sparse->areas[1].offset = comp_end; + sparse->areas[1].size = bar_len - comp_end; + } else if (cxl->comp_reg_offset == 0) { + /* [B]: comp block at BAR start, window follows */ + sparse->areas[0].offset = comp_end; + sparse->areas[0].size = bar_len - comp_end; + } else { + /* [A]: comp block at BAR end, window precedes */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + } + + ret = vfio_info_add_capability(caps, &sparse->header, cap_size); + kfree(sparse); + if (ret) + return ret; + + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + return 0; +} + +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl->comp_reg_size) + return false; + + return req_start < cxl->comp_reg_offset + 
cxl->comp_reg_size && + req_start + req_len > cxl->comp_reg_offset; +} + +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct vfio_device_info_cap_cxl cxl_cap = {0}; + + if (!cxl) + return 0; + + /* + * Device is not fully initialised? + */ + if (WARN_ON(cxl->dpa_region_idx < 0 || cxl->comp_reg_region_idx < 0)) + return -ENODEV; + + /* Fill in from CXL device structure */ + cxl_cap.header.id = VFIO_DEVICE_INFO_CAP_CXL; + cxl_cap.header.version = 1; + /* + * COMP_REGS region starts at comp_reg_offset + CXL_CM_OFFSET within + * the BAR. This is the byte offset of the CXL.mem register area (where + * the CXL Capability Array Header lives) within the component register + * block. Userspace derives hdm_decoder_offset and hdm_count from the + * COMP_REGS region itself (CXL Capability Array traversal + HDMC read). + */ + cxl_cap.hdm_regs_offset = cxl->comp_reg_offset + CXL_CM_OFFSET; + cxl_cap.hdm_regs_bar_index = cxl->comp_reg_bar; + + if (cxl->precommitted) + cxl_cap.flags |= VFIO_CXL_CAP_FIRMWARE_COMMITTED; + if (cxl->cache_capable) + cxl_cap.flags |= VFIO_CXL_CAP_CACHE_CAPABLE; + + /* + * Populate absolute VFIO region indices so userspace can query them + * directly with VFIO_DEVICE_GET_REGION_INFO. 
+ */ + cxl_cap.dpa_region_index = VFIO_PCI_NUM_REGIONS + cxl->dpa_region_idx; + cxl_cap.comp_regs_region_index = + VFIO_PCI_NUM_REGIONS + cxl->comp_reg_region_idx; + + return vfio_info_add_capability(caps, &cxl_cap.header, sizeof(cxl_cap)); +} + /* * Scope-based cleanup wrappers for the CXL resource APIs */ diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9605f79f71956..9be3abd56d63b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -592,7 +592,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; - int i, bar; + int i, bar, bars; /* For needs_reset */ lockdep_assert_held(&vdev->vdev.dev_set->lock); @@ -651,8 +651,10 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) bar = i + PCI_STD_RESOURCES; if (!vdev->barmap[bar]) continue; + bars = (vdev->cxl && i == vfio_cxl_get_component_reg_bar(vdev)) ? 
+ 0 : (1 << bar); pci_iounmap(pdev, vdev->barmap[bar]); - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); vdev->barmap[bar] = NULL; } @@ -988,6 +990,13 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + if (vdev->cxl) { + ret = vfio_cxl_get_info(vdev, &caps); + if (ret) + return ret; + info.flags |= VFIO_DEVICE_FLAGS_CXL; + } + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1033,6 +1042,12 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, struct pci_dev *pdev = vdev->pdev; int i, ret; + if (vdev->cxl) { + ret = vfio_cxl_get_region_info(vdev, info, caps); + if (ret != -ENOTTY) + return ret; + } + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); @@ -1760,6 +1775,18 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma if (req_start + req_len > phys_len) return -EINVAL; + /* + * CXL devices: mmap is permitted for the GPU/accelerator register + * windows listed in the sparse-mmap capability. Block any request + * that overlaps the CXL component register block + * [comp_reg_offset, comp_reg_offset + comp_reg_size); those registers + * must be accessed exclusively through the COMP_REGS device region so + * that the emulation layer (notify_change) intercepts every write. + */ + if (vdev->cxl && index == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, req_start, req_len)) + return -EINVAL; + /* * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. 
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 1886596b479eb..f5c46bf3c00a2 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -128,6 +128,14 @@ void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev); +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len); #else @@ -149,6 +157,22 @@ vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { } static inline int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) { return 0; } +static inline int +vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline int +vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline u8 +vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ return U8_MAX; } +static inline bool +vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ return false; } #endif /* CONFIG_VFIO_CXL_CORE */ diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 25380b7dfe18a..d816d06ca8c18 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -201,19 +201,29 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int 
bar) { struct pci_dev *pdev = vdev->pdev; - int ret; + int ret, bars; void __iomem *io; if (vdev->barmap[bar]) return 0; - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + /* + * The CXL component register BAR cannot be claimed exclusively: the + * CXL subsystem holds persistent sub-range iomem claims during HDM + * decoder setup. pci_request_selected_regions() for the full BAR + * fails with EBUSY. Pass bars=0 to make the request a no-op and map + * directly via pci_iomap(). + */ + bars = (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev)) ? + 0 : (1 << bar); + + ret = pci_request_selected_regions(pdev, bars, "vfio"); if (ret) return ret; io = pci_iomap(pdev, bar, 0); if (!io) { - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); return -ENOMEM; } From 5bc0b3ea82af73c23110b5902c86b3ca9b7d6482 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:15 +0530 Subject: [PATCH 43/51] NVIDIA: VR: SAUCE: vfio/cxl: Provide opt-out for CXL feature This commit provides an opt-out mechanism to disable the CXL support from vfio module. The opt-out is provided both build time and module load time. Build time option CONFIG_VFIO_CXL_CORE is used to enable/disable CXL support in vfio-pci module. For runtime disabling the CXL support, use the module parameter disable_cxl. This is a per-device opt-out on the core device set by the driver before registration. 
Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) [jan: Resolve context mismatch in vfio_pci.c probe function due to missing upstream pci_ops assignment in NV-Kernels base] Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 4 ++++ drivers/vfio/pci/vfio_pci.c | 9 +++++++++ include/linux/vfio_pci_core.h | 1 + 3 files changed, 14 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 46430cbfa962e..3ffc3e593d043 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -479,6 +479,10 @@ void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) u16 dvsec; int ret; + /* Honor the user opt-out decision */ + if (vdev->disable_cxl) + return; + if (!pcie_is_cxl(pdev)) return; diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 77e07afe34e8a..96d1f096eafa6 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -60,6 +60,12 @@ static bool disable_denylist; module_param(disable_denylist, bool, 0444); MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) +static bool disable_cxl; +module_param(disable_cxl, bool, 0444); +MODULE_PARM_DESC(disable_cxl, "Disable CXL Type-2 extensions for all devices bound to vfio-pci. 
Variant drivers may instead set vdev->disable_cxl in their probe for per-device control without needing this parameter."); +#endif + static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) { switch (pdev->vendor) { @@ -185,6 +191,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + vdev->disable_cxl = disable_cxl; +#endif ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a8094244846da..39b2b354143af 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -88,6 +88,7 @@ struct vfio_pci_core_device { bool needs_pm_restore:1; bool pm_intx_masked:1; bool pm_runtime_engaged:1; + bool disable_cxl:1; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; From 646f12a6a8f8fc4e4d01911127b5ddae3e5ea9b9 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 1 Apr 2026 20:09:16 +0530 Subject: [PATCH 44/51] NVIDIA: VR: SAUCE: docs: vfio-pci: Document CXL Type-2 device passthrough Add Documentation/driver-api/vfio-pci-cxl.rst describing the architecture, VFIO interfaces, and operational constraints for CXL Type-2 (cache-coherent accelerator) passthrough via vfio-pci-core, and link it from the driver-api index. 
The document covers: - VFIO_DEVICE_FLAGS_CXL and VFIO_DEVICE_INFO_CAP_CXL: what the capability struct contains and what the FIRMWARE_COMMITTED and CACHE_CAPABLE flags mean - How to derive hdm_decoder_offset and hdm_count from the COMP_REGS region by traversing the CXL Capability Array to find cap ID 0x5 and reading the HDM Decoder Capability register - Topology-aware sparse mmap on the component BAR (topologies A, B, C covering comp block at end, start, or middle of the BAR) - Two extra VFIO device regions: COMP_REGS for the emulated HDM register state and the DPA memory window - DVSEC config write virtualization: what the guest sees vs. hardware - FLR coordination: DPA PTEs zapped before reset, restored after Signed-off-by: Manish Honap (backported from https://lore.kernel.org/linux-cxl/20260401143917.108413-1-mhonap@nvidia.com/) Signed-off-by: Jiandi An --- Documentation/driver-api/index.rst | 1 + Documentation/driver-api/vfio-pci-cxl.rst | 382 ++++++++++++++++++++++ 2 files changed, 383 insertions(+) create mode 100644 Documentation/driver-api/vfio-pci-cxl.rst diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 3e2a270bd8282..f0c30284befbe 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -47,6 +47,7 @@ of interest to most developers working on device drivers. vfio-mediated-device vfio vfio-pci-device-specific-driver-acceptance + vfio-pci-cxl Bus-level documentation ======================= diff --git a/Documentation/driver-api/vfio-pci-cxl.rst b/Documentation/driver-api/vfio-pci-cxl.rst new file mode 100644 index 0000000000000..1256e4d33fc67 --- /dev/null +++ b/Documentation/driver-api/vfio-pci-cxl.rst @@ -0,0 +1,382 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +======================================= +VFIO PCI CXL Type-2 device passthrough +======================================= + +Overview +-------- + +Type-2 CXL devices are PCIe accelerators (GPUs, compute ASICs, and similar) +with coherent device memory on CXL.mem. DPA is mapped into host physical +address space through HDM decoders that the kernel's CXL subsystem owns. A +guest cannot program that hardware directly. + +This ``vfio-pci`` mode hands a VMM: + +- A read/write VFIO device region (COMP_REGS) that emulates the HDM decoder + register block with CXL register rules enforced in kernel code. +- A mmapable VFIO device region (DPA) backed by the kernel-chosen host physical + range for device memory. +- DVSEC config-space emulation so the guest cannot change host-owned CXL.io / + CXL.mem enable bits. + +Build with ``CONFIG_VFIO_CXL_CORE=y``. At runtime you can turn it off with:: + + modprobe vfio-pci disable_cxl=1 + +or, in a variant driver, set ``vdev->disable_cxl = true`` before registration. + + +Device detection +---------------- + +At ``vfio_pci_core_register_device()`` the driver checks for a Type-2 style +setup. All of the following must hold: + +1. CXL Device DVSEC present (PCIe DVSEC Vendor ID ``0x1E98``, DVSEC ID + ``0x0000``). +2. ``Mem_Capable`` (bit 2) set in the CXL Capability register inside that DVSEC. +3. PCI class code is **not** ``0x050210`` (CXL Type-3 memory expander). +4. An HDM Decoder capability block reachable through the Register Locator DVSEC. +5. At least one HDM decoder committed by firmware with non-zero size. + +The CXL spec labels "Type-2" as devices with both ``Mem_Capable`` and +``Cache_Capable``. This driver also takes ``Mem_Capable``-only devices +(``Cache_Capable=0``), which behave like Type-3 style accelerators without the +usual class code. ``VFIO_CXL_CAP_CACHE_CAPABLE`` exposes the cache bit to +userspace so a VMM can treat FLR differently when needed. 
+ +When detection succeeds, ``VFIO_DEVICE_FLAGS_CXL`` is ORed into +``vfio_device_info.flags`` together with ``VFIO_DEVICE_FLAGS_PCI``. + +.. note:: + + **Firmware must commit an HDM decoder before open.** The driver only + discovers DPA range and size from a decoder that firmware already committed. + Devices without that, or hot-plugged setups that never get it, are out of + scope for now. + + Follow-up options under discussion include CXL range registers in the + Device DVSEC (often enough on single-decoder parts), CDAT over DOE, mailbox + Get Partition Info, or a future DVSEC field from the consortium for + base/size/NUMA without extra side channels. There is also talk of a sysfs + path, modeled on resizable BAR, where an orchestrator fixes the DPA window + before vfio-pci binds so the driver still sees a committed range. + + +UAPI: VFIO_DEVICE_INFO_CAP_CXL +------------------------------ + +When ``VFIO_DEVICE_FLAGS_CXL`` is set, the device info capability chain +includes a ``vfio_device_info_cap_cxl`` structure (cap ID 6, version 1):: + + struct vfio_device_info_cap_cxl { + struct vfio_info_cap_header header; /* id=6, version=1 */ + __u8 hdm_regs_bar_index; /* BAR index containing component regs */ + __u8 reserved[3]; + __u32 flags; /* VFIO_CXL_CAP_* flags */ + __u64 hdm_regs_offset; /* byte offset within the BAR to the + * CXL.mem register area start. This + * equals comp_reg_offset + CXL_CM_OFFSET + * where CXL_CM_OFFSET = 0x1000. */ + __u32 dpa_region_index; /* VFIO region index for DPA memory */ + __u32 comp_regs_region_index; /* VFIO region index for COMP_REGS */ + }; + /* + * hdm_count and hdm_decoder_offset are intentionally absent from this + * struct. Both are derivable from the COMP_REGS region. See the + * "Deriving HDM info from COMP_REGS" section below. 
+ */ + + #define VFIO_CXL_CAP_FIRMWARE_COMMITTED (1 << 0) + #define VFIO_CXL_CAP_CACHE_CAPABLE (1 << 1) + +``VFIO_CXL_CAP_FIRMWARE_COMMITTED`` + At least one HDM decoder was pre-committed by firmware. The DPA region + is live at device open; the VMM can map it without waiting for a guest + COMMIT cycle. + +``VFIO_CXL_CAP_CACHE_CAPABLE`` + The device has an HDM-DB decoder (CXL.mem + CXL.cache). This mirrors the + ``Cache_Capable`` bit from the CXL DVSEC Capability register. The kernel + does not run Write-Back Invalidation (WBI) before FLR; with this flag set + that stays the VMM's job. + +DPA region size comes from ``VFIO_DEVICE_GET_REGION_INFO`` on +``dpa_region_index``, not from this struct. + + +VFIO regions +------------ + +A CXL device adds two device regions on top of the usual BARs. Their indices +are in ``dpa_region_index`` and ``comp_regs_region_index``. + +DPA region (``VFIO_REGION_SUBTYPE_CXL``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Flags: ``READ | WRITE | MMAP``. + +The backing store is the host physical range the kernel assigned for DPA. The +kernel maps it with ``memremap(MEMREMAP_WB)`` because CXL device memory on a +coherent link sits in the CPU cache hierarchy. That mapping is normal cached +memory, so ``copy_to/from_user`` works without extra barriers. + +Page faults are lazy: PFNs are installed per page on first touch via +``vmf_insert_pfn``. ``mmap()`` does not populate the whole region up front. + +Region read/write through the fd uses the same ``MEMREMAP_WB`` mapping with +``copy_to/from_user``. ``ioread``/``iowrite`` MMIO helpers are not used on +this path. + +During FLR, ``unmap_mapping_range()`` drops user PTEs and ``region_active`` +clears before the reset runs. Ongoing faults or region I/O then error instead +of touching a dead mapping. IOMMU ATC invalidation from the zap has to finish +before the device resets; doing it the other way around can leave an SMMU +waiting on a device that no longer responds. 
+ +After reset, the region comes back once ``COMMITTED`` shows up again in fresh +HDM hardware state. The VMM can fault pages in again without a new ``mmap()``. + +COMP_REGS region (``VFIO_REGION_SUBTYPE_CXL_COMP_REGS``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Flags: ``READ | WRITE`` (no mmap). + +Emulated registers for the CXL.mem slice of the component register block: the +CXL Capability Array header at offset 0, then the HDM Decoder capability +starting at ``hdm_decoder_offset`` (the byte offset derived by traversing the +CXL Capability Array — see "Deriving HDM info from COMP_REGS" below). +Region size from ``VFIO_DEVICE_GET_REGION_INFO`` covers the full capability +array prefix plus all HDM decoder blocks. + +Only 32-bit, 32-bit-aligned accesses are allowed. 8- and 16-bit attempts get +``-EINVAL``. + +Offsets below ``hdm_decoder_offset`` return the snapshot from device open. +Writes there are dropped (with a WARN); the capability array stays read-only. + +From ``hdm_decoder_offset`` upward the kernel keeps a shadow +(``comp_reg_virt[]``) and applies field rules: + +- At open, hardware HDM state is snapshotted. For firmware-committed decoders + the LOCK bit is cleared and BASE_HI/BASE_LO are zeroed in the shadow so the + VMM can program guest GPA; the host HPA is not carried in the shadow after + that. +- ``COMMIT`` (bit 9 of CTRL): writing 1 sets ``COMMITTED`` (bit 10) in the + shadow immediately. Real hardware stays committed; the shadow tracks what + the guest should see. +- When LOCK is set, writes to BASE_HI and SIZE_HI are ignored so + firmware-committed values survive. 
+ +Region type identifiers:: + + /* type = PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE */ + #define VFIO_REGION_SUBTYPE_CXL 1 /* DPA memory region */ + #define VFIO_REGION_SUBTYPE_CXL_COMP_REGS 2 /* HDM register shadow */ + + +BAR access +---------- + +``VFIO_DEVICE_GET_REGION_INFO`` for ``hdm_regs_bar_index`` reports the full +BAR size with ``READ | WRITE | MMAP`` flags and a +``VFIO_REGION_INFO_CAP_SPARSE_MMAP`` capability listing the GPU or +accelerator register windows — the mmappable parts of the BAR that do **not** +contain CXL component registers. + +The number of sparse areas depends on where the CXL component register block +``[comp_reg_offset, comp_reg_offset + comp_reg_size)`` sits within the BAR: + +* **Topology A** - component block at BAR end: + ``[gpu_regs | comp_regs]`` → 1 area: ``[0, comp_reg_offset)`` + +* **Topology B** - component block at BAR start: + ``[comp_regs | gpu_regs]`` → 1 area: ``[comp_reg_size, bar_len)`` + +* **Topology C** - component block in middle: + ``[gpu_regs | comp_regs | gpu_regs]`` → 2 areas: + ``[0, comp_reg_offset)`` and ``[comp_reg_offset + comp_reg_size, bar_len)`` + +VMMs **must** iterate all ``nr_areas`` entries; do not assume a single area or +that the first area starts at offset zero. + +The GPU/accelerator register windows listed in the sparse capability **are** +physically mmappable: ``mmap()`` on the VFIO device fd at the corresponding +BAR offset succeeds and yields a host-physical-backed mapping suitable for +KVM stage-2 installation. + +The CXL component register block itself **is not** mmappable. Any ``mmap()`` +request whose range overlaps ``[comp_reg_offset, comp_reg_offset + +comp_reg_size)`` returns ``-EINVAL``; those registers must be accessed through +the ``COMP_REGS`` device region. + + +DVSEC configuration space emulation +----------------------------------- + +With ``CONFIG_VFIO_CXL_CORE=y``, vfio-pci installs a handler for +``PCI_EXT_CAP_ID_DVSEC`` (``0x23``) in the config access table. 
Non-CXL +devices fall through as before. + +On CXL devices, writes to these DVSEC registers are caught and reflected in +``vdev->vconfig`` (shadow config space): + ++--------------------+--------+--------------------------------------------------+ +| Register | Offset | Emulation | ++====================+========+==================================================+ +| CXL Control | +0x0c | RWL; IO_Enable held at 1; locked when Lock | +| | | bit 0 is set. | ++--------------------+--------+--------------------------------------------------+ +| CXL Status | +0x0e | Bit 14 (Viral_Status) is RW1CS. | ++--------------------+--------+--------------------------------------------------+ +| CXL Control2 | +0x10 | Bits 1 and 2 forwarded to hardware. | ++--------------------+--------+--------------------------------------------------+ +| CXL Status2 | +0x12 | Bit 3 forwarded when Capability3 bit 3 is set. | ++--------------------+--------+--------------------------------------------------+ +| CXL Lock | +0x14 | RWO; once set, Control becomes read-only until | +| | | conventional reset. | ++--------------------+--------+--------------------------------------------------+ +| Range Base Hi/Lo | varies | Stored in vconfig; Base Low [27:0] reserved bits | +| | | cleared on write. | ++--------------------+--------+--------------------------------------------------+ + +Reads return the shadow. Read-only registers (Capability, Size High/Low) are +filled from hardware at open. + + +FLR and reset +------------- + +FLR goes through ``vfio_pci_ioctl_reset()``. The CXL-specific part is: + +1. ``vfio_cxl_zap_region_locked()`` runs under the write side of + ``memory_lock``. It clears ``region_active`` and calls + ``unmap_mapping_range()`` on the DPA inode mapping so user PTEs go away. + Concurrent faults or fd I/O hit the inactive flag and error. IOMMU ATC must + drain before reset (see the DPA region notes above). + +2. 
After FLR, ``vfio_cxl_reactivate_region()`` reads HDM hardware again into + ``comp_reg_virt[]``. If ``COMMITTED`` is set (common when firmware left the + decoder committed), ``region_active`` turns back on and the VMM can refault + without remapping. + + +Known limitations +----------------- + +**Pre-committed HDM decoder required** + See `Device detection`_ and the note there. + +**CXL hot-plug not supported** + Slots need to be present and programmed by firmware at boot. + +**CXL.cache Write-Back Invalidation not implemented** + For HDM-DB devices (``VFIO_CXL_CAP_CACHE_CAPABLE``), the kernel does not + run WBI before FLR. The VMM must do it and expose Back-Invalidation in the + guest topology where required. + + +VMM integration notes +--------------------- + +For a ``VFIO_CXL_CAP_FIRMWARE_COMMITTED`` device (what works today):: + + /* 1. Get device info and locate the CXL cap */ + vfio_device_get_info(fd, &dinfo); + assert(dinfo.flags & VFIO_DEVICE_FLAGS_CXL); + cxl = find_cap(&dinfo, VFIO_DEVICE_INFO_CAP_CXL); + + /* 2. Get DPA and COMP_REGS region sizes */ + get_region_info(fd, cxl->dpa_region_index, &dpa_ri); + get_region_info(fd, cxl->comp_regs_region_index, &comp_ri); + + /* 3. Map DPA region at a guest physical address */ + gpa_base = allocate_guest_phys(dpa_ri.size); + mmap(gpa_base, dpa_ri.size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, vfio_fd, + (off_t)cxl->dpa_region_index << VFIO_PCI_OFFSET_SHIFT); + + /* 4. Derive hdm_decoder_offset from COMP_REGS (see section below) */ + uint64_t hdm_decoder_offset = derive_hdm_offset(vfio_fd, comp_ri); + + /* 5. Write guest GPA into HDM Decoder 0 BASE via COMP_REGS pwrite */ + u32 base_hi = gpa_base >> 32; + comp_off = (off_t)cxl->comp_regs_region_index << VFIO_PCI_OFFSET_SHIFT; + pwrite(vfio_fd, &base_hi, 4, + comp_off + hdm_decoder_offset + CXL_HDM_DECODER0_BASE_HIGH_OFFSET); + + /* 6. Build guest CXL topology using gpa_base and dpa_ri.size */ + build_cfmws(gpa_base, dpa_ri.size); + + /* 7. 
If CACHE_CAPABLE: issue WBI before any guest FLR */ + +Extra detail: + +- DPA size is ``dpa_ri.size`` from region info. +- ``CXL_HDM_DECODER0_BASE_HIGH_OFFSET`` lives in ``include/uapi/cxl/cxl_regs.h``. +- On the BAR, ``mmaps[0].size`` from the sparse-mmap cap on + ``hdm_regs_bar_index`` splits GPU MMIO (BAR fd) from the CXL block (COMP_REGS + region). +- If ``VFIO_CXL_CAP_CACHE_CAPABLE`` is set, the guest CXL topology should + advertise Back-Invalidation and the VMM should run WBI before FLR. + + +Deriving HDM info from COMP_REGS +--------------------------------- + +``hdm_decoder_offset`` and ``hdm_count`` are not in ``vfio_device_info_cap_cxl`` +because both are directly readable from the ``COMP_REGS`` region. + +**Finding hdm_decoder_offset:** + +Read dwords from the COMP_REGS region starting at offset 0 (the CXL Capability +Array). ``comp_off`` is the VFIO file offset for the COMP_REGS region: +``(off_t)cxl->comp_regs_region_index << VFIO_PCI_OFFSET_SHIFT``:: + + /* Dword 0: CXL Capability Array Header */ + pread(fd, &hdr, 4, comp_off + 0); + /* bits[15:0] must be 1 (CM_CAP_HDR_CAP_ID) */ + /* bits[31:24] = number of capability entries */ + num_caps = (hdr >> 24) & 0xff; /* CXL_CM_CAP_HDR_ARRAY_SIZE_MASK */ + + /* Walk entries at dword 1..num_caps */ + for (i = 1; i <= num_caps; i++) { + pread(fd, &entry, 4, comp_off + i * 4); + cap_id = entry & 0xffff; /* CXL_CM_CAP_HDR_ID_MASK */ + if (cap_id == 0x5) { /* CXL_CM_CAP_CAP_ID_HDM */ + hdm_decoder_offset = (entry >> 20) & 0xfff; /* CXL_CM_CAP_PTR_MASK */ + break; + } + } + +**Finding hdm_count:** + +Read the HDM Decoder Capability register (HDMC) at ``hdm_decoder_offset + 0``:: + + pread(fd, &hdmc, 4, comp_off + hdm_decoder_offset); + field = hdmc & 0xf; /* CXL_HDM_DECODER_COUNT_MASK bits[3:0] */ + hdm_count = field ? field * 2 : 1; /* 0→1, N→N*2 decoders */ + +All constants are in ``include/uapi/cxl/cxl_regs.h``. 
+ + +Kernel configuration +-------------------- + +``CONFIG_VFIO_CXL_CORE`` (bool) + CXL Type-2 passthrough in ``vfio-pci-core``. Needs ``CONFIG_VFIO_PCI_CORE``, + ``CONFIG_CXL_BUS``, and ``CONFIG_CXL_MEM``. + +References +---------- + +* CXL Specification 4.0, 8.1.3 - PCIe DVSEC for CXL Devices +* CXL Specification 4.0, 8.2.4.20 - CXL HDM Decoder Capability Structure +* ``include/uapi/linux/vfio.h`` - ``VFIO_DEVICE_INFO_CAP_CXL``, + ``VFIO_REGION_SUBTYPE_CXL``, ``VFIO_REGION_SUBTYPE_CXL_COMP_REGS`` +* ``include/uapi/cxl/cxl_regs.h`` - ``CXL_CM_OFFSET``, + ``CXL_CM_CAP_HDR_ARRAY_SIZE_MASK``, ``CXL_CM_CAP_HDR_ID_MASK``, + ``CXL_CM_CAP_PTR_MASK``, ``CXL_HDM_DECODER_COUNT_MASK``, + ``CXL_HDM_DECODER0_BASE_HIGH_OFFSET`` From d53532800534c7c4e45d1b11f39265bd0db53700 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Thu, 30 Apr 2026 08:44:02 +0530 Subject: [PATCH 45/51] NVIDIA: VR: SAUCE: cxl: Export the CXL reset helpers for VFIO users Export two helpers for VFIO: - pci_cxl_reset_capable() - cxl_dev_reset() The change does not alter the reset flow itself, the capability checks, or the sysfs ABI. It only lifts the helper out of the private path so later VFIO patches can call the same code. 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An --- drivers/cxl/core/pci.c | 20 ++++++++++++++++---- include/cxl/pci.h | 2 ++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 873d86341b54c..2b0e368a174d1 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1237,7 +1237,7 @@ static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) /* * CXL device reset execution */ -static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en) { static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 }; u16 cap, ctrl2, status2; @@ -1307,7 +1307,17 @@ static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) if (rc) return rc; - ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + /* + * Explicitly set or clear RST_MEM_CLR_EN rather than only + * setting it. A previous reset may have left the bit set in + * hardware; if mem_clr_en is false we must clear it so that a + * stale bit does not cause an unwanted memory-clearing reset. + */ + if (mem_clr_en) + ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + else + ctrl2 &= ~PCI_DVSEC_CXL_RST_MEM_CLR_EN; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); if (rc) @@ -1356,6 +1366,7 @@ static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) return 0; } +EXPORT_SYMBOL_NS_GPL(cxl_dev_reset, "CXL"); static int match_memdev_by_parent(struct device *dev, const void *parent) { @@ -1395,7 +1406,7 @@ static int cxl_do_reset(struct pci_dev *pdev) pci_dev_save_and_disable(pdev); cxl_pci_functions_reset_prepare(&ctx); - rc = cxl_dev_reset(pdev, dvsec); + rc = cxl_dev_reset(pdev, dvsec, true); cxl_pci_functions_reset_done(&ctx); @@ -1424,7 +1435,7 @@ static int cxl_do_reset(struct pci_dev *pdev) * devices under bus core serialization. 
*/ -static bool pci_cxl_reset_capable(struct pci_dev *pdev) { int dvsec; u16 cap; @@ -1443,6 +1454,7 @@ static bool pci_cxl_reset_capable(struct pci_dev *pdev) return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE); } +EXPORT_SYMBOL_NS_GPL(pci_cxl_reset_capable, "CXL"); static ssize_t cxl_reset_store(struct device *dev, struct device_attribute *attr, diff --git a/include/cxl/pci.h b/include/cxl/pci.h index e5c018da0e1f5..dd1136be250d0 100644 --- a/include/cxl/pci.h +++ b/include/cxl/pci.h @@ -28,5 +28,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, int cxl_await_range_active(struct cxl_dev_state *cxlds); int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, resource_size_t *bar_offset); +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +bool pci_cxl_reset_capable(struct pci_dev *pdev); int cxl_setup_regs(struct cxl_register_map *map); #endif From e5183b49c784b48ce48cd296d1ddb8803db45e4f Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Thu, 30 Apr 2026 10:40:39 +0530 Subject: [PATCH 46/51] NVIDIA: VR: SAUCE: vfio/pci: Wire CXL DPA reset handling This change adds/renames the vfio-cxl code nuggets to better suit the cxl-reset handling mechanism in later patches. - Rename the CXL DPA region helpers to prepare_reset() and finish_reset() so call sites read as a matched pair around pci_try_reset_function(). Also call prepare_reset()/finish_reset() around pci_try_reset_function() in both the PCIe BCR FLR path and the Function FLR path, matching the logic already used on the VFIO_DEVICE_RESET ioctl path. - When pci_try_reset_function() fails: finish_reset() consults the hardware COMMITTED state before re-enabling the DPA mapping, so it is safe on error and avoids leaving the DPA region wedged off after a transient reset failure.
- Add vfio_cxl_reset_capable(), a small wrapper over pci_cxl_reset_capable() Signed-off-by: Manish Honap Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 23 ++++++++++++++--------- drivers/vfio/pci/cxl/vfio_cxl_emu.c | 2 +- drivers/vfio/pci/vfio_pci_config.c | 4 ++++ drivers/vfio/pci/vfio_pci_core.c | 9 ++++----- drivers/vfio/pci/vfio_pci_priv.h | 12 ++++++++---- 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 3ffc3e593d043..d4fdbfa41df51 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -660,15 +660,20 @@ static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev, return 0; } +bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev) +{ + return vdev->cxl && pci_cxl_reset_capable(vdev->pdev); +} + /* - * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs. + * vfio_cxl_prepare_reset - Invalidate all DPA region PTEs. * * Must be called with vdev->memory_lock held for writing. Sets * region_active=false before zapping so any subsequent I/O to the region * sees the inactive state and returns an error rather than accessing * stale mappings. */ -void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) +void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) { struct vfio_device *core_vdev = &vdev->vdev; struct vfio_pci_cxl_state *cxl = vdev->cxl; @@ -686,13 +691,13 @@ void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) } /* - * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset. + * vfio_cxl_finish_reset - Re-enable DPA region after reset. * * Must be called with vdev->memory_lock held for writing. Re-reads the - * HDM decoder state from hardware (FLR cleared it) and sets region_active - * so that subsequent I/O to the region is permitted again. 
+ * HDM decoder state from hardware and sets region_active so that + * subsequent I/O to the region is permitted again. */ -void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) { struct vfio_pci_cxl_state *cxl = vdev->cxl; @@ -702,8 +707,8 @@ void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) return; /* * Re-initialise the emulated HDM comp_reg_virt[] from hardware. - * After FLR the decoder registers read as zero; mirror that in - * the emulated state so QEMU sees a clean slate. + * A reset clears decoder registers; mirror that in the emulated + * state so the guest device manager sees the post-reset hardware. */ vfio_cxl_reinit_comp_regs(cxl); @@ -842,7 +847,7 @@ int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) * Cache the vdev->region[] index before activating the region. * vfio_pci_core_register_dev_region() placed the new entry at * vdev->region[num_regions - 1] and incremented num_regions. - * vfio_cxl_zap_region_locked() uses this to avoid scanning + * vfio_cxl_prepare_reset() uses this to avoid scanning * vdev->region[] on every FLR. */ cxl->dpa_region_idx = vdev->num_regions - 1; diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 50d3718b101d7..9a96fd7604cbb 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -408,7 +408,7 @@ vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, } /* - * Called with memory_lock write side held (from vfio_cxl_reactivate_region). + * Called with memory_lock write side held (from vfio_cxl_finish_reset). * Uses the pre-established hdm_iobase, no ioremap() under the lock, * which would deadlock on PREEMPT_RT where ioremap() can sleep. 
*/ diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index a3abaae4c62d8..548e9fbee5274 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -901,7 +901,9 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); up_write(&vdev->memory_lock); } } @@ -983,7 +985,9 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9be3abd56d63b..9e8febe33d2cc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1238,7 +1238,7 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_zap_and_down_write_memory_lock(vdev); /* Zap CXL DPA region PTEs before hardware reset clears HDM state */ - vfio_cxl_zap_region_locked(vdev); + vfio_cxl_prepare_reset(vdev); /* * This function can be invoked while the power state is non-D0. If @@ -1254,11 +1254,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, ret = pci_try_reset_function(vdev->pdev); /* - * Re-enable DPA region if reset succeeded; fault handler will - * re-insert PFNs on next access without requiring a new mmap. + * finish_reset checks the COMMITTED bit from hardware + * and only brings the region back if it is actually set. 
*/ - if (!ret) - vfio_cxl_reactivate_region(vdev); + vfio_cxl_finish_reset(vdev); up_write(&vdev->memory_lock); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index f5c46bf3c00a2..625d42c4ea336 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -122,8 +122,9 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); -void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev); -void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev); +bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev); +void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev); +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev); void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); @@ -143,10 +144,13 @@ static inline void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { } static inline void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } +static inline bool +vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev) +{ return false; } static inline void -vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { } +vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) { } static inline void -vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { } +vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) { } static inline void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } static inline int From 8c92d195d1d0f8bf7227cf550f3776bb4ebc15e8 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 29 Apr 2026 22:48:49 +0530 Subject: [PATCH 47/51] NVIDIA: VR: SAUCE: vfio/cxl: Ensure PCI Memory Space is enabled before post-reset BAR access A 
reset caller may disable Memory Space to quiesce device DMA before issuing the reset. pci_try_reset_function() saves and restores PCI_COMMAND around the FLR. If the memory space was disabled before FLR, it will be restored in a disabled state. vfio_cxl_finish_reset() reads HDM decoder registers through the component register BAR immediately after reset. Accessing a BAR with Memory Space disabled produces an Unsupported Request completion; on platforms that promote UR to a fatal error this triggers DPC. Add vfio_cxl_enable_memory_space() and call it at the start of vfio_cxl_finish_reset() before touching any BAR. Signed-off-by: Manish Honap Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index d4fdbfa41df51..2a0183767610a 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -690,6 +690,27 @@ void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) cxl->region_size, true); } +/* + * vfio_cxl_enable_memory_space - ensure PCI Memory Space is on before BAR reads. + * + * A reset caller may disable Memory Space to quiesce device DMA before + * issuing the reset. If a guest wrote PCI_COMMAND with Memory Space cleared + * before the FLR, pci_dev_save_and_disable() will capture it disabled and + * restore it that way. This can result in Memory Space remaining disabled on + * return. Accessing a BAR with Memory Space disabled produces an Unsupported + * Request completion; on platforms that promote UR to a fatal error this fires + * DPC.
+ */ +static void vfio_cxl_enable_memory_space(struct vfio_pci_core_device *vdev) +{ + u16 cmd; + + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(vdev->pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); +} + /* * vfio_cxl_finish_reset - Re-enable DPA region after reset. * @@ -705,6 +726,9 @@ void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) if (!cxl) return; + + vfio_cxl_enable_memory_space(vdev); + /* * Re-initialise the emulated HDM comp_reg_virt[] from hardware. * A reset clears decoder registers; mirror that in the emulated From 15ef3e96a36c1e44e7028e579c04e653fd250951 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Wed, 29 Apr 2026 22:50:38 +0530 Subject: [PATCH 48/51] NVIDIA: VR: SAUCE: vfio/cxl: preserve HDM decoder base addresses across reset reinit_comp_regs() mirrors post-reset hardware state (all-zeros) into comp_reg_virt[], including HDM decoder BASE registers. For decoders that the device manager committed with a guest-physical address before the reset, pci_dev_restore() re-commits the hardware decoders with the host-physical base. The kernel provides no notification that BASE was cleared during reinit, so the emulated GPA bases are silently lost. Add vfio_cxl_reinit_hdm_shadow() which snapshots the GPA decoder bases before calling reinit_comp_regs() and restores them after, keeping the emulated decoder consistent with what the device manager set. 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_core.c | 47 +++++++++++++++++++++++++--- drivers/vfio/pci/cxl/vfio_cxl_emu.c | 38 ++++++++++++++++------ 2 files changed, 72 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c index 2a0183767610a..d9b349225df92 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_core.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -711,6 +711,46 @@ static void vfio_cxl_enable_memory_space(struct vfio_pci_core_device *vdev) cmd | PCI_COMMAND_MEMORY); } +/* + * vfio_cxl_reinit_hdm_shadow - reinitialise comp_reg_virt, preserving GPA bases. + * + * reinit_comp_regs() mirrors post-reset hardware state (all-zeros) into + * comp_reg_virt[], including the HDM decoder BASE registers. For decoders + * that the device manager committed with a guest-physical address before the + * reset, pci_dev_restore() will re-commit the hardware decoders with the + * host-physical base. The kernel provides no notification that BASE was + * cleared during reinit. Snapshot the GPA bases before reinit and restore + * them after so the emulated decoder remains consistent with what the device + * manager set. + * + * Called with memory_lock write side held (from vfio_cxl_finish_reset). 
+ */ +static void vfio_cxl_reinit_hdm_shadow(struct vfio_pci_cxl_state *cxl) +{ + __le32 saved_lo[16] = {}, saved_hi[16] = {}; + u8 n, count = min_t(u8, cxl->hdm_count, ARRAY_SIZE(saved_lo)); + + if (cxl->comp_reg_virt) { + for (n = 0; n < count; n++) { + saved_lo[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)); + saved_hi[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)); + } + } + + vfio_cxl_reinit_comp_regs(cxl); + + if (cxl->comp_reg_virt) { + for (n = 0; n < count; n++) { + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)) = saved_lo[n]; + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)) = saved_hi[n]; + } + } +} + /* * vfio_cxl_finish_reset - Re-enable DPA region after reset. * @@ -730,11 +770,10 @@ void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) vfio_cxl_enable_memory_space(vdev); /* - * Re-initialise the emulated HDM comp_reg_virt[] from hardware. - * A reset clears decoder registers; mirror that in the emulated - * state so the guest device manager sees the post-reset hardware. + * Re-initialise the emulated HDM comp_reg_virt[] from hardware, + * preserving the GPA decoder bases set by the device manager. 
*/ - vfio_cxl_reinit_comp_regs(cxl); + vfio_cxl_reinit_hdm_shadow(cxl); /* * Only re-enable the DPA mmap if the hardware has actually diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c index 9a96fd7604cbb..250407e8bf701 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -251,6 +251,7 @@ static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, { struct vfio_pci_cxl_state *cxl = vdev->cxl; loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + ssize_t ret = 0; size_t done = 0; if (!count) @@ -262,14 +263,26 @@ static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, count = min(count, (size_t)(cxl->hdm_reg_offset + cxl->hdm_reg_size - pos)); + /* + * Serialise against vfio_cxl_reinit_hdm_shadow(), which holds + * memory_lock write-side while it saves, zeroes, and restores + * comp_reg_virt[] during reset. Without this read lock a concurrent + * COMP_REGS write can land between the save snapshot and the restore, + * causing the restore to silently overwrite it. A concurrent read + * can observe the array mid-rebuild. + */ + down_read(&vdev->memory_lock); + while (done < count) { u32 sz = count - done; u32 off = pos + done; __le32 v; /* Enforce exactly 4-byte, 4-byte-aligned accesses */ - if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) - return done ? (ssize_t)done : -EINVAL; + if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) { + ret = done ? (ssize_t)done : -EINVAL; + goto out_unlock; + } if (iswrite) { if (off < cxl->hdm_reg_offset) { @@ -277,22 +290,29 @@ static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, done += sizeof(v); continue; } - if (copy_from_user(&v, buf + done, sizeof(v))) - return done ? (ssize_t)done : -EFAULT; + if (copy_from_user(&v, buf + done, sizeof(v))) { + ret = done ? 
(ssize_t)done : -EFAULT; + goto out_unlock; + } comp_regs_dispatch_write(vdev, off - cxl->hdm_reg_offset, &v, sizeof(v)); } else { - /* Read from extended buffer _ covers cap array and HDM */ + /* Read from extended buffer - covers cap array and HDM */ v = cxl->comp_reg_virt[off / sizeof(__le32)]; - if (copy_to_user(buf + done, &v, sizeof(v))) - return done ? (ssize_t)done : -EFAULT; + if (copy_to_user(buf + done, &v, sizeof(v))) { + ret = done ? (ssize_t)done : -EFAULT; + goto out_unlock; + } } done += sizeof(v); } + ret = done; *ppos += done; - return done; +out_unlock: + up_read(&vdev->memory_lock); + return ret; } static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev, @@ -408,7 +428,7 @@ vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, } /* - * Called with memory_lock write side held (from vfio_cxl_finish_reset). + * Called with memory_lock write side held (from vfio_cxl_reinit_hdm_shadow). * Uses the pre-established hdm_iobase, no ioremap() under the lock, * which would deadlock on PREEMPT_RT where ioremap() can sleep. */ From 37fca8563f93d8ddd8cb62c9f3acb6c28844d688 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Thu, 30 Apr 2026 00:28:10 +0530 Subject: [PATCH 49/51] NVIDIA: VR: SAUCE: vfio/cxl: virtualize DVSEC STATUS2 register in vconfig shadow STATUS2 was read directly from hardware while all other DVSEC registers were served from the vconfig shadow. This created two problems: 1. VOLATILE_HDM_PRES_ERROR (RW1CS, bit 3): guest writes cleared the hardware bit but the shadow was not updated, so subsequent reads still returned the set bit from hardware (which the hardware had cleared). 2. CXL_RESET_COMPLETE and CXL_RESET_ERROR (bits 1-2): these outcome bits will be written by vfio_cxl_reset() into the shadow after a protocol reset. Hardware does not update them on its own; serving reads from hardware would hide the outcome from the guest. 
Add STATUS2 to the read switch so reads come from the shadow, and update cxl_dvsec_status2_write() to mirror VOLATILE_HDM_PRES_ERROR clears into the shadow after forwarding to hardware. Signed-off-by: Manish Honap Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_config.c | 81 ++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c b/drivers/vfio/pci/cxl/vfio_cxl_config.c index dee521118dd48..679b10329e82f 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_config.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -10,6 +10,7 @@ #include #include +#include #include "../vfio_pci_priv.h" #include "vfio_cxl_priv.h" @@ -126,12 +127,24 @@ static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, u16 dvsec = _cxlds_get_dvsec(vdev->cxl); u16 abs_off = dvsec + CXL_DVSEC_STATUS2_OFFSET; - /* RW1CS: write 1 to clear, but only if the capability is supported */ + /* + * VOLATILE_HDM_PRES_ERROR (bit 3) is RW1CS. Forward to hardware, + * then mirror the clear into vconfig. Reads come from the shadow + * now, so skipping the update leaves the bit stuck from the guest's + * view. + * + * All other STATUS2 bits are RO hardware outputs; ignore guest writes. 
+ */ if ((cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY) && - (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) + (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) { + u16 v; + pci_write_config_word(vdev->pdev, abs_off, CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR); - /* STATUS2 is not mirrored in vconfig - reads go to hardware */ + v = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + v &= ~CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, v); + } } static void cxl_dvsec_lock_write(struct vfio_pci_core_device *vdev, @@ -154,6 +167,30 @@ static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, dvsec_virt_write32(vdev, dvsec_off, new_val); } +/* + * status2_hw_shadow_merge - read STATUS2, merging hardware and vconfig shadow. + * + * RESET_COMPLETE and RESET_ERROR are written into vconfig by vfio_cxl_reset() + * after a protocol reset; pci_dev_restore() clears them from hardware, so they + * must survive in the shadow for a polling guest to see the reset outcome. + * + * All other STATUS2 bits are live hardware outputs and must come from hardware. + * In particular, CACHE_INVALID (bit 0) is polled by guests during a standalone + * write-back invalidation. + * + * @abs_pos: absolute PCI config space byte offset of the STATUS2 register. + */ +static u16 status2_hw_shadow_merge(struct vfio_pci_core_device *vdev, int abs_pos) +{ + const u16 shadow_mask = CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + u16 hw = 0, virt; + + pci_read_config_word(vdev->pdev, abs_pos, &hw); + virt = get_unaligned_le16(vdev->vconfig + abs_pos); + return (hw & ~shadow_mask) | (virt & shadow_mask); +} + /** * vfio_cxl_dvsec_readfn - Per-device DVSEC read handler for CXL capable devices. * @vdev: VFIO PCI core device @@ -168,6 +205,10 @@ static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, * LOCK) so that userspace reads reflect emulated state rather than raw * hardware. 
All other DVSEC bytes pass through to vfio_raw_config_read(). * + * A 4-byte (DWORD) access at the CONTROL2 offset spans both CONTROL2 and + * STATUS2 since CONTROL2 is DWORD-aligned and the two registers are adjacent. + * In that case STATUS2 is returned via the hardware-merge path. + * * Return: @count on success, or negative error code from the fallback read. */ static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, @@ -188,11 +229,32 @@ static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, switch (dvsec_off) { case CXL_DVSEC_CONTROL_OFFSET: case CXL_DVSEC_STATUS_OFFSET: - case CXL_DVSEC_CONTROL2_OFFSET: case CXL_DVSEC_LOCK_OFFSET: - /* Return shadow vconfig value for virtualized registers */ + /* Fully virtualised; return shadow. */ memcpy(val, vdev->vconfig + pos, count); return count; + case CXL_DVSEC_CONTROL2_OFFSET: + if (count == 4) { + /* + * A 4-byte access at the DWORD-aligned CONTROL2 offset + * spans both CONTROL2 (low 16 bits) and STATUS2 (high 16 + * bits). Return CONTROL2 from vconfig and STATUS2 via the + * hardware-merge path so that CACHE_INVALID is fresh. + */ + __le32 combined = cpu_to_le32( + (u32)get_unaligned_le16(vdev->vconfig + pos) | + ((u32)status2_hw_shadow_merge(vdev, + dvsec + CXL_DVSEC_STATUS2_OFFSET) << 16)); + memcpy(val, &combined, 4); + } else { + memcpy(val, vdev->vconfig + pos, count); + } + return count; + case CXL_DVSEC_STATUS2_OFFSET: { + __le16 result = cpu_to_le16(status2_hw_shadow_merge(vdev, pos)); + memcpy(val, &result, count); + return count; + } default: return vfio_raw_config_read(vdev, pos, count, perm, offset, val); @@ -253,6 +315,15 @@ static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, case CXL_DVSEC_CONTROL2_OFFSET: wval16 = (u16)le32_to_cpu(val); cxl_dvsec_control2_write(vdev, wval16); + if (count == 4) { + /* + * High half of a 32-bit write at CONTROL2 is STATUS2. + * Forward to the STATUS2 handler so RW1CS bits (e.g. 
+ * VOLATILE_HDM_PRES_ERROR) are not silently dropped. + */ + wval16 = (u16)(le32_to_cpu(val) >> 16); + cxl_dvsec_status2_write(vdev, wval16); + } break; case CXL_DVSEC_STATUS2_OFFSET: wval16 = (u16)le32_to_cpu(val); From e8c83314f4beb47e5c58551c273bfbdacafa1456 Mon Sep 17 00:00:00 2001 From: Manish Honap Date: Thu, 30 Apr 2026 11:56:01 +0530 Subject: [PATCH 50/51] NVIDIA: VR: SAUCE: vfio/cxl: Implement vfio_cxl_reset() Add vfio_cxl_reset() to drive a CXL protocol reset on behalf of a guest. Unlike cxl_do_reset(), this path skips host memory offlining since the DPA region is guest memory. The function takes memory_lock for the full sequence, calls vfio_cxl_prepare_reset() to zap DPA region PTEs, drives the hardware via pci_dev_save_and_disable() + cxl_dev_reset() + pci_dev_restore(), then calls vfio_cxl_finish_reset() to reinitialise emulated state. STATUS2 outcome bits (CXL_RESET_COMPLETE / CXL_RESET_ERROR) are written back to vconfig after the reset so the guest can poll for result without reading hardware. pci_dev_restore() overwrites the saved pre-reset state, so the hardware value is re-read after restore before the outcome is stamped. When the guest writes INIT_CXL_RST into DVSEC CONTROL2, invoke vfio_cxl_reset() to perform a CXL protocol reset. The bit is not forwarded to hardware; cxl_dev_reset() drives the reset sequence directly. Silently drop writes on devices that do not advertise RST_CAPABLE to avoid log noise for the reserved-bit case. 
Signed-off-by: Manish Honap Signed-off-by: Jiandi An --- drivers/vfio/pci/cxl/vfio_cxl_config.c | 101 +++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c b/drivers/vfio/pci/cxl/vfio_cxl_config.c index 679b10329e82f..5d13b5d5bc5b8 100644 --- a/drivers/vfio/pci/cxl/vfio_cxl_config.c +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -87,6 +87,80 @@ static void cxl_dvsec_status_write(struct vfio_pci_core_device *vdev, dvsec_virt_write16(vdev, CXL_DVSEC_STATUS_OFFSET, new_val); } +/** + * vfio_cxl_reset - Service a guest CXL protocol reset. + * @vdev: VFIO PCI core device + * + * Unlike cxl_do_reset(), no host memory offlining is performed: the DPA + * region is guest memory, not host RAM. + * + * memory_lock is held for the entire sequence so neither BAR nor DPA + * mappings can fault back in. INIT_CXL_RST is not forwarded to hardware; + * cxl_dev_reset() drives the state machine directly. + * + * STATUS2 outcome bits are written back to vconfig on return so that the + * guest can poll for completion without going to hardware. + * + * Return: 0 on success, negative error code on failure. + */ +static int vfio_cxl_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(cxl); + u16 hw_status2 = 0; + int ret; + + vfio_pci_zap_and_down_write_memory_lock(vdev); + + /* + * CXL r4.0 Table 8-9: device must clear CXL_Reset_Complete before + * starting the reset flow, on the 0->1 transition of Initiate_CXL_Reset. + * Clear both reset outcome bits so a polling guest sees an unambiguous + * in-progress state rather than a stale result from a prior attempt. 
+ */ + { + u16 s = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + + s &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, s); + } + + vfio_cxl_prepare_reset(vdev); + + pci_dev_lock(pdev); + + pci_dev_save_and_disable(pdev); + ret = cxl_dev_reset(pdev, cxl->cxlds.cxl_dvsec, + !!(dvsec_virt_read16(vdev, CXL_DVSEC_CONTROL2_OFFSET) & + CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE)); + pci_dev_restore(pdev); + + pci_dev_unlock(pdev); + + vfio_cxl_finish_reset(vdev); + + /* + * Re-read STATUS2 from hardware after restore. pci_dev_restore() + * writes back the pre-reset saved state, which has both outcome bits + * cleared. Re-reading also picks up genuine hardware changes (e.g. + * VOLATILE_HDM_PRES_ERROR clearing) before stamping in the outcome. + */ + pci_read_config_word(pdev, dvsec + CXL_DVSEC_STATUS2_OFFSET, + &hw_status2); + hw_status2 &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + if (ret) + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + else + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, hw_status2); + + up_write(&vdev->memory_lock); + return ret; +} + static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, u16 new_val) { @@ -110,14 +184,31 @@ static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI); /* - * CXL Reset: not yet supported - do not forward to HW. - * TODO: invoke CXL protocol reset via cxl subsystem + * Commit the new CONTROL2 value to the shadow before triggering a + * reset. vfio_cxl_reset() reads Mem_Clr_Enable (bit 3) from the + * shadow; if the shadow is written after the reset call, a guest write + * that changes bit 3 in the same access as INITIATE_CXL_RESET would + * reset with the stale bit 3 value instead of the one just written. 
*/ - if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) - pci_warn(pdev, "vfio-cxl: CXL reset requested but not yet supported\n"); - dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL2_OFFSET, new_val & ~CXL_CTRL2_HW_BITS_MASK); + + /* + * INIT_CXL_RST: not forwarded to hardware. cxl_dev_reset() drives + * the state machine; forwarding it after the reset would fire a + * second one. Drop writes on non-RST_CAPABLE devices silently; the + * spec reserves the bit there and logging every write is just noise. + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) { + if (vfio_cxl_reset_capable(vdev)) { + int rc = vfio_cxl_reset(vdev); + + if (rc) + pci_warn(pdev, + "vfio-cxl: CXL reset failed (%d)\n", + rc); + } + } } static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, From aef7e33d0e74584d2efb7634860609bd651a3439 Mon Sep 17 00:00:00 2001 From: Jiandi An Date: Tue, 5 May 2026 16:20:03 -0500 Subject: [PATCH 51/51] NVIDIA: VR: SAUCE: config: Enable CONFIG_VFIO_CXL_CORE for CXL Type-2 passthrough Enable VFIO CXL core support on amd64 and arm64 to allow CXL Type-2 device passthrough via vfio-pci. Signed-off-by: Jiandi An --- debian.nvidia-6.17/config/annotations | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index bde6de3efe4de..da33e0be1fd0a 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -255,6 +255,8 @@ CONFIG_UBUNTU_ODM_DRIVERS note<'Disable all Ubuntu ODM dri CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> CONFIG_ULTRASOC_SMB note<'Required for Grace enablement'> +CONFIG_VFIO_CXL_CORE policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_VFIO_CXL_CORE note<'Enable VFIO CXL core for CXL Type-2 device passthrough support'> # ---- Annotations without notes ----