diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 3e2a270bd8282..f0c30284befbe 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -47,6 +47,7 @@ of interest to most developers working on device drivers. vfio-mediated-device vfio vfio-pci-device-specific-driver-acceptance + vfio-pci-cxl Bus-level documentation ======================= diff --git a/Documentation/driver-api/vfio-pci-cxl.rst b/Documentation/driver-api/vfio-pci-cxl.rst new file mode 100644 index 0000000000000..1256e4d33fc67 --- /dev/null +++ b/Documentation/driver-api/vfio-pci-cxl.rst @@ -0,0 +1,382 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +VFIO PCI CXL Type-2 device passthrough +======================================= + +Overview +-------- + +Type-2 CXL devices are PCIe accelerators (GPUs, compute ASICs, and similar) +with coherent device memory on CXL.mem. DPA is mapped into host physical +address space through HDM decoders that the kernel's CXL subsystem owns. A +guest cannot program that hardware directly. + +This ``vfio-pci`` mode hands a VMM: + +- A read/write VFIO device region (COMP_REGS) that emulates the HDM decoder + register block with CXL register rules enforced in kernel code. +- A mmapable VFIO device region (DPA) backed by the kernel-chosen host physical + range for device memory. +- DVSEC config-space emulation so the guest cannot change host-owned CXL.io / + CXL.mem enable bits. + +Build with ``CONFIG_VFIO_CXL_CORE=y``. At runtime you can turn it off with:: + + modprobe vfio-pci disable_cxl=1 + +or, in a variant driver, set ``vdev->disable_cxl = true`` before registration. + + +Device detection +---------------- + +At ``vfio_pci_core_register_device()`` the driver checks for a Type-2 style +setup. All of the following must hold: + +1. CXL Device DVSEC present (PCIe DVSEC Vendor ID ``0x1E98``, DVSEC ID + ``0x0000``). +2. ``Mem_Capable`` (bit 2) set in the CXL Capability register inside that DVSEC. +3. PCI class code is **not** ``0x050210`` (CXL Type-3 memory expander). +4. An HDM Decoder capability block reachable through the Register Locator DVSEC. +5. At least one HDM decoder committed by firmware with non-zero size. + +The CXL spec labels "Type-2" as devices with both ``Mem_Capable`` and +``Cache_Capable``. This driver also takes ``Mem_Capable``-only devices +(``Cache_Capable=0``), which behave like Type-3 style accelerators without the +usual class code. ``VFIO_CXL_CAP_CACHE_CAPABLE`` exposes the cache bit to +userspace so a VMM can treat FLR differently when needed. + +When detection succeeds, ``VFIO_DEVICE_FLAGS_CXL`` is ORed into +``vfio_device_info.flags`` together with ``VFIO_DEVICE_FLAGS_PCI``. + +.. note:: + + **Firmware must commit an HDM decoder before open.** The driver only + discovers DPA range and size from a decoder that firmware already committed. + Devices without that, or hot-plugged setups that never get it, are out of + scope for now. + + Follow-up options under discussion include CXL range registers in the + Device DVSEC (often enough on single-decoder parts), CDAT over DOE, mailbox + Get Partition Info, or a future DVSEC field from the consortium for + base/size/NUMA without extra side channels. There is also talk of a sysfs + path, modeled on resizable BAR, where an orchestrator fixes the DPA window + before vfio-pci binds so the driver still sees a committed range. 
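+
+A VMM can check for this mode with a plain ``VFIO_DEVICE_GET_INFO`` call
+before touching any CXL-specific UAPI. A minimal sketch (error handling
+omitted)::
+
+    struct vfio_device_info info = { .argsz = sizeof(info) };
+
+    ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info);
+    if (info.flags & VFIO_DEVICE_FLAGS_CXL) {
+        /* CXL Type-2 passthrough is active; walk the capability
+         * chain for VFIO_DEVICE_INFO_CAP_CXL (next section). */
+    }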
+ + +UAPI: VFIO_DEVICE_INFO_CAP_CXL +------------------------------ + +When ``VFIO_DEVICE_FLAGS_CXL`` is set, the device info capability chain +includes a ``vfio_device_info_cap_cxl`` structure (cap ID 6, version 1):: + + struct vfio_device_info_cap_cxl { + struct vfio_info_cap_header header; /* id=6, version=1 */ + __u8 hdm_regs_bar_index; /* BAR index containing component regs */ + __u8 reserved[3]; + __u32 flags; /* VFIO_CXL_CAP_* flags */ + __u64 hdm_regs_offset; /* byte offset within the BAR to the + * CXL.mem register area start. This + * equals comp_reg_offset + CXL_CM_OFFSET + * where CXL_CM_OFFSET = 0x1000. */ + __u32 dpa_region_index; /* VFIO region index for DPA memory */ + __u32 comp_regs_region_index; /* VFIO region index for COMP_REGS */ + }; + /* + * hdm_count and hdm_decoder_offset are intentionally absent from this + * struct. Both are derivable from the COMP_REGS region. See the + * "Deriving HDM info from COMP_REGS" section below. + */ + + #define VFIO_CXL_CAP_FIRMWARE_COMMITTED (1 << 0) + #define VFIO_CXL_CAP_CACHE_CAPABLE (1 << 1) + +``VFIO_CXL_CAP_FIRMWARE_COMMITTED`` + At least one HDM decoder was pre-committed by firmware. The DPA region + is live at device open; the VMM can map it without waiting for a guest + COMMIT cycle. + +``VFIO_CXL_CAP_CACHE_CAPABLE`` + The device has an HDM-DB decoder (CXL.mem + CXL.cache). This mirrors the + ``Cache_Capable`` bit from the CXL DVSEC Capability register. The kernel + does not run Write-Back Invalidation (WBI) before FLR; with this flag set + that stays the VMM's job. + +DPA region size comes from ``VFIO_DEVICE_GET_REGION_INFO`` on +``dpa_region_index``, not from this struct. + + +VFIO regions +------------ + +A CXL device adds two device regions on top of the usual BARs. Their indices +are in ``dpa_region_index`` and ``comp_regs_region_index``. + +DPA region (``VFIO_REGION_SUBTYPE_CXL``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Flags: ``READ | WRITE | MMAP``. + +The backing store is the host physical range the kernel assigned for DPA. The +kernel maps it with ``memremap(MEMREMAP_WB)`` because CXL device memory on a +coherent link sits in the CPU cache hierarchy. That mapping is normal cached +memory, so ``copy_to/from_user`` works without extra barriers. + +Page faults are lazy: PFNs are installed per page on first touch via +``vmf_insert_pfn``. ``mmap()`` does not populate the whole region up front. + +Region read/write through the fd uses the same ``MEMREMAP_WB`` mapping with +``copy_to/from_user``. ``ioread``/``iowrite`` MMIO helpers are not used on +this path. + +During FLR, ``unmap_mapping_range()`` drops user PTEs and ``region_active`` +clears before the reset runs. Ongoing faults or region I/O then error instead +of touching a dead mapping. IOMMU ATC invalidation from the zap has to finish +before the device resets; doing it the other way around can leave an SMMU +waiting on a device that no longer responds. + +After reset, the region comes back once ``COMMITTED`` shows up again in fresh +HDM hardware state. The VMM can fault pages in again without a new ``mmap()``. + +COMP_REGS region (``VFIO_REGION_SUBTYPE_CXL_COMP_REGS``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Flags: ``READ | WRITE`` (no mmap). 
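+
+Accesses go through the device fd at the region's VFIO offset, computed the
+same way as in the other examples in this document. A sketch of a
+well-formed access, assuming ``cxl`` is the capability struct from the
+device info chain::
+
+    off_t comp_off = (off_t)cxl->comp_regs_region_index << VFIO_PCI_OFFSET_SHIFT;
+    uint32_t hdr;
+
+    /* 4-byte, 4-byte-aligned: allowed. See the access rules below. */
+    pread(vfio_fd, &hdr, 4, comp_off + 0);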
+ +Emulated registers for the CXL.mem slice of the component register block: the +CXL Capability Array header at offset 0, then the HDM Decoder capability +starting at ``hdm_decoder_offset`` (the byte offset derived by traversing the +CXL Capability Array — see "Deriving HDM info from COMP_REGS" below). +Region size from ``VFIO_DEVICE_GET_REGION_INFO`` covers the full capability +array prefix plus all HDM decoder blocks. + +Only 32-bit, 32-bit-aligned accesses are allowed. 8- and 16-bit attempts get +``-EINVAL``. + +Offsets below ``hdm_decoder_offset`` return the snapshot from device open. +Writes there are dropped (with a WARN); the capability array stays read-only. + +From ``hdm_decoder_offset`` upward the kernel keeps a shadow +(``comp_reg_virt[]``) and applies field rules: + +- At open, hardware HDM state is snapshotted. For firmware-committed decoders + the LOCK bit is cleared and BASE_HI/BASE_LO are zeroed in the shadow so the + VMM can program guest GPA; the host HPA is not carried in the shadow after + that. +- ``COMMIT`` (bit 9 of CTRL): writing 1 sets ``COMMITTED`` (bit 10) in the + shadow immediately. Real hardware stays committed; the shadow tracks what + the guest should see. +- When LOCK is set, writes to BASE_HI and SIZE_HI are ignored so + firmware-committed values survive. + +Region type identifiers:: + + /* type = PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE */ + #define VFIO_REGION_SUBTYPE_CXL 1 /* DPA memory region */ + #define VFIO_REGION_SUBTYPE_CXL_COMP_REGS 2 /* HDM register shadow */ + + +BAR access +---------- + +``VFIO_DEVICE_GET_REGION_INFO`` for ``hdm_regs_bar_index`` reports the full +BAR size with ``READ | WRITE | MMAP`` flags and a +``VFIO_REGION_INFO_CAP_SPARSE_MMAP`` capability listing the GPU or +accelerator register windows — the mmappable parts of the BAR that do **not** +contain CXL component registers. + +The number of sparse areas depends on where the CXL component register block +``[comp_reg_offset, comp_reg_offset + comp_reg_size)`` sits within the BAR: + +* **Topology A** - component block at BAR end: + ``[gpu_regs | comp_regs]`` → 1 area: ``[0, comp_reg_offset)`` + +* **Topology B** - component block at BAR start: + ``[comp_regs | gpu_regs]`` → 1 area: ``[comp_reg_size, bar_len)`` + +* **Topology C** - component block in middle: + ``[gpu_regs | comp_regs | gpu_regs]`` → 2 areas: + ``[0, comp_reg_offset)`` and ``[comp_reg_offset + comp_reg_size, bar_len)`` + +VMMs **must** iterate all ``nr_areas`` entries; do not assume a single area or +that the first area starts at offset zero. + +The GPU/accelerator register windows listed in the sparse capability **are** +physically mmappable: ``mmap()`` on the VFIO device fd at the corresponding +BAR offset succeeds and yields a host-physical-backed mapping suitable for +KVM stage-2 installation. + +The CXL component register block itself **is not** mmappable. Any ``mmap()`` +request whose range overlaps ``[comp_reg_offset, comp_reg_offset + +comp_reg_size)`` returns ``-EINVAL``; those registers must be accessed through +the ``COMP_REGS`` device region. + + +DVSEC configuration space emulation +----------------------------------- + +With ``CONFIG_VFIO_CXL_CORE=y``, vfio-pci installs a handler for +``PCI_EXT_CAP_ID_DVSEC`` (``0x23``) in the config access table. Non-CXL +devices fall through as before. 
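+
+For reference, userspace can locate that DVSEC itself by walking the
+extended capability list through the VFIO config region. A sketch using the
+IDs from `Device detection`_ (error handling omitted)::
+
+    off_t cfg = (off_t)VFIO_PCI_CONFIG_REGION_INDEX << VFIO_PCI_OFFSET_SHIFT;
+    uint32_t hdr, hdr1, hdr2;
+    uint16_t pos = 0x100;                       /* first extended cap */
+
+    do {
+        pread(vfio_fd, &hdr, 4, cfg + pos);
+        if ((hdr & 0xffff) == 0x23) {           /* PCI_EXT_CAP_ID_DVSEC */
+            pread(vfio_fd, &hdr1, 4, cfg + pos + 4);
+            pread(vfio_fd, &hdr2, 4, cfg + pos + 8);
+            if ((hdr1 & 0xffff) == 0x1e98 &&    /* DVSEC vendor: CXL */
+                (hdr2 & 0xffff) == 0x0000)      /* CXL Device DVSEC */
+                break;                          /* pos = DVSEC base */
+        }
+        pos = (hdr >> 20) & 0xffc;              /* next cap offset */
+    } while (pos);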
+ +On CXL devices, writes to these DVSEC registers are caught and reflected in +``vdev->vconfig`` (shadow config space): + ++--------------------+--------+--------------------------------------------------+ +| Register | Offset | Emulation | ++====================+========+==================================================+ +| CXL Control | +0x0c | RWL; IO_Enable held at 1; locked when Lock | +| | | bit 0 is set. | ++--------------------+--------+--------------------------------------------------+ +| CXL Status | +0x0e | Bit 14 (Viral_Status) is RW1CS. | ++--------------------+--------+--------------------------------------------------+ +| CXL Control2 | +0x10 | Bits 1 and 2 forwarded to hardware. | ++--------------------+--------+--------------------------------------------------+ +| CXL Status2 | +0x12 | Bit 3 forwarded when Capability3 bit 3 is set. | ++--------------------+--------+--------------------------------------------------+ +| CXL Lock | +0x14 | RWO; once set, Control becomes read-only until | +| | | conventional reset. | ++--------------------+--------+--------------------------------------------------+ +| Range Base Hi/Lo | varies | Stored in vconfig; Base Low [27:0] reserved bits | +| | | cleared on write. | ++--------------------+--------+--------------------------------------------------+ + +Reads return the shadow. Read-only registers (Capability, Size High/Low) are +filled from hardware at open. + + +FLR and reset +------------- + +FLR goes through ``vfio_pci_ioctl_reset()``. The CXL-specific part is: + +1. ``vfio_cxl_zap_region_locked()`` runs under the write side of + ``memory_lock``. It clears ``region_active`` and calls + ``unmap_mapping_range()`` on the DPA inode mapping so user PTEs go away. + Concurrent faults or fd I/O hit the inactive flag and error. IOMMU ATC must + drain before reset (see the DPA region notes above). + +2. After FLR, ``vfio_cxl_reactivate_region()`` reads HDM hardware again into + ``comp_reg_virt[]``. If ``COMMITTED`` is set (common when firmware left the + decoder committed), ``region_active`` turns back on and the VMM can refault + without remapping. + + +Known limitations +----------------- + +**Pre-committed HDM decoder required** + See `Device detection`_ and the note there. + +**CXL hot-plug not supported** + Slots need to be present and programmed by firmware at boot. + +**CXL.cache Write-Back Invalidation not implemented** + For HDM-DB devices (``VFIO_CXL_CAP_CACHE_CAPABLE``), the kernel does not + run WBI before FLR. The VMM must do it and expose Back-Invalidation in the + guest topology where required. + + +VMM integration notes +--------------------- + +For a ``VFIO_CXL_CAP_FIRMWARE_COMMITTED`` device (what works today):: + + /* 1. Get device info and locate the CXL cap */ + vfio_device_get_info(fd, &dinfo); + assert(dinfo.flags & VFIO_DEVICE_FLAGS_CXL); + cxl = find_cap(&dinfo, VFIO_DEVICE_INFO_CAP_CXL); + + /* 2. Get DPA and COMP_REGS region sizes */ + get_region_info(fd, cxl->dpa_region_index, &dpa_ri); + get_region_info(fd, cxl->comp_regs_region_index, &comp_ri); + + /* 3. Map DPA region at a guest physical address */ + gpa_base = allocate_guest_phys(dpa_ri.size); + mmap(gpa_base, dpa_ri.size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, vfio_fd, + (off_t)cxl->dpa_region_index << VFIO_PCI_OFFSET_SHIFT); + + /* 4. Derive hdm_decoder_offset from COMP_REGS (see section below) */ + uint64_t hdm_decoder_offset = derive_hdm_offset(vfio_fd, comp_ri); + + /* 5. 
Write guest GPA into HDM Decoder 0 BASE via COMP_REGS pwrite */ + u32 base_hi = gpa_base >> 32; + comp_off = (off_t)cxl->comp_regs_region_index << VFIO_PCI_OFFSET_SHIFT; + pwrite(vfio_fd, &base_hi, 4, + comp_off + hdm_decoder_offset + CXL_HDM_DECODER0_BASE_HIGH_OFFSET); + + /* 6. Build guest CXL topology using gpa_base and dpa_ri.size */ + build_cfmws(gpa_base, dpa_ri.size); + + /* 7. If CACHE_CAPABLE: issue WBI before any guest FLR */ + +Extra detail: + +- DPA size is ``dpa_ri.size`` from region info. +- ``CXL_HDM_DECODER0_BASE_HIGH_OFFSET`` lives in ``include/uapi/cxl/cxl_regs.h``. +- On the BAR, ``mmaps[0].size`` from the sparse-mmap cap on + ``hdm_regs_bar_index`` splits GPU MMIO (BAR fd) from the CXL block (COMP_REGS + region). +- If ``VFIO_CXL_CAP_CACHE_CAPABLE`` is set, the guest CXL topology should + advertise Back-Invalidation and the VMM should run WBI before FLR. + + +Deriving HDM info from COMP_REGS +--------------------------------- + +``hdm_decoder_offset`` and ``hdm_count`` are not in ``vfio_device_info_cap_cxl`` +because both are directly readable from the ``COMP_REGS`` region. + +**Finding hdm_decoder_offset:** + +Read dwords from the COMP_REGS region starting at offset 0 (the CXL Capability +Array). ``comp_off`` is the VFIO file offset for the COMP_REGS region: +``(off_t)cxl->comp_regs_region_index << VFIO_PCI_OFFSET_SHIFT``:: + + /* Dword 0: CXL Capability Array Header */ + pread(fd, &hdr, 4, comp_off + 0); + /* bits[15:0] must be 1 (CM_CAP_HDR_CAP_ID) */ + /* bits[31:24] = number of capability entries */ + num_caps = (hdr >> 24) & 0xff; /* CXL_CM_CAP_HDR_ARRAY_SIZE_MASK */ + + /* Walk entries at dword 1..num_caps */ + for (i = 1; i <= num_caps; i++) { + pread(fd, &entry, 4, comp_off + i * 4); + cap_id = entry & 0xffff; /* CXL_CM_CAP_HDR_ID_MASK */ + if (cap_id == 0x5) { /* CXL_CM_CAP_CAP_ID_HDM */ + hdm_decoder_offset = (entry >> 20) & 0xfff; /* CXL_CM_CAP_PTR_MASK */ + break; + } + } + +**Finding hdm_count:** + +Read the HDM Decoder Capability register (HDMC) at ``hdm_decoder_offset + 0``:: + + pread(fd, &hdmc, 4, comp_off + hdm_decoder_offset); + field = hdmc & 0xf; /* CXL_HDM_DECODER_COUNT_MASK bits[3:0] */ + hdm_count = field ? field * 2 : 1; /* 0→1, N→N*2 decoders */ + +All constants are in ``include/uapi/cxl/cxl_regs.h``. + + +Kernel configuration +-------------------- + +``CONFIG_VFIO_CXL_CORE`` (bool) + CXL Type-2 passthrough in ``vfio-pci-core``. Needs ``CONFIG_VFIO_PCI_CORE``, + ``CONFIG_CXL_BUS``, and ``CONFIG_CXL_MEM``. 
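+
+A minimal ``.config`` fragment with the dependencies spelled out::
+
+    CONFIG_VFIO_PCI_CORE=y
+    CONFIG_CXL_BUS=y
+    CONFIG_CXL_MEM=y
+    CONFIG_VFIO_CXL_CORE=y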
+ +References +---------- + +* CXL Specification 4.0, 8.1.3 - PCIe DVSEC for CXL Devices +* CXL Specification 4.0, 8.2.4.20 - CXL HDM Decoder Capability Structure +* ``include/uapi/linux/vfio.h`` - ``VFIO_DEVICE_INFO_CAP_CXL``, + ``VFIO_REGION_SUBTYPE_CXL``, ``VFIO_REGION_SUBTYPE_CXL_COMP_REGS`` +* ``include/uapi/cxl/cxl_regs.h`` - ``CXL_CM_OFFSET``, + ``CXL_CM_CAP_HDR_ARRAY_SIZE_MASK``, ``CXL_CM_CAP_HDR_ID_MASK``, + ``CXL_CM_CAP_PTR_MASK``, ``CXL_HDM_DECODER_COUNT_MASK``, + ``CXL_HDM_DECODER0_BASE_HIGH_OFFSET`` diff --git a/debian.nvidia-6.17/config/annotations b/debian.nvidia-6.17/config/annotations index bde6de3efe4de..da33e0be1fd0a 100644 --- a/debian.nvidia-6.17/config/annotations +++ b/debian.nvidia-6.17/config/annotations @@ -255,6 +255,8 @@ CONFIG_UBUNTU_ODM_DRIVERS note<'Disable all Ubuntu ODM dri CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> CONFIG_ULTRASOC_SMB note<'Required for Grace enablement'> +CONFIG_VFIO_CXL_CORE policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_VFIO_CXL_CORE note<'Enable VFIO CXL core for CXL Type-2 device passthrough support'> # ---- Annotations without notes ---- diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index dae2e4c36e53a..cba2146b357d6 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3005,11 +3005,36 @@ static void qm_put_pci_res(struct hisi_qm *qm) pci_release_mem_regions(pdev); } +static void hisi_mig_region_clear(struct hisi_qm *qm) +{ + u32 val; + + /* Clear migration region set of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val &= ~QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + +static void hisi_mig_region_enable(struct hisi_qm *qm) +{ + u32 val; + + /* Select migration region of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val |= QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + static void hisi_qm_pci_uninit(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; pci_free_irq_vectors(pdev); + hisi_mig_region_clear(qm); qm_put_pci_res(qm); pci_disable_device(pdev); } @@ -5696,6 +5721,7 @@ int hisi_qm_init(struct hisi_qm *qm) goto err_free_qm_memory; qm_cmd_init(qm); + hisi_mig_region_enable(qm); return 0; @@ -5834,6 +5860,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) } qm_cmd_init(qm); + hisi_mig_region_enable(qm); hisi_qm_dev_err_init(qm); /* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */ writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG); diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 497d99b8908d0..2b0e368a174d1 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -147,16 +147,24 @@ static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) return 0; } -/* - * Wait up to @media_ready_timeout for the device to report memory - * active. +/** + * cxl_await_range_active - Wait for all HDM DVSEC memory ranges to be active + * @cxlds: CXL device state (DVSEC and HDM count must be valid) + * + * For each HDM decoder range reported in the CXL DVSEC capability, waits for + * the range to report MEM INFO VALID (up to 1s per range), then MEM ACTIVE + * (up to media_ready_timeout seconds per range, default 60s). Used by + * cxl_await_media_ready() and by callers that only need range readiness + * without checking the memory device status register. 
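+ *
+ * Context: May sleep; the per-range waits can take seconds.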
+ * + * Return: 0 if all ranges become valid and active, -ETIMEDOUT if a timeout + * occurs, or a negative errno from config read on failure. */ -int cxl_await_media_ready(struct cxl_dev_state *cxlds) +int cxl_await_range_active(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); int d = cxlds->cxl_dvsec; int rc, i, hdm_count; - u64 md_status; u16 cap; rc = pci_read_config_word(pdev, @@ -177,6 +185,23 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) return rc; } + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_await_range_active, "CXL"); + +/* + * Wait up to @media_ready_timeout for the device to report memory + * active. + */ +int cxl_await_media_ready(struct cxl_dev_state *cxlds) +{ + u64 md_status; + int rc; + + rc = cxl_await_range_active(cxlds); + if (rc) + return rc; + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); if (!CXLMDEV_READY(md_status)) return -EIO; @@ -454,6 +479,35 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, } EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, "CXL"); +/** + * cxl_get_hdm_info - Get HDM decoder register block location and count + * @cxlds: CXL device state (must have component regs enumerated via + * cxl_probe_component_regs()) + * @count: number of HDM decoders in the block (from HDM Capability bits [3:0]) + * @offset: byte offset of HDM decoder block within the component register BAR + * @size: size in bytes of the HDM decoder block + * + * Return: 0 on success. -ENODEV if the HDM decoder block is not present. + */ +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ + struct cxl_reg_map *hdm = &cxlds->reg_map.component_map.hdm_decoder; + + if (WARN_ON(!count || !offset || !size)) + return -EINVAL; + + if (!hdm->valid) + return -ENODEV; + + *count = hdm->count; + *offset = hdm->offset; + *size = hdm->size; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_hdm_info, "CXL"); + #define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff #define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 #define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 @@ -1183,7 +1237,7 @@ static void cxl_pci_functions_reset_done(struct cxl_reset_context *ctx) /* * CXL device reset execution */ -static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en) { static const u32 reset_timeout_ms[] = { 10, 100, 1000, 10000, 100000 }; u16 cap, ctrl2, status2; @@ -1253,7 +1307,17 @@ static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) if (rc) return rc; - ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + /* + * Explicitly set or clear RST_MEM_CLR_EN rather than only + * setting it. A previous reset may have left the bit set in + * hardware; if mem_clr_en is false we must clear it so that a + * stale bit does not cause an unwanted memory-clearing reset. 
+ */ + if (mem_clr_en) + ctrl2 |= PCI_DVSEC_CXL_RST_MEM_CLR_EN; + else + ctrl2 &= ~PCI_DVSEC_CXL_RST_MEM_CLR_EN; + rc = pci_write_config_word(pdev, dvsec + PCI_DVSEC_CXL_CTRL2, ctrl2); if (rc) @@ -1302,6 +1366,7 @@ static int cxl_dev_reset(struct pci_dev *pdev, int dvsec) return 0; } +EXPORT_SYMBOL_NS_GPL(cxl_dev_reset, "CXL"); static int match_memdev_by_parent(struct device *dev, const void *parent) { @@ -1341,7 +1406,7 @@ static int cxl_do_reset(struct pci_dev *pdev) pci_dev_save_and_disable(pdev); cxl_pci_functions_reset_prepare(&ctx); - rc = cxl_dev_reset(pdev, dvsec); + rc = cxl_dev_reset(pdev, dvsec, true); cxl_pci_functions_reset_done(&ctx); @@ -1370,7 +1435,7 @@ static int cxl_do_reset(struct pci_dev *pdev) * devices under bus core serialization. */ -static bool pci_cxl_reset_capable(struct pci_dev *pdev) +bool pci_cxl_reset_capable(struct pci_dev *pdev) { int dvsec; u16 cap; @@ -1389,6 +1454,7 @@ static bool pci_cxl_reset_capable(struct pci_dev *pdev) return !!(cap & PCI_DVSEC_CXL_RST_CAPABLE); } +EXPORT_SYMBOL_NS_GPL(pci_cxl_reset_capable, "CXL"); static ssize_t cxl_reset_store(struct device *dev, struct device_attribute *attr, diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 20c2d9fbcfe7d..43661e51230a2 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -85,6 +85,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, decoder_cnt = cxl_hdm_decoder_count(hdr); length = 0x20 * decoder_cnt + 0x10; rmap = &map->hdm_decoder; + rmap->count = decoder_cnt; break; } case CXL_CM_CAP_CAP_ID_RAS: @@ -287,9 +288,37 @@ static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, map->reg_type = reg_type; map->resource = pci_resource_start(pdev, bar) + offset; map->max_size = pci_resource_len(pdev, bar) - offset; + map->bar_index = bar; + map->bar_offset = offset; return true; } +/** + * cxl_regblock_get_bar_info() - Get BAR index and offset for a BAR-backed + * regblock + * @map: Register map from cxl_find_regblock() or cxl_find_regblock_instance() + * @bar_index: Output BAR index (0-5). Optional, may be NULL. + * @bar_offset: Output offset within the BAR. Optional, may be NULL. + * + * When the register block was found via the Register Locator DVSEC and + * lives in a PCI BAR (BIR 0-5), this returns the BAR index and the offset + * within that BAR. + * + * Return: 0 if the regblock is BAR-backed (bar_index <= 5), -EINVAL otherwise. + */ +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, + resource_size_t *bar_offset) +{ + if (!map || map->bar_index == 0xff) + return -EINVAL; + if (bar_index) + *bar_index = map->bar_index; + if (bar_offset) + *bar_offset = map->bar_offset; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_regblock_get_bar_info, "CXL"); + /* * __cxl_find_regblock_instance() - Locate a register block or count instances by type / index * Use CXL_INSTANCES_COUNT for @index if counting instances. 
@@ -308,6 +337,7 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty *map = (struct cxl_register_map) { .host = &pdev->dev, + .bar_index = 0xFF, .resource = CXL_RESOURCE_NONE, }; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index f84910ba7fa2b..772cea6932109 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -145,8 +145,6 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw) #define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48) #define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20 -void cxl_probe_component_regs(struct device *dev, void __iomem *base, - struct cxl_component_reg_map *map); void cxl_probe_device_regs(struct device *dev, void __iomem *base, struct cxl_device_reg_map *map); int cxl_map_device_regs(const struct cxl_register_map *map, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 69830a5c49d3f..96d23717684f7 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1140,6 +1140,126 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags, return func(vgpu, index, start, count, flags, data); } +static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + int nr_areas = 1; + int cap_type_id; + unsigned int i; + int ret; + + switch (info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->gvt->device_info.cfg_space_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->cfg_space.bar[info->index].size; + if (!info->size) { + info->flags = 0; + break; + } + + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info->size = gvt_aperture_sz(vgpu->gvt); + + sparse = kzalloc(struct_size(sparse, areas, nr_areas), + GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->areas[0].offset = + PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].size = vgpu_aperture_sz(vgpu); + break; + + case VFIO_PCI_BAR3_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + + gvt_dbg_core("get region info bar:%d\n", info->index); + break; + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + + gvt_dbg_core("get region info index:%d\n", info->index); + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; + + if (info->index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) + return -EINVAL; + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); + + i = info->index - VFIO_PCI_NUM_REGIONS; + + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->region[i].size; + info->flags = vgpu->region[i].flags; + + cap_type.type = vgpu->region[i].type; + cap_type.subtype = vgpu->region[i].subtype; + + ret = vfio_info_add_capability(caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + } + } + + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = vfio_info_add_capability( + caps, &sparse->header, + struct_size(sparse, areas, sparse->nr_areas)); + if (ret) { + kfree(sparse); + return ret; + } + break; + default: + kfree(sparse); + return -EINVAL; + } + } + + kfree(sparse); + return 0; +} + static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, unsigned long arg) { @@ -1168,157 +1288,6 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned int i; - int ret; - struct vfio_region_info_cap_sparse_mmap *sparse = NULL; - int nr_areas = 1; - int cap_type_id; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->gvt->device_info.cfg_space_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->cfg_space.bar[info.index].size; - if (!info.size) { - info.flags = 0; - break; - } - - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR1_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - break; - case VFIO_PCI_BAR2_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_CAPS | - VFIO_REGION_INFO_FLAG_MMAP | - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - info.size = gvt_aperture_sz(vgpu->gvt); - - sparse = kzalloc(struct_size(sparse, areas, nr_areas), - GFP_KERNEL); - if (!sparse) - return -ENOMEM; - - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->nr_areas = nr_areas; - cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->areas[0].offset = - PAGE_ALIGN(vgpu_aperture_offset(vgpu)); - sparse->areas[0].size = vgpu_aperture_sz(vgpu); - break; - - case 
VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info bar:%d\n", info.index); - break; - - case VFIO_PCI_ROM_REGION_INDEX: - case VFIO_PCI_VGA_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info index:%d\n", info.index); - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; - - if (info.index >= VFIO_PCI_NUM_REGIONS + - vgpu->num_regions) - return -EINVAL; - info.index = - array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vgpu->num_regions); - - i = info.index - VFIO_PCI_NUM_REGIONS; - - info.offset = - VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->region[i].size; - info.flags = vgpu->region[i].flags; - - cap_type.type = vgpu->region[i].type; - cap_type.subtype = vgpu->region[i].subtype; - - ret = vfio_info_add_capability(&caps, - &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - } - } - - if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { - switch (cap_type_id) { - case VFIO_REGION_INFO_CAP_SPARSE_MMAP: - ret = vfio_info_add_capability(&caps, - &sparse->header, - struct_size(sparse, areas, - sparse->nr_areas)); - if (ret) { - kfree(sparse); - return ret; - } - break; - default: - kfree(sparse); - return -EINVAL; - } - } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - kfree(sparse); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - - kfree(caps.buf); - } - - kfree(sparse); - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -1475,6 +1444,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .write = intel_vgpu_write, .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, + .get_region_info_caps = intel_vgpu_ioctl_get_region_info, .dma_unmap = intel_vgpu_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index ea532a8a4a0c2..a596f6013019c 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -313,10 +313,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private, return 0; } -static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, - struct vfio_region_info *info, - unsigned long arg) +static int vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); int i; switch (info->index) { @@ -328,7 +330,6 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, return 0; default: /* all other regions are handled via capability chain */ { - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_type cap_type = { .header.id = VFIO_REGION_INFO_CAP_TYPE, .header.version = 1 }; @@ -351,27 +352,10 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, cap_type.type = private->region[i].type; cap_type.subtype = private->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; - - info->flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info->argsz < sizeof(*info) + caps.size) { - info->argsz = sizeof(*info) + caps.size; - info->cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(*info)); - if (copy_to_user((void __user *)arg + sizeof(*info), - caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info->cap_offset = sizeof(*info); - } - - kfree(caps.buf); - } } return 0; @@ -532,24 +516,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = vfio_ccw_mdev_get_region_info(private, &info, arg); - if (ret) - return ret; - - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -627,6 +593,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .read = vfio_ccw_mdev_read, .write = vfio_ccw_mdev_write, .ioctl = vfio_ccw_mdev_ioctl, + .get_region_info_caps = vfio_ccw_mdev_ioctl_get_region_info, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 5dd5f5ad76865..253031b86b60a 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -129,28 +129,22 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; } -static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, - struct vfio_region_info __user *arg) +static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); - struct vfio_region_info info; - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= cdx_dev->res_count) + if (info->index >= cdx_dev->res_count) return -EINVAL; /* map offset to the physical address */ - info.offset = vfio_cdx_index_to_offset(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = vfio_cdx_index_to_offset(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, @@ -219,8 +213,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, switch (cmd) { case VFIO_DEVICE_GET_INFO: return vfio_cdx_ioctl_get_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_cdx_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_GET_IRQ_INFO: return vfio_cdx_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_SET_IRQS: @@ -284,6 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .get_region_info_caps = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index f65d91c01f2ec..3985613e6830b 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,6 +117,24 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } +static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = vdev->mc_dev; + + if (info->index >= mc_dev->obj_desc.region_count) + return -EINVAL; + + /* map offset to the physical address */ + info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; +} + static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -149,30 +167,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -587,6 +581,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, + .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f546652..878f95b4d7923 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -65,6 +65,8 @@ source "drivers/vfio/pci/virtio/Kconfig" source "drivers/vfio/pci/nvgrace-gpu/Kconfig" +source "drivers/vfio/pci/cxl/Kconfig" + source "drivers/vfio/pci/qat/Kconfig" endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c8..21178d9e34849 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-core-$(CONFIG_VFIO_CXL_CORE) += cxl/vfio_cxl_core.o cxl/vfio_cxl_emu.o cxl/vfio_cxl_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o diff --git a/drivers/vfio/pci/cxl/Kconfig b/drivers/vfio/pci/cxl/Kconfig new file mode 100644 index 0000000000000..fad53300fecfb --- /dev/null +++ b/drivers/vfio/pci/cxl/Kconfig @@ -0,0 +1,9 @@ +config VFIO_CXL_CORE + bool "VFIO CXL core" + depends on VFIO_PCI_CORE && CXL_BUS && CXL_MEM + help + Extends vfio-pci-core with CXL.mem passthrough for vendor-specific + CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or HDM-DB + decoders without the standard CXL memory expander class code + (PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 accelerators and + non-class-code Type-3 variants (e.g. compressed memory devices). diff --git a/drivers/vfio/pci/cxl/vfio_cxl_config.c b/drivers/vfio/pci/cxl/vfio_cxl_config.c new file mode 100644 index 0000000000000..5d13b5d5bc5b8 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_config.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CXL DVSEC configuration space emulation for vfio-pci. + * + * Integrates into the existing vfio-pci-core ecap_perms[] framework using + * vdev->vconfig as the sole shadow buffer for DVSEC registers. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +static inline u16 _cxlds_get_dvsec(struct vfio_pci_cxl_state *cxl) +{ + return (u16)cxl->cxlds.cxl_dvsec; +} + +/* Helpers to access vdev->vconfig at a DVSEC-relative offset */ +static inline u16 dvsec_virt_read16(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return le16_to_cpu(*(u16 *)(vdev->vconfig + dvsec + off)); +} + +static inline void dvsec_virt_write16(struct vfio_pci_core_device *vdev, + u16 off, u16 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + *(u16 *)(vdev->vconfig + dvsec + off) = cpu_to_le16(val); +} + +static inline u32 dvsec_virt_read32(struct vfio_pci_core_device *vdev, + u16 off) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + return le32_to_cpu(*(u32 *)(vdev->vconfig + dvsec + off)); +} + +static inline void dvsec_virt_write32(struct vfio_pci_core_device *vdev, + u16 off, u32 val) +{ + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + + *(u32 *)(vdev->vconfig + dvsec + off) = cpu_to_le32(val); +} + +/* Individual DVSEC register write handlers */ + +static void cxl_dvsec_control_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 lock = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL_RESERVED_MASK; + + if (lock & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; /* register is locked after first write */ + + if (!(cap3 & CXL_DVSEC_CAP3_P2P_MEM_CAPABLE)) + rev_mask |= CXL_CTRL_P2P_REV_MASK; + + new_val &= ~rev_mask; + new_val |= CXL_DVSEC_CTRL_IO_ENABLE; /* IO_Enable always returns 1 */ + + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, new_val); +} + +static void cxl_dvsec_status_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS_OFFSET); + + /* + * VIRAL_STATUS (bit 14) is the only writable bit; all others are + * reserved and always zero. + */ + new_val = cur_val & ~(new_val & CXL_DVSEC_STATUS_VIRAL_STATUS); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS_OFFSET, new_val); +} + +/** + * vfio_cxl_reset - Service a guest CXL protocol reset. + * @vdev: VFIO PCI core device + * + * Unlike cxl_do_reset(), no host memory offlining is performed: the DPA + * region is guest memory, not host RAM. + * + * memory_lock is held for the entire sequence so neither BAR nor DPA + * mappings can fault back in. INIT_CXL_RST is not forwarded to hardware; + * cxl_dev_reset() drives the state machine directly. + * + * STATUS2 outcome bits are written back to vconfig on return so that the + * guest can poll for completion without going to hardware. + * + * Return: 0 on success, negative error code on failure. + */ +static int vfio_cxl_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(cxl); + u16 hw_status2 = 0; + int ret; + + vfio_pci_zap_and_down_write_memory_lock(vdev); + + /* + * CXL r4.0 Table 8-9: device must clear CXL_Reset_Complete before + * starting the reset flow, on the 0->1 transition of Initiate_CXL_Reset. + * Clear both reset outcome bits so a polling guest sees an unambiguous + * in-progress state rather than a stale result from a prior attempt. 
+ */ + { + u16 s = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + + s &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, s); + } + + vfio_cxl_prepare_reset(vdev); + + pci_dev_lock(pdev); + + pci_dev_save_and_disable(pdev); + ret = cxl_dev_reset(pdev, cxl->cxlds.cxl_dvsec, + !!(dvsec_virt_read16(vdev, CXL_DVSEC_CONTROL2_OFFSET) & + CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE)); + pci_dev_restore(pdev); + + pci_dev_unlock(pdev); + + vfio_cxl_finish_reset(vdev); + + /* + * Re-read STATUS2 from hardware after restore. pci_dev_restore() + * writes back the pre-reset saved state, which has both outcome bits + * cleared. Re-reading also picks up genuine hardware changes (e.g. + * VOLATILE_HDM_PRES_ERROR clearing) before stamping in the outcome. + */ + pci_read_config_word(pdev, dvsec + CXL_DVSEC_STATUS2_OFFSET, + &hw_status2); + hw_status2 &= ~(CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR); + if (ret) + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + else + hw_status2 |= CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, hw_status2); + + up_write(&vdev->memory_lock); + return ret; +} + +static void cxl_dvsec_control2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + struct pci_dev *pdev = vdev->pdev; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_CONTROL2_OFFSET; + u16 cap2 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY2_OFFSET); + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 rev_mask = CXL_CTRL2_RESERVED_MASK; + + if (!(cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY)) + rev_mask |= CXL_CTRL2_VOLATILE_HDM_REV_MASK; + if (!(cap2 & CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE)) + rev_mask |= CXL_CTRL2_MODIFIED_COMP_REV_MASK; + + new_val &= ~rev_mask; + + /* Cache WBI: forward to hardware. */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI) + pci_write_config_word(pdev, abs_off, + CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI); + + /* + * Commit the new CONTROL2 value to the shadow before triggering a + * reset. vfio_cxl_reset() reads Mem_Clr_Enable (bit 3) from the + * shadow; if the shadow is written after the reset call, a guest write + * that changes bit 3 in the same access as INITIATE_CXL_RESET would + * reset with the stale bit 3 value instead of the one just written. + */ + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL2_OFFSET, + new_val & ~CXL_CTRL2_HW_BITS_MASK); + + /* + * INIT_CXL_RST: not forwarded to hardware. cxl_dev_reset() drives + * the state machine; forwarding it after the reset would fire a + * second one. Drop writes on non-RST_CAPABLE devices silently; the + * spec reserves the bit there and logging every write is just noise. + */ + if (new_val & CXL_DVSEC_CTRL2_INITIATE_CXL_RESET) { + if (vfio_cxl_reset_capable(vdev)) { + int rc = vfio_cxl_reset(vdev); + + if (rc) + pci_warn(pdev, + "vfio-cxl: CXL reset failed (%d)\n", + rc); + } + } +} + +static void cxl_dvsec_status2_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cap3 = dvsec_virt_read16(vdev, CXL_DVSEC_CAPABILITY3_OFFSET); + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = dvsec + CXL_DVSEC_STATUS2_OFFSET; + + /* + * VOLATILE_HDM_PRES_ERROR (bit 3) is RW1CS. Forward to hardware, + * then mirror the clear into vconfig. Reads come from the shadow + * now, so skipping the update leaves the bit stuck from the guest's + * view. 
+ * + * All other STATUS2 bits are RO hardware outputs; ignore guest writes. + */ + if ((cap3 & CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY) && + (new_val & CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR)) { + u16 v; + + pci_write_config_word(vdev->pdev, abs_off, + CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR); + v = dvsec_virt_read16(vdev, CXL_DVSEC_STATUS2_OFFSET); + v &= ~CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR; + dvsec_virt_write16(vdev, CXL_DVSEC_STATUS2_OFFSET, v); + } +} + +static void cxl_dvsec_lock_write(struct vfio_pci_core_device *vdev, + u16 new_val) +{ + u16 cur_val = dvsec_virt_read16(vdev, CXL_DVSEC_LOCK_OFFSET); + + /* Once the LOCK bit is set it can only be cleared by conventional reset */ + if (cur_val & CXL_DVSEC_LOCK_CONFIG_LOCK) + return; + + new_val &= ~CXL_LOCK_RESERVED_MASK; + dvsec_virt_write16(vdev, CXL_DVSEC_LOCK_OFFSET, new_val); +} + +static void cxl_range_base_lo_write(struct vfio_pci_core_device *vdev, + u16 dvsec_off, u32 new_val) +{ + new_val &= ~CXL_BASE_LO_RESERVED_MASK; + dvsec_virt_write32(vdev, dvsec_off, new_val); +} + +/* + * status2_hw_shadow_merge - read STATUS2, merging hardware and vconfig shadow. + * + * RESET_COMPLETE and RESET_ERROR are written into vconfig by vfio_cxl_reset() + * after a protocol reset; pci_dev_restore() clears them from hardware, so they + * must survive in the shadow for a polling guest to see the reset outcome. + * + * All other STATUS2 bits are live hardware outputs and must come from hardware. + * In particular, CACHE_INVALID (bit 0) is polled by guests during a standalone + * write-back invalidation. + * + * @abs_pos: absolute PCI config space byte offset of the STATUS2 register. + */ +static u16 status2_hw_shadow_merge(struct vfio_pci_core_device *vdev, int abs_pos) +{ + const u16 shadow_mask = CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE | + CXL_DVSEC_STATUS2_CXL_RESET_ERROR; + u16 hw = 0, virt; + + pci_read_config_word(vdev->pdev, abs_pos, &hw); + virt = get_unaligned_le16(vdev->vconfig + abs_pos); + return (hw & ~shadow_mask) | (virt & shadow_mask); +} + +/** + * vfio_cxl_dvsec_readfn - Per-device DVSEC read handler for CXL capable devices. + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to read + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Output buffer for the read value (little-endian) + * + * Called via vfio_pci_dvsec_dispatch_read() for CXL devices. Returns shadow + * vconfig values for virtualized DVSEC registers (CONTROL, STATUS, CONTROL2, + * LOCK) so that userspace reads reflect emulated state rather than raw + * hardware. All other DVSEC bytes pass through to vfio_raw_config_read(). + * + * A 4-byte (DWORD) access at the CONTROL2 offset spans both CONTROL2 and + * STATUS2 since CONTROL2 is DWORD-aligned and the two registers are adjacent. + * In that case STATUS2 is returned via the hardware-merge path. + * + * Return: @count on success, or negative error code from the fallback read. 
+ */ +static int vfio_cxl_dvsec_readfn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 dvsec_off; + + if (!cxl || (u16)pos < dvsec || + (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_raw_config_read(vdev, pos, count, perm, offset, val); + + dvsec_off = (u16)pos - dvsec; + + switch (dvsec_off) { + case CXL_DVSEC_CONTROL_OFFSET: + case CXL_DVSEC_STATUS_OFFSET: + case CXL_DVSEC_LOCK_OFFSET: + /* Fully virtualised; return shadow. */ + memcpy(val, vdev->vconfig + pos, count); + return count; + case CXL_DVSEC_CONTROL2_OFFSET: + if (count == 4) { + /* + * A 4-byte access at the DWORD-aligned CONTROL2 offset + * spans both CONTROL2 (low 16 bits) and STATUS2 (high 16 + * bits). Return CONTROL2 from vconfig and STATUS2 via the + * hardware-merge path so that CACHE_INVALID is fresh. + */ + __le32 combined = cpu_to_le32( + (u32)get_unaligned_le16(vdev->vconfig + pos) | + ((u32)status2_hw_shadow_merge(vdev, + dvsec + CXL_DVSEC_STATUS2_OFFSET) << 16)); + memcpy(val, &combined, 4); + } else { + memcpy(val, vdev->vconfig + pos, count); + } + return count; + case CXL_DVSEC_STATUS2_OFFSET: { + __le16 result = cpu_to_le16(status2_hw_shadow_merge(vdev, pos)); + memcpy(val, &result, count); + return count; + } + default: + return vfio_raw_config_read(vdev, pos, count, + perm, offset, val); + } +} + +/** + * vfio_cxl_dvsec_writefn - ecap_perms write handler for PCI_EXT_CAP_ID_DVSEC. + * + * Installed once into ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn by + * vfio_pci_init_perm_bits() when CONFIG_VFIO_CXL_CORE=y. Applies to every + * device opened under vfio-pci; the vdev->cxl NULL check distinguishes CXL + * devices from non-CXL devices that happen to expose a DVSEC capability. + * + * @vdev: VFIO PCI core device + * @pos: Absolute byte position in PCI config space + * @count: Number of bytes to write + * @perm: Permission bits for this capability (passed through to fallback) + * @offset: Byte offset within the capability structure (passed through) + * @val: Value to write (little-endian) + * + * Return: @count on success; non-CXL devices continue to + * vfio_raw_config_write() which also returns @count or negative error. + */ +static int vfio_cxl_dvsec_writefn(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u16 dvsec = _cxlds_get_dvsec(vdev->cxl); + u16 abs_off = (u16)pos; + u16 dvsec_off; + u16 wval16; + u32 wval32; + + if (!cxl || (u16)pos < dvsec || + (u16)pos >= dvsec + cxl->dvsec_len) + return vfio_raw_config_write(vdev, pos, count, perm, + offset, val); + + pci_dbg(vdev->pdev, + "vfio_cxl: DVSEC write: abs=0x%04x dvsec_off=0x%04x count=%d raw_val=0x%08x\n", + abs_off, abs_off - dvsec, count, le32_to_cpu(val)); + + dvsec_off = abs_off - dvsec; + + /* Route to the appropriate per-register handler */ + switch (dvsec_off) { + case CXL_DVSEC_CONTROL_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_control_write(vdev, wval16); + break; + case CXL_DVSEC_STATUS_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_status_write(vdev, wval16); + break; + case CXL_DVSEC_CONTROL2_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_control2_write(vdev, wval16); + if (count == 4) { + /* + * High half of a 32-bit write at CONTROL2 is STATUS2. + * Forward to the STATUS2 handler so RW1CS bits (e.g. 
+ * VOLATILE_HDM_PRES_ERROR) are not silently dropped. + */ + wval16 = (u16)(le32_to_cpu(val) >> 16); + cxl_dvsec_status2_write(vdev, wval16); + } + break; + case CXL_DVSEC_STATUS2_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_status2_write(vdev, wval16); + break; + case CXL_DVSEC_LOCK_OFFSET: + wval16 = (u16)le32_to_cpu(val); + cxl_dvsec_lock_write(vdev, wval16); + break; + case CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET: + case CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET: + wval32 = le32_to_cpu(val); + dvsec_virt_write32(vdev, dvsec_off, wval32); + break; + case CXL_DVSEC_RANGE1_BASE_LOW_OFFSET: + case CXL_DVSEC_RANGE2_BASE_LOW_OFFSET: + wval32 = le32_to_cpu(val); + cxl_range_base_lo_write(vdev, dvsec_off, wval32); + break; + default: + /* RO registers: header, capability, range sizes - discard */ + break; + } + + return count; +} + +/** + * vfio_cxl_setup_dvsec_perms - Install per-device CXL DVSEC read/write hooks. + * @vdev: VFIO PCI core device + * + * Called once per device open after vfio_config_init() has seeded vdev->vconfig + * from hardware. Installs vfio_cxl_dvsec_readfn and vfio_cxl_dvsec_writefn + * as per-device DVSEC handlers so that the global ecap_perms[DVSEC] dispatcher + * routes reads and writes through CXL-aware emulation. + * + * Forces CXL.io IO_ENABLE in the CONTROL vconfig shadow at init time so the + * initial guest read returns the correct value before the first write. + */ +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) +{ + u16 ctrl = dvsec_virt_read16(vdev, CXL_DVSEC_CONTROL_OFFSET); + + vdev->dvsec_readfn = vfio_cxl_dvsec_readfn; + vdev->dvsec_writefn = vfio_cxl_dvsec_writefn; + + /* Force IO_ENABLE; cxl_dvsec_control_write() maintains this invariant. */ + ctrl |= CXL_DVSEC_CTRL_IO_ENABLE; + dvsec_virt_write16(vdev, CXL_DVSEC_CONTROL_OFFSET, ctrl); +} +EXPORT_SYMBOL_GPL(vfio_cxl_setup_dvsec_perms); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c new file mode 100644 index 0000000000000..d9b349225df92 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c @@ -0,0 +1,948 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO CXL Core - CXL.mem passthrough for vendor-specific CXL devices + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + * + * This module extends vfio-pci-core to pass through CXL.mem regions for + * vendor-specific CXL devices (CXL_DEVTYPE_DEVMEM) that implement HDM-D or + * HDM-DB decoders but do not report the standard CXL memory expander class + * code (PCI_CLASS_MEMORY_CXL, 0x0502). This covers both CXL Type-2 + * accelerators (with CXL.cache) and non-class-code Type-3 variants (e.g. + * compressed memory devices) which cannot be paravirtualized by the host + * CXL subsystem and require direct DPA region access from the guest. 
+ */ + +#include +#include +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ + return vdev->cxl->comp_reg_bar; +} + +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info_cap_sparse_mmap *sparse; + struct vfio_pci_cxl_state *cxl = vdev->cxl; + resource_size_t bar_len, comp_end; + u32 nr_areas, cap_size; + int ret; + + if (!cxl) + return -ENOTTY; + + if (!info) + return -ENOTTY; + + if (info->argsz < minsz) + return -EINVAL; + + if (info->index != cxl->comp_reg_bar) + return -ENOTTY; + + /* + * The device state is not fully initialised; + * fall through to the default BAR handler. + */ + if (!cxl->comp_reg_size) + return -ENOTTY; + + bar_len = pci_resource_len(vdev->pdev, info->index); + comp_end = cxl->comp_reg_offset + cxl->comp_reg_size; + + /* + * Advertise the GPU/accelerator register windows as mmappable by + * carving the CXL component register block out of the BAR. The + * number of sparse areas depends on where the block sits: + * + * [A] comp block at BAR end [gpu_regs | comp_regs]: + * comp_reg_offset > 0 && comp_end == bar_len + * = 1 area: [0, comp_reg_offset) + * + * [B] comp block at BAR start [comp_regs | gpu_regs]: + * comp_reg_offset == 0 && comp_end < bar_len + * = 1 area: [comp_end, bar_len) + * + * [C] comp block in middle [gpu_regs | comp_regs | gpu_regs]: + * comp_reg_offset > 0 && comp_end < bar_len + * = 2 areas: [0, comp_reg_offset) and [comp_end, bar_len) + */ + if (cxl->comp_reg_offset > 0 && comp_end < bar_len) + nr_areas = 2; + else + nr_areas = 1; + + cap_size = struct_size(sparse, areas, nr_areas); + sparse = kzalloc(cap_size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + + if (nr_areas == 2) { + /* [C]: window before and after comp block */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + sparse->areas[1].offset = comp_end; + sparse->areas[1].size = bar_len - comp_end; + } else if (cxl->comp_reg_offset == 0) { + /* [B]: comp block at BAR start, window follows */ + sparse->areas[0].offset = comp_end; + sparse->areas[0].size = bar_len - comp_end; + } else { + /* [A]: comp block at BAR end, window precedes */ + sparse->areas[0].offset = 0; + sparse->areas[0].size = cxl->comp_reg_offset; + } + + ret = vfio_info_add_capability(caps, &sparse->header, cap_size); + kfree(sparse); + if (ret) + return ret; + + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = bar_len; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + return 0; +} + +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl->comp_reg_size) + return false; + + return req_start < cxl->comp_reg_offset + cxl->comp_reg_size && + req_start + req_len > cxl->comp_reg_offset; +} + +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + struct vfio_device_info_cap_cxl cxl_cap = {0}; + + if (!cxl) + return 0; + + /* + * Device is not fully initialised? 
+ */ + if (WARN_ON(cxl->dpa_region_idx < 0 || cxl->comp_reg_region_idx < 0)) + return -ENODEV; + + /* Fill in from CXL device structure */ + cxl_cap.header.id = VFIO_DEVICE_INFO_CAP_CXL; + cxl_cap.header.version = 1; + /* + * COMP_REGS region starts at comp_reg_offset + CXL_CM_OFFSET within + * the BAR. This is the byte offset of the CXL.mem register area (where + * the CXL Capability Array Header lives) within the component register + * block. Userspace derives hdm_decoder_offset and hdm_count from the + * COMP_REGS region itself (CXL Capability Array traversal + HDMC read). + */ + cxl_cap.hdm_regs_offset = cxl->comp_reg_offset + CXL_CM_OFFSET; + cxl_cap.hdm_regs_bar_index = cxl->comp_reg_bar; + + if (cxl->precommitted) + cxl_cap.flags |= VFIO_CXL_CAP_FIRMWARE_COMMITTED; + if (cxl->cache_capable) + cxl_cap.flags |= VFIO_CXL_CAP_CACHE_CAPABLE; + + /* + * Populate absolute VFIO region indices so userspace can query them + * directly with VFIO_DEVICE_GET_REGION_INFO. + */ + cxl_cap.dpa_region_index = VFIO_PCI_NUM_REGIONS + cxl->dpa_region_idx; + cxl_cap.comp_regs_region_index = + VFIO_PCI_NUM_REGIONS + cxl->comp_reg_region_idx; + + return vfio_info_add_capability(caps, &cxl_cap.header, sizeof(cxl_cap)); +} + +/* + * Scope-based cleanup wrappers for the CXL resource APIs + */ +DEFINE_FREE(cxl_put_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_put_root_decoder(_T)) +DEFINE_FREE(cxl_dpa_free, struct cxl_endpoint_decoder *, if (!IS_ERR_OR_NULL(_T)) cxl_dpa_free(_T)) +DEFINE_FREE(cxl_unregister_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) cxl_unregister_region(_T)) + +/* + * vfio_cxl_create_device_state - Allocate and validate CXL device state + * + * Returns a pointer to the allocated vfio_pci_cxl_state on success, or + * ERR_PTR on failure. The allocation uses devm; the caller must call + * devm_kfree(&pdev->dev, cxl) on any subsequent setup failure to release + * the resource before device unbind. Using devm_kfree() to undo a devm + * allocation early is explicitly supported by the devres API. + * + * The caller assigns vdev->cxl only after all setup steps succeed, preventing + * partially-initialised state from being visible through vdev->cxl on any + * failure path. + */ +static struct vfio_pci_cxl_state * +vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec) +{ + struct vfio_pci_cxl_state *cxl; + u16 cap_word; + u32 hdr1; + + /* Freed automatically when pdev->dev is released. */ + cxl = devm_cxl_dev_state_create(&pdev->dev, + CXL_DEVTYPE_DEVMEM, + pdev->dev.id, dvsec, + struct vfio_pci_cxl_state, + cxlds, false); + if (!cxl) + return ERR_PTR(-ENOMEM); + + pci_read_config_dword(pdev, dvsec + PCI_DVSEC_HEADER1, &hdr1); + cxl->dvsec_len = PCI_DVSEC_HEADER1_LEN(hdr1); + + pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET, + &cap_word); + + /* + * Only handle vendor devices (class != 0x0502) with Mem_Capable set. + * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI + * sequence is needed before FLR. 
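+	 *
+	 * Illustrative VMM-side consequence (hypothetical pseudo-code;
+	 * issue_wbi() and trigger_flr() are stand-in helpers, not part of
+	 * this API):
+	 *
+	 *	if (cap->flags & VFIO_CXL_CAP_CACHE_CAPABLE)
+	 *		issue_wbi(dev);		write back + invalidate
+	 *	trigger_flr(dev);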
+ */
+	if (!FIELD_GET(CXL_DVSEC_CAP_MEM_CAPABLE, cap_word) ||
+	    (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL) {
+		devm_kfree(&pdev->dev, cxl);
+		return ERR_PTR(-ENODEV);
+	}
+
+	cxl->cache_capable = FIELD_GET(CXL_DVSEC_CAP_CACHE_CAPABLE, cap_word);
+	cxl->dpa_region_idx = -1;
+	cxl->comp_reg_region_idx = -1;
+
+	return cxl;
+}
+
+static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_cxl_state *cxl)
+{
+	struct cxl_register_map *map = &cxl->cxlds.reg_map;
+	resource_size_t offset, bar_offset, size;
+	struct pci_dev *pdev = vdev->pdev;
+	void __iomem *base;
+	int ret;
+	u8 count;
+	u8 bar;
+
+	if (WARN_ON_ONCE(!pci_is_enabled(pdev)))
+		return -EINVAL;
+
+	/* Find component register block via Register Locator DVSEC */
+	ret = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, map);
+	if (ret)
+		return ret;
+
+	/*
+	 * Request the region and map. This is a transient mapping
+	 * used only to probe register capabilities; released immediately
+	 * after cxl_probe_component_regs() returns.
+	 */
+	if (!request_mem_region(map->resource, map->max_size, "vfio-cxl-probe"))
+		return -EBUSY;
+
+	base = ioremap(map->resource, map->max_size);
+	if (!base) {
+		ret = -ENOMEM;
+		goto failed_release;
+	}
+
+	/* Probe component register capabilities */
+	cxl_probe_component_regs(&pdev->dev, base, &map->component_map);
+
+	/* Check if HDM decoder was found */
+	if (!map->component_map.hdm_decoder.valid) {
+		ret = -ENODEV;
+		goto failed_unmap;
+	}
+
+	pci_dbg(pdev, "vfio_cxl: HDM decoder at offset=0x%lx, size=0x%lx\n",
+		map->component_map.hdm_decoder.offset,
+		map->component_map.hdm_decoder.size);
+
+	/* Get HDM register info */
+	ret = cxl_get_hdm_info(&cxl->cxlds, &count, &offset, &size);
+	if (ret)
+		goto failed_unmap;
+
+	if (!count || !size) {
+		ret = -ENODEV;
+		goto failed_unmap;
+	}
+
+	cxl->hdm_count = count;
+	/*
+	 * cxl_get_hdm_info() returns rmap->offset = CXL_CM_OFFSET + cap_ptr
+	 * (see cxl_probe_component_regs(), which does base += CXL_CM_OFFSET
+	 * before reading caps and stores CXL_CM_OFFSET + cap_ptr as the
+	 * offset). Subtract CXL_CM_OFFSET so hdm_reg_offset is relative to
+	 * the CXL.mem register area start, which is where comp_reg_virt[0]
+	 * is anchored. The physical BAR address for hdm_iobase is recovered
+	 * by adding CXL_CM_OFFSET back in vfio_cxl_setup_virt_regs().
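+	 *
+	 * Worked example with hypothetical values: cap_ptr = 0x100 gives
+	 * offset = 0x1000 + 0x100 = 0x1100, so hdm_reg_offset = 0x100, and
+	 * the persistent hdm_iobase mapping set up later targets
+	 * BAR start + comp_reg_offset + 0x1000 + 0x100.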
+ */ + cxl->hdm_reg_offset = offset - CXL_CM_OFFSET; + cxl->hdm_reg_size = size; + + ret = cxl_regblock_get_bar_info(map, &bar, &bar_offset); + if (ret) + goto failed_unmap; + + cxl->comp_reg_bar = bar; + cxl->comp_reg_offset = bar_offset; + cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + ret = vfio_cxl_setup_virt_regs(vdev, cxl, base); + iounmap(base); + release_mem_region(map->resource, map->max_size); + if (ret) + return ret; + + return 0; + +failed_unmap: + iounmap(base); +failed_release: + release_mem_region(map->resource, map->max_size); + + return ret; +} + +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size) +{ + resource_size_t max_size; + + WARN_ON(cxl->precommitted); + + struct cxl_root_decoder *cxlrd __free(cxl_put_root_decoder) = + cxl_get_hpa_freespace(cxl->cxlmd, 1, + CXL_DECODER_F_RAM | CXL_DECODER_F_TYPE2, + &max_size); + if (IS_ERR(cxlrd)) + return PTR_ERR(cxlrd); + + /* Insufficient HPA space; cxlrd freed automatically by __free() */ + if (max_size < size) + return -ENOSPC; + + struct cxl_endpoint_decoder *cxled __free(cxl_dpa_free) = + cxl_request_dpa(cxl->cxlmd, CXL_PARTMODE_RAM, size); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + + struct cxl_region *region __free(cxl_unregister_region) = + cxl_create_region(cxlrd, &cxled, 1); + if (IS_ERR(region)) + return PTR_ERR(region); + + /* All operations succeeded; transfer ownership to cxl state */ + cxl->cxlrd = no_free_ptr(cxlrd); + cxl->cxled = no_free_ptr(cxled); + cxl->region = no_free_ptr(region); + + return 0; +} + +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl) +{ + if (!cxl->region) + return; + + cxl_unregister_region(cxl->region); + cxl->region = NULL; + + if (!cxl->precommitted) { + cxl_dpa_free(cxl->cxled); + cxl_put_root_decoder(cxl->cxlrd); + } + + cxl->cxled = NULL; + cxl->cxlrd = NULL; +} + +static int vfio_cxl_create_region_helper(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + struct pci_dev *pdev = vdev->pdev; + struct range range; + int ret; + + if (cxl->precommitted) { + struct cxl_endpoint_decoder *cxled; + struct cxl_region *region; + + cxled = cxl_get_committed_decoder(cxl->cxlmd, ®ion); + if (IS_ERR(cxled)) + return PTR_ERR(cxled); + cxl->cxled = cxled; + cxl->region = region; + } else { + ret = vfio_cxl_create_cxl_region(cxl, capacity); + if (ret) + return ret; + } + + if (!cxl->region) { + pci_err(pdev, "Failed to create CXL region\n"); + ret = -ENODEV; + goto failed; + } + + ret = cxl_get_region_range(cxl->region, &range); + if (ret) + goto failed; + + cxl->region_hpa = range.start; + cxl->region_size = range_len(&range); + + pci_dbg(pdev, "CXL region: HPA 0x%llx size %lu MB\n", + cxl->region_hpa, cxl->region_size >> 20); + + return 0; + +failed: + if (cxl->region) { + cxl_unregister_region(cxl->region); + cxl->region = NULL; + } + + cxl->cxled = NULL; + cxl->cxlrd = NULL; + + return ret; +} + +static int vfio_cxl_create_memdev(struct vfio_pci_cxl_state *cxl, + resource_size_t capacity) +{ + int ret; + + ret = cxl_set_capacity(&cxl->cxlds, capacity); + if (ret) + return ret; + + cxl->cxlmd = devm_cxl_add_memdev(&cxl->cxlds, NULL); + if (IS_ERR(cxl->cxlmd)) + return PTR_ERR(cxl->cxlmd); + + return 0; +} + +/* + * Free CXL state early on probe failure. devm_kfree() on a live devres + * allocation removes it from the list immediately, so the normal devres + * teardown at unbind time won't double-free it. 
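+ *
+ * Caller-side pairing, as a sketch (setup_step() is a hypothetical
+ * stand-in for any of the probe steps in vfio_pci_cxl_detect_and_init()):
+ *
+ *	cxl = vfio_cxl_create_device_state(pdev, dvsec);
+ *	if (IS_ERR(cxl))
+ *		return;
+ *	if (setup_step(vdev, cxl))
+ *		vfio_cxl_dev_state_free(pdev, cxl);	frees the devres now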
+ */ +static void vfio_cxl_dev_state_free(struct pci_dev *pdev, + struct vfio_pci_cxl_state *cxl) +{ + devm_kfree(&pdev->dev, cxl); +} + +/** + * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific + * CXL.mem device + * @vdev: VFIO PCI device + * + * Called from vfio_pci_core_register_device(). Detects CXL DVSEC capability + * and initializes CXL features. On failure vdev->cxl remains NULL and the + * device operates as a standard PCI device. + */ +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_cxl_state *cxl; + resource_size_t capacity = 0; + u16 dvsec; + int ret; + + /* Honor the user opt-out decision */ + if (vdev->disable_cxl) + return; + + if (!pcie_is_cxl(pdev)) + return; + + dvsec = pci_find_dvsec_capability(pdev, + PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + if (!dvsec) + return; + + /* + * CXL DVSEC found: any failure from here is a hard probe error on + * a confirmed CXL-capable device, not a silent non-CXL fallback. + * Warn the operator so misconfiguration is visible. + */ + cxl = vfio_cxl_create_device_state(pdev, dvsec); + if (IS_ERR(cxl)) { + if (PTR_ERR(cxl) != -ENODEV) + pci_warn(pdev, + "vfio-cxl: CXL device state allocation failed: %ld\n", + PTR_ERR(cxl)); + return; + } + + /* + * Required for ioremap of the component register block and + * calls to cxl_probe_component_regs(). + */ + ret = pci_enable_device_mem(pdev); + if (ret) { + pci_warn(pdev, + "vfio-cxl: pci_enable_device_mem failed: %d\n", ret); + goto free_cxl; + } + + ret = vfio_cxl_setup_regs(vdev, cxl); + if (ret) { + pci_warn(pdev, + "vfio-cxl: HDM register probing failed: %d\n", ret); + pci_disable_device(pdev); + goto free_cxl; + } + + cxl->cxlds.media_ready = !cxl_await_range_active(&cxl->cxlds); + if (!cxl->cxlds.media_ready) { + pci_warn(pdev, "CXL media not ready\n"); + pci_disable_device(pdev); + goto regs_failed; + } + + /* + * Take the single authoritative HDM decoder snapshot now that + * MEM_ACTIVE is confirmed and BAR memory is still enabled. Using + * readl() per-dword ensures correct MMIO serialisation and captures + * the final firmware-written values for all fields including SIZE_HIGH, + * which firmware commits to the BAR at MEM_ACTIVE time. + */ + vfio_cxl_reinit_comp_regs(cxl); + + pci_disable_device(pdev); + + capacity = vfio_cxl_read_committed_decoder_size(vdev, cxl); + if (capacity == 0) { + /* + * TODO: Add handling for devices which do not have + * firmware pre-committed decoders + */ + pci_info(pdev, "Uncommitted region size must be configured via sysfs before bind\n"); + goto regs_failed; + } + + cxl->precommitted = true; + cxl->dpa_size = capacity; + + pci_dbg(pdev, "Device capacity: %llu MB\n", capacity >> 20); + + ret = vfio_cxl_create_memdev(cxl, capacity); + if (ret) { + pci_warn(pdev, "Failed to create memdev\n"); + goto regs_failed; + } + + ret = vfio_cxl_create_region_helper(vdev, cxl, capacity); + if (ret) + goto regs_failed; + + /* + * Register probing succeeded. Assign vdev->cxl now so that + * all subsequent helpers can access state via vdev->cxl. + * All failure paths below clear vdev->cxl before calling + * vfio_cxl_dev_state_free(). 
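+	 *
+	 * The error labels below mirror init order in reverse, e.g.:
+	 *
+	 *	regs_failed: vfio_cxl_clean_virt_regs()	undoes vfio_cxl_setup_regs()
+	 *	free_cxl:    vfio_cxl_dev_state_free()	undoes vfio_cxl_create_device_state()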
+ */
+	vdev->cxl = cxl;
+
+	return;
+
+regs_failed:
+	vfio_cxl_clean_virt_regs(cxl);
+
+free_cxl:
+	vfio_cxl_dev_state_free(pdev, cxl);
+}
+
+void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	if (!cxl)
+		return;
+
+	vfio_cxl_clean_virt_regs(cxl);
+	vfio_cxl_destroy_cxl_region(cxl);
+}
+
+static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf)
+{
+	struct vfio_pci_region *region = vmf->vma->vm_private_data;
+	struct vfio_pci_cxl_state *cxl = region->data;
+	unsigned long pgoff;
+	unsigned long pfn;
+
+	if (!READ_ONCE(cxl->region_active))
+		return VM_FAULT_SIGBUS;
+
+	pgoff = vmf->pgoff &
+		((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	if (pgoff >= (cxl->region_size >> PAGE_SHIFT))
+		return VM_FAULT_SIGBUS;
+
+	pfn = PHYS_PFN(cxl->region_hpa) + pgoff;
+
+	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
+}
+
+static const struct vm_operations_struct vfio_cxl_region_vm_ops = {
+	.fault = vfio_cxl_region_vm_fault,
+};
+
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev,
+				struct vfio_pci_region *region,
+				struct vm_area_struct *vma)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	u64 req_len, pgoff, end;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+	    (vma->vm_flags & VM_READ))
+		return -EPERM;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+	    (vma->vm_flags & VM_WRITE))
+		return -EPERM;
+
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
+	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
+		return -EOVERFLOW;
+
+	if (end > cxl->region_size)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+			  VM_DONTEXPAND | VM_DONTDUMP);
+
+	vma->vm_ops = &vfio_cxl_region_vm_ops;
+	vma->vm_private_data = region;
+
+	return 0;
+}
+
+bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev)
+{
+	return vdev->cxl && pci_cxl_reset_capable(vdev->pdev);
+}
+
+/*
+ * vfio_cxl_prepare_reset - Invalidate all DPA region PTEs.
+ *
+ * Must be called with vdev->memory_lock held for writing. Sets
+ * region_active=false before zapping so any subsequent I/O to the region
+ * sees the inactive state and returns an error rather than accessing
+ * stale mappings.
+ */
+void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_device *core_vdev = &vdev->vdev;
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	if (!cxl || cxl->dpa_region_idx < 0)
+		return;
+
+	WRITE_ONCE(cxl->region_active, false);
+	unmap_mapping_range(core_vdev->inode->i_mapping,
+			    VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS +
+						     cxl->dpa_region_idx),
+			    cxl->region_size, true);
+}
+
+/*
+ * vfio_cxl_enable_memory_space - ensure PCI Memory Space is on before BAR reads.
+ *
+ * A reset caller may disable Memory Space to quiesce device DMA before
+ * issuing the reset. If a guest wrote PCI_COMMAND with Memory Space cleared
+ * before the FLR, pci_dev_save_and_disable() captures it disabled and
+ * restores it that way, so Memory Space can remain disabled on return.
+ * Accessing a BAR with Memory Space disabled produces an Unsupported
+ * Request completion; on platforms that promote UR to a fatal error this
+ * fires DPC.
+ */ +static void vfio_cxl_enable_memory_space(struct vfio_pci_core_device *vdev) +{ + u16 cmd; + + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(vdev->pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); +} + +/* + * vfio_cxl_reinit_hdm_shadow - reinitialise comp_reg_virt, preserving GPA bases. + * + * reinit_comp_regs() mirrors post-reset hardware state (all-zeros) into + * comp_reg_virt[], including the HDM decoder BASE registers. For decoders + * that the device manager committed with a guest-physical address before the + * reset, pci_dev_restore() will re-commit the hardware decoders with the + * host-physical base. The kernel provides no notification that BASE was + * cleared during reinit. Snapshot the GPA bases before reinit and restore + * them after so the emulated decoder remains consistent with what the device + * manager set. + * + * Called with memory_lock write side held (from vfio_cxl_finish_reset). + */ +static void vfio_cxl_reinit_hdm_shadow(struct vfio_pci_cxl_state *cxl) +{ + __le32 saved_lo[16] = {}, saved_hi[16] = {}; + u8 n, count = min_t(u8, cxl->hdm_count, ARRAY_SIZE(saved_lo)); + + if (cxl->comp_reg_virt) { + for (n = 0; n < count; n++) { + saved_lo[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)); + saved_hi[n] = *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)); + } + } + + vfio_cxl_reinit_comp_regs(cxl); + + if (cxl->comp_reg_virt) { + for (n = 0; n < count; n++) { + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_LOW_OFFSET(n)) = saved_lo[n]; + *hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(n)) = saved_hi[n]; + } + } +} + +/* + * vfio_cxl_finish_reset - Re-enable DPA region after reset. + * + * Must be called with vdev->memory_lock held for writing. Re-reads the + * HDM decoder state from hardware and sets region_active so that + * subsequent I/O to the region is permitted again. + */ +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + lockdep_assert_held_write(&vdev->memory_lock); + + if (!cxl) + return; + + vfio_cxl_enable_memory_space(vdev); + + /* + * Re-initialise the emulated HDM comp_reg_virt[] from hardware, + * preserving the GPA decoder bases set by the device manager. + */ + vfio_cxl_reinit_hdm_shadow(cxl); + + /* + * Only re-enable the DPA mmap if the hardware has actually + * re-committed decoder 0 after FLR. Read the COMMITTED bit from the + * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR + * hardware state, not stale pre-reset state. + * + * If COMMITTED is 0 (slow firmware re-commit path), leave + * region_active=false. Guest faults will return VM_FAULT_SIGBUS + * until the decoder is re-committed and the region is re-enabled. + */ + if (cxl->precommitted && cxl->comp_reg_virt) { + /* + * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset + * (now CXL.mem-relative) plus the within-HDM-block offset. 
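+		 *
+		 * Worked example with hypothetical numbers: for
+		 * hdm_reg_offset = 0x100, decoder 0's CTRL register sits at
+		 * HDM block offset 0x10 + 0x10 = 0x20, so this reads
+		 * comp_reg_virt[(0x100 + 0x20) / 4].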
+ */ + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, + CXL_HDM_DECODER0_CTRL_OFFSET(0))); + + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + WRITE_ONCE(cxl->region_active, true); + } +} + +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_cxl_state *cxl = core_dev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count || pos >= cxl->region_size) + return 0; + + /* + * Guard against access after a failed reset (region_active=false) + * or a release race (region_vaddr=NULL). Either condition means + * the memremap'd window is no longer valid; touching it would produce + * a Synchronous External Abort. Return -EIO so the caller gets a + * clean error rather than a kernel oops. + */ + if (!READ_ONCE(cxl->region_active) || !cxl->region_vaddr) + return -EIO; + + count = min(count, (size_t)(cxl->region_size - pos)); + + if (iswrite) { + if (copy_from_user(cxl->region_vaddr + pos, buf, count)) + return -EFAULT; + } else { + if (copy_to_user(buf, cxl->region_vaddr + pos, count)) + return -EFAULT; + } + + return count; +} + +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_device *core_vdev = &vdev->vdev; + struct vfio_pci_cxl_state *cxl = region->data; + + /* + * Deactivate the region before removing user mappings so that any + * fault handler racing the release returns VM_FAULT_SIGBUS rather + * than inserting a PFN into an unmapped region. + */ + WRITE_ONCE(cxl->region_active, false); + + /* + * Remove all user mappings of the DPA region while the device is + * still alive. + */ + if (cxl->dpa_region_idx >= 0) + unmap_mapping_range(core_vdev->inode->i_mapping, + VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + + cxl->dpa_region_idx), + cxl->region_size, true); + + if (cxl->region_vaddr) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + } +} + +static const struct vfio_pci_regops vfio_cxl_regops = { + .rw = vfio_cxl_region_rw, + .mmap = vfio_cxl_region_mmap, + .release = vfio_cxl_region_release, +}; + +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 flags; + int ret; + + if (!cxl) + return -ENODEV; + + if (!cxl->region || cxl->region_vaddr) + return -ENODEV; + + /* + * CXL device memory is RAM, not MMIO. Use memremap() rather than + * ioremap_cache() so the correct memory-mapping API is used. + * The WB attribute matches the cache-coherent nature of CXL.mem. + */ + cxl->region_vaddr = memremap(cxl->region_hpa, cxl->region_size, + MEMREMAP_WB); + if (!cxl->region_vaddr) + return -ENOMEM; + + flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + ret = vfio_pci_core_register_dev_region(vdev, + PCI_VENDOR_ID_CXL | + VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_CXL, + &vfio_cxl_regops, + cxl->region_size, flags, + cxl); + if (ret) { + memunmap(cxl->region_vaddr); + cxl->region_vaddr = NULL; + return ret; + } + + /* + * Cache the vdev->region[] index before activating the region. + * vfio_pci_core_register_dev_region() placed the new entry at + * vdev->region[num_regions - 1] and incremented num_regions. + * vfio_cxl_prepare_reset() uses this to avoid scanning + * vdev->region[] on every FLR. 
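+	 *
+	 * For example, with dpa_region_idx = 0 the file offset used both by
+	 * userspace mmap() and by unmap_mapping_range() in
+	 * vfio_cxl_prepare_reset() is
+	 * VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + 0).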
+ */ + cxl->dpa_region_idx = vdev->num_regions - 1; + + vfio_cxl_reinit_comp_regs(cxl); + + WRITE_ONCE(cxl->region_active, true); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_cxl_register_cxl_region); + +/** + * vfio_cxl_unregister_cxl_region - Undo vfio_cxl_register_cxl_region() + * @vdev: VFIO PCI device + * + * Marks the DPA region inactive and resets dpa_region_idx. + * Does NOT touch CXL subsystem state (cxl->region, cxl->cxled, cxl->cxlrd). + * The caller must call vfio_cxl_destroy_cxl_region() separately to release + * those objects. + */ +void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + + if (!cxl || cxl->dpa_region_idx < 0) + return; + + WRITE_ONCE(cxl->region_active, false); + + cxl->dpa_region_idx = -1; +} +EXPORT_SYMBOL_GPL(vfio_cxl_unregister_cxl_region); + +MODULE_IMPORT_NS("CXL"); diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c new file mode 100644 index 0000000000000..250407e8bf701 --- /dev/null +++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include + +#include "../vfio_pci_priv.h" +#include "vfio_cxl_priv.h" + +/* + * comp_reg_virt[] shadow layout: + * Covers the full CXL.mem register area (starting at CXL_CM_OFFSET + * within the component register block). Index 0 is the CXL Capability + * Array Header; the HDM decoder block starts at index + * hdm_reg_offset / sizeof(__le32). + * + * Register layout within the HDM block (CXL spec 4.0 8.2.4.20 CXL HDM Decoder + * Capability Structure): + * 0x00: HDM Decoder Capability + * 0x04: HDM Decoder Global Control + * 0x08: (reserved) + * 0x0c: (reserved) + * For each decoder N (N=0..hdm_count-1), at base 0x10 + N*0x20: + * +0x00: BASE_LO + * +0x04: BASE_HI + * +0x08: SIZE_LO + * +0x0c: SIZE_HI + * +0x10: CTRL + * +0x14: TARGET_LIST_LO + * +0x18: TARGET_LIST_HI + * +0x1c: (reserved) + */ + +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off) +{ + /* + * hdm_off is a byte offset within the HDM decoder block. + * comp_reg_virt covers the CXL.mem register area starting at + * CXL_CM_OFFSET within the component register block. + * hdm_reg_offset is CXL.mem-relative, so adding hdm_reg_offset + * gives the correct index into comp_reg_virt[]. + */ + return &cxl->comp_reg_virt[(cxl->hdm_reg_offset + hdm_off) / + sizeof(__le32)]; +} + +static ssize_t virt_hdm_rev_reg_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + /* Discard writes on reserved registers. */ + return size; +} + +static ssize_t hdm_decoder_n_lo_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 new_val = le32_to_cpu(*val32); + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* Bits [27:0] are reserved. */ + new_val &= ~CXL_HDM_DECODER_BASE_LO_RESERVED_MASK; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +static ssize_t hdm_decoder_global_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 size) +{ + u32 hdm_gcap; + u32 new_val = le32_to_cpu(*val32); + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + /* Bit [31:2] are reserved. */ + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK; + + /* Poison On Decode Error Enable (bit 0) is RO=0 if not supported. 
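+	 * For example, a guest write of 0x1 is masked back to 0 below when
+	 * hdm_gcap does not advertise poison-on-decode-error support.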
*/ + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + if (!(hdm_gcap & CXL_HDM_DECODER_POISON_ON_DECODE_ERR)) + new_val &= ~CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT; + + *hdm_reg_ptr(vdev->cxl, CXL_HDM_DECODER_CTRL_OFFSET) = + cpu_to_le32(new_val); + + return size; +} + +/** + * hdm_decoder_n_ctrl_write - Write handler for HDM decoder CTRL register. + * @vdev: VFIO PCI core device + * @val32: New register value supplied by userspace (little-endian) + * @offset: Byte offset within the HDM block for this decoder's CTRL register + * @size: Access size in bytes; must equal CXL_REG_SIZE_DWORD + * + * The COMMIT bit (bit 9) is the key: setting it requests the hardware to + * lock the decoder. The emulated COMMITTED bit (bit 10) mirrors COMMIT + * immediately to allow QEMU's notify_change to detect the transition and + * map/unmap the DPA MemoryRegion in the guest address space. + * + * Note: the actual hardware HDM decoder programming (writing the real + * BASE/SIZE with host physical addresses) happens in the QEMU notify_change + * callback BEFORE this write reaches the hardware. This ordering is + * correct because vfio_region_write() calls notify_change() first. + * + * Return: @size on success, %-EINVAL if @size is not %CXL_REG_SIZE_DWORD. + */ +static ssize_t hdm_decoder_n_ctrl_write(struct vfio_pci_core_device *vdev, + const __le32 *val32, u64 offset, u64 size) +{ + u32 hdm_gcap; + u32 ro_mask = CXL_HDM_DECODER_CTRL_RO_BITS_MASK; + u32 rev_mask = CXL_HDM_DECODER_CTRL_RESERVED_MASK; + u32 new_val = le32_to_cpu(*val32); + u32 cur_val; + + if (WARN_ON_ONCE(size != CXL_REG_SIZE_DWORD)) + return -EINVAL; + + cur_val = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, offset)); + if (cur_val & CXL_HDM_DECODER0_CTRL_LOCK) { + if (new_val & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + + /* LOCK_0 only: preserve all other bits, clear LOCK */ + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32( + cur_val & ~CXL_HDM_DECODER0_CTRL_LOCK); + return size; + } + + hdm_gcap = le32_to_cpu(*hdm_reg_ptr(vdev->cxl, + CXL_HDM_DECODER_CAP_OFFSET)); + ro_mask |= CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO; + rev_mask |= CXL_HDM_DECODER_CTRL_DEVICE_RESERVED; + + if (!(hdm_gcap & CXL_HDM_DECODER_UIO_CAPABLE)) + rev_mask |= CXL_HDM_DECODER_CTRL_UIO_RESERVED; + + new_val &= ~rev_mask; + cur_val &= ro_mask; + new_val = (new_val & ~ro_mask) | cur_val; + + /* + * Mirror COMMIT to COMMITTED immediately in the emulated state. + */ + if (new_val & CXL_HDM_DECODER0_CTRL_COMMIT) + new_val |= CXL_HDM_DECODER0_CTRL_COMMITTED; + else + new_val &= ~CXL_HDM_DECODER0_CTRL_COMMITTED; + + *hdm_reg_ptr(vdev->cxl, offset) = cpu_to_le32(new_val); + + return size; +} + +/* + * Dispatch table for COMP_REGS region writes. Indexed by byte offset within + * the HDM decoder block. Returns the appropriate write handler. 
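+ *
+ * Worked dispatch example: a 4-byte write at off = 0x34 lands on decoder
+ * (0x34 - 0x10) / 0x20 = 1 at dec_off (0x34 - 0x10) % 0x20 = 0x04, i.e.
+ * decoder 1's BASE_HI handler.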
+ * + * Layout: + * 0x00 HDM Decoder Capability (RO) + * 0x04 HDM Global Control (RW with reserved masking) + * 0x08-0x0f (reserved) (ignored) + * Per decoder N, base = 0x10 + N*0x20: + * base+0x00 BASE_LO (RW, [27:0] reserved) + * base+0x04 BASE_HI (RW) + * base+0x08 SIZE_LO (RW, [27:0] reserved) + * base+0x0c SIZE_HI (RW) + * base+0x10 CTRL (RW, complex rules) + * base+0x14 TARGET_LIST_LO (ignored for Type-2) + * base+0x18 TARGET_LIST_HI (ignored for Type-2) + * base+0x1c (reserved) (ignored) + */ +static ssize_t comp_regs_dispatch_write(struct vfio_pci_core_device *vdev, + u32 off, const __le32 *val32, u32 size) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + u32 dec_base, dec_off; + + /* HDM Decoder Capability (0x00): RO */ + if (off == CXL_HDM_DECODER_CAP_OFFSET) + return size; + + /* HDM Global Control (0x04) */ + if (off == CXL_HDM_DECODER_CTRL_OFFSET) + return hdm_decoder_global_ctrl_write(vdev, val32, size); + + /* + * Offsets 0x08-0x0f are reserved per CXL 4.0 Table 8-115. + * Per-decoder registers start at 0x10, stride 0x20 + */ + if (off < CXL_HDM_DECODER_FIRST_BLOCK_OFFSET) + return size; /* reserved gap */ + + dec_base = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET; + /* + * Reject accesses beyond the last implemented HDM decoder. + * Without this check an out-of-bounds offset would silently + * corrupt comp_reg_virt[] memory past the end of the allocation. + */ + if ((off - dec_base) / CXL_HDM_DECODER_BLOCK_STRIDE >= cxl->hdm_count) + return size; + + dec_off = (off - dec_base) % CXL_HDM_DECODER_BLOCK_STRIDE; + + switch (dec_off) { + case CXL_HDM_DECODER_N_BASE_LOW_OFFSET: /* BASE_LO */ + case CXL_HDM_DECODER_N_SIZE_LOW_OFFSET: /* SIZE_LO */ + return hdm_decoder_n_lo_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_BASE_HIGH_OFFSET: /* BASE_HI */ + case CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET: /* SIZE_HI */ + { + /* Full 32-bit write, no reserved bits; frozen when COMMIT_LOCK set */ + u32 ctrl_off = off - dec_off + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + return size; + *hdm_reg_ptr(cxl, off) = *val32; + return size; + } + case CXL_HDM_DECODER_N_CTRL_OFFSET: /* CTRL */ + return hdm_decoder_n_ctrl_write(vdev, val32, off, size); + case CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET: + case CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET: + case CXL_HDM_DECODER_N_REV_OFFSET: + return virt_hdm_rev_reg_write(vdev, val32, off, size); + default: + return size; + } +} + +/* + * vfio_cxl_comp_regs_rw - regops rw handler for + * VFIO_REGION_SUBTYPE_CXL_COMP_REGS. + * + * Reads return the emulated HDM state (comp_reg_virt[]). + * Writes go through comp_regs_dispatch_write() for bit-field enforcement. + * Only 4-byte aligned 4-byte accesses are supported (hardware requirement). + */ +static ssize_t vfio_cxl_comp_regs_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + struct vfio_pci_cxl_state *cxl = vdev->cxl; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + ssize_t ret = 0; + size_t done = 0; + + if (!count) + return 0; + + /* Clamp to total region size: cap array prefix + HDM block */ + if (pos >= cxl->hdm_reg_offset + cxl->hdm_reg_size) + return -EINVAL; + count = min(count, + (size_t)(cxl->hdm_reg_offset + cxl->hdm_reg_size - pos)); + + /* + * Serialise against vfio_cxl_reinit_hdm_shadow(), which holds + * memory_lock write-side while it saves, zeroes, and restores + * comp_reg_virt[] during reset. 
Without this read lock a concurrent + * COMP_REGS write can land between the save snapshot and the restore, + * causing the restore to silently overwrite it. A concurrent read + * can observe the array mid-rebuild. + */ + down_read(&vdev->memory_lock); + + while (done < count) { + u32 sz = count - done; + u32 off = pos + done; + __le32 v; + + /* Enforce exactly 4-byte, 4-byte-aligned accesses */ + if (sz != CXL_REG_SIZE_DWORD || (off & 0x3)) { + ret = done ? (ssize_t)done : -EINVAL; + goto out_unlock; + } + + if (iswrite) { + if (off < cxl->hdm_reg_offset) { + /* Cap array area is read-only; discard writes */ + done += sizeof(v); + continue; + } + if (copy_from_user(&v, buf + done, sizeof(v))) { + ret = done ? (ssize_t)done : -EFAULT; + goto out_unlock; + } + comp_regs_dispatch_write(vdev, + off - cxl->hdm_reg_offset, + &v, sizeof(v)); + } else { + /* Read from extended buffer - covers cap array and HDM */ + v = cxl->comp_reg_virt[off / sizeof(__le32)]; + if (copy_to_user(buf + done, &v, sizeof(v))) { + ret = done ? (ssize_t)done : -EFAULT; + goto out_unlock; + } + } + done += sizeof(v); + } + + ret = done; + *ppos += done; +out_unlock: + up_read(&vdev->memory_lock); + return ret; +} + +static void vfio_cxl_comp_regs_release(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region) +{ + /* comp_reg_virt is freed in vfio_cxl_clean_virt_regs() */ +} + +static const struct vfio_pci_regops vfio_cxl_comp_regs_ops = { + .rw = vfio_cxl_comp_regs_rw, + .release = vfio_cxl_comp_regs_release, +}; + +/* + * vfio_cxl_setup_virt_regs - Allocate emulated HDM register state. + * + * Allocates comp_reg_virt as a compact __le32 array covering only + * hdm_reg_size bytes of HDM decoder registers. The initial values + * are read from hardware via the BAR ioremap established by the caller. + * + * DVSEC state is accessed via vdev->vconfig (see the following patch). + */ +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base) +{ + size_t total_size, nregs, i; + + if (WARN_ON(!cxl->hdm_reg_size)) + return -EINVAL; + + total_size = cxl->hdm_reg_offset + cxl->hdm_reg_size; + + if (pci_resource_len(vdev->pdev, cxl->comp_reg_bar) < + cxl->comp_reg_offset + CXL_CM_OFFSET + total_size) + return -ENODEV; + + nregs = total_size / sizeof(__le32); + cxl->comp_reg_virt = kcalloc(nregs, sizeof(__le32), GFP_KERNEL); + if (!cxl->comp_reg_virt) + return -ENOMEM; + + /* + * Snapshot the CXL.mem register area from the caller's mapping. + * cap_base maps the component register block from comp_reg_offset. + * The CXL.mem registers start at CXL_CM_OFFSET (= 0x1000) within that + * block; reading from cap_base + CXL_CM_OFFSET ensures comp_reg_virt[0] + * holds the CXL Capability Array Header required by guest drivers. + */ + for (i = 0; i < nregs; i++) + cxl->comp_reg_virt[i] = + cpu_to_le32(readl(cap_base + CXL_CM_OFFSET + + i * sizeof(__le32))); + + /* + * Establish persistent mapping; kept alive until + * vfio_cxl_clean_virt_regs(). + */ + cxl->hdm_iobase = ioremap(pci_resource_start(vdev->pdev, + cxl->comp_reg_bar) + + cxl->comp_reg_offset + CXL_CM_OFFSET + + cxl->hdm_reg_offset, + cxl->hdm_reg_size); + if (!cxl->hdm_iobase) { + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; + return -ENOMEM; + } + + return 0; +} + +/* + * vfio_cxl_read_committed_decoder_size - Extract committed DPA capacity from + * comp_reg_virt[]. 
+ * + * Called from probe context after vfio_cxl_reinit_comp_regs() has taken the + * post-MEM_ACTIVE readl() snapshot and patched SIZE_HIGH/SIZE_LOW from DVSEC. + * comp_reg_virt[] is already correct at this point; no hardware access needed. + * + * Returns the committed DPA capacity in bytes, or 0 if the decoder is not + * committed. + */ +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl) +{ + struct pci_dev *pdev = vdev->pdev; + resource_size_t capacity; + u32 ctrl, sz_hi, sz_lo; + + if (WARN_ON(!cxl || !cxl->comp_reg_virt)) + return 0; + + ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_CTRL_OFFSET(0))); + sz_hi = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(0))); + sz_lo = le32_to_cpu(*hdm_reg_ptr(cxl, CXL_HDM_DECODER0_SIZE_LOW_OFFSET(0))); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) { + pci_dbg(pdev, + "vfio_cxl: decoder0 not committed: ctrl=0x%08x\n", + ctrl); + return 0; + } + + capacity = ((resource_size_t)sz_hi << 32) | (sz_lo & GENMASK(31, 28)); + + pci_dbg(pdev, + "vfio_cxl: decoder0 committed: sz_hi=0x%08x sz_lo=0x%08x capacity=0x%llx\n", + sz_hi, sz_lo, (unsigned long long)capacity); + + return capacity; +} + +/* + * Called with memory_lock write side held (from vfio_cxl_reinit_hdm_shadow). + * Uses the pre-established hdm_iobase, no ioremap() under the lock, + * which would deadlock on PREEMPT_RT where ioremap() can sleep. + */ +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl) +{ + size_t i, nregs; + u32 n; + + if (!cxl || !cxl->comp_reg_virt || !cxl->hdm_iobase) + return; + + nregs = cxl->hdm_reg_size / sizeof(__le32); + + for (i = 0; i < nregs; i++) + *hdm_reg_ptr(cxl, i * sizeof(__le32)) = + cpu_to_le32(readl(cxl->hdm_iobase + + i * sizeof(__le32))); + + /* + * For firmware-committed decoders, clear COMMIT_LOCK (bit 8) and zero + * BASE in comp_reg_virt[] so QEMU can write the correct guest GPA via + * setup_locked_hdm() before guest DPA access begins. + * + * Check the COMMITTED bit (bit 10) directly from the freshly-snapshotted + * ctrl register rather than relying on cxl->precommitted. At probe time + * this function is called before cxl->precommitted is set (it is set + * after vfio_cxl_read_committed_decoder_size() succeeds), so using + * cxl->precommitted here would silently skip the LOCK clearing and leave + * the hardware HPA in comp_reg_virt[]. + */ + for (n = 0; n < cxl->hdm_count; n++) { + u32 ctrl_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_CTRL_OFFSET; + u32 base_lo_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_LOW_OFFSET; + u32 base_hi_off = CXL_HDM_DECODER_FIRST_BLOCK_OFFSET + + n * CXL_HDM_DECODER_BLOCK_STRIDE + + CXL_HDM_DECODER_N_BASE_HIGH_OFFSET; + u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl, ctrl_off)); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + continue; + + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) { + *hdm_reg_ptr(cxl, ctrl_off) = + cpu_to_le32(ctrl & + ~CXL_HDM_DECODER0_CTRL_LOCK); + *hdm_reg_ptr(cxl, base_lo_off) = 0; + *hdm_reg_ptr(cxl, base_hi_off) = 0; + } + } +} + +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl) +{ + if (cxl->hdm_iobase) { + iounmap(cxl->hdm_iobase); + cxl->hdm_iobase = NULL; + } + kfree(cxl->comp_reg_virt); + cxl->comp_reg_virt = NULL; +} + +/* + * vfio_cxl_register_comp_regs_region - Register the COMP_REGS device region. 
+ *
+ * Exposes the emulated HDM decoder register state as a VFIO device region
+ * with type VFIO_REGION_SUBTYPE_CXL_COMP_REGS. QEMU attaches a
+ * notify_change callback to this region to intercept HDM COMMIT writes
+ * and map the DPA MemoryRegion at the appropriate GPA.
+ *
+ * The region is read+write only (no mmap) to ensure all accesses pass
+ * through comp_regs_dispatch_write() for proper bit-field enforcement.
+ */
+int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+	int ret;
+
+	if (!cxl || !cxl->comp_reg_virt)
+		return -ENODEV;
+
+	ret = vfio_pci_core_register_dev_region(vdev,
+						PCI_VENDOR_ID_CXL |
+						VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+						VFIO_REGION_SUBTYPE_CXL_COMP_REGS,
+						&vfio_cxl_comp_regs_ops,
+						cxl->hdm_reg_offset +
+						cxl->hdm_reg_size, flags, cxl);
+	if (!ret)
+		cxl->comp_reg_region_idx = vdev->num_regions - 1;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_cxl_register_comp_regs_region);
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
new file mode 100644
index 0000000000000..611ef793006c6
--- /dev/null
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common infrastructure for CXL Type-2 device variant drivers
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef __LINUX_VFIO_CXL_PRIV_H
+#define __LINUX_VFIO_CXL_PRIV_H
+
+#include
+#include
+#include
+
+/* CXL device state embedded in vfio_pci_core_device */
+struct vfio_pci_cxl_state {
+	struct cxl_dev_state cxlds;
+	struct cxl_memdev *cxlmd;
+	struct cxl_root_decoder *cxlrd;
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_region *region;
+	resource_size_t region_hpa;
+	size_t region_size;
+	void *region_vaddr;
+	resource_size_t hdm_reg_offset;
+	size_t hdm_reg_size;
+	resource_size_t comp_reg_offset;
+	size_t comp_reg_size;
+	__le32 *comp_reg_virt;
+	size_t dpa_size;
+	void __iomem *hdm_iobase;
+	int dpa_region_idx;
+	int comp_reg_region_idx;
+	u16 dvsec_len;
+	u8 hdm_count;
+	u8 comp_reg_bar;
+	bool cache_capable;
+	bool precommitted;
+	bool region_active;
+};
+
+/* Register access sizes */
+#define CXL_REG_SIZE_WORD 2
+#define CXL_REG_SIZE_DWORD 4
+
+/* HDM Decoder - register offsets (CXL 4.0 Table 8-115) */
+#define CXL_HDM_DECODER_GLOBAL_CTRL_OFFSET 0x4
+#define CXL_HDM_DECODER_FIRST_BLOCK_OFFSET 0x10
+#define CXL_HDM_DECODER_BLOCK_STRIDE 0x20
+#define CXL_HDM_DECODER_N_BASE_LOW_OFFSET 0x0
+#define CXL_HDM_DECODER_N_BASE_HIGH_OFFSET 0x4
+#define CXL_HDM_DECODER_N_SIZE_LOW_OFFSET 0x8
+#define CXL_HDM_DECODER_N_SIZE_HIGH_OFFSET 0xc
+#define CXL_HDM_DECODER_N_CTRL_OFFSET 0x10
+#define CXL_HDM_DECODER_N_TARGET_LIST_LOW_OFFSET 0x14
+#define CXL_HDM_DECODER_N_TARGET_LIST_HIGH_OFFSET 0x18
+#define CXL_HDM_DECODER_N_REV_OFFSET 0x1c
+
+/*
+ * HDM Decoder N Control emulation masks.
+ *
+ * Single-bit hardware definitions live in the CXL core headers as
+ * CXL_HDM_DECODER0_CTRL_* (bits 0-14) and CXL_HDM_DECODER_*_CAP.
+ * The masks below express emulation policy for a CXL.mem device.
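+ *
+ * Worked example of the CTRL policy (value hypothetical): a guest write of
+ * 0xffffffff first loses the reserved bits via the *_RESERVED masks, then
+ * has the bits named in CXL_HDM_DECODER_CTRL_RO_BITS_MASK and
+ * CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO re-injected from the current shadow
+ * value, so only architecturally writable fields change; finally COMMIT is
+ * mirrored into COMMITTED by hdm_decoder_n_ctrl_write().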
+ */
+#define CXL_HDM_DECODER_CTRL_RO_BITS_MASK (BIT(10) | BIT(11))
+#define CXL_HDM_DECODER_CTRL_RESERVED_MASK (BIT(15) | GENMASK(31, 28))
+#define CXL_HDM_DECODER_CTRL_DEVICE_BITS_RO BIT(12)
+#define CXL_HDM_DECODER_CTRL_DEVICE_RESERVED (GENMASK(19, 16) | GENMASK(23, 20))
+#define CXL_HDM_DECODER_CTRL_UIO_RESERVED (BIT(14) | GENMASK(27, 24))
+/*
+ * bit 13 (BI) is RsvdP for devices without CXL.cache (Cache_Capable=0).
+ * HDM-D (CXL.mem only) decoders must not have BI set by the guest.
+ */
+#define CXL_HDM_DECODER_CTRL_BI_RESERVED BIT(13)
+#define CXL_HDM_DECODER_BASE_LO_RESERVED_MASK GENMASK(27, 0)
+
+#define CXL_HDM_DECODER_GLOBAL_CTRL_RESERVED_MASK GENMASK(31, 2)
+#define CXL_HDM_DECODER_GLOBAL_CTRL_POISON_EN_BIT BIT(0)
+
+/*
+ * DVSEC register offsets and per-bit hardware definitions live in the CXL
+ * core headers as CXL_DVSEC_*. The masks below encode emulation policy:
+ * which bits to ignore, and which to preserve separately from their raw
+ * hardware state.
+ */
+/* DVSEC Control (0x0C): bits 13 (RsvdP) and 15 (RsvdP) are always discarded */
+#define CXL_CTRL_RESERVED_MASK (BIT(13) | BIT(15))
+/* bit 12 (P2P_Mem_Enable) treated as reserved if Cap3.P2P_Mem_Capable=0 */
+#define CXL_CTRL_P2P_REV_MASK CXL_DVSEC_CTRL_P2P_MEM_ENABLE
+
+/* DVSEC Status (0x0E): bits 13:0 and 15 are RsvdZ */
+#define CXL_STATUS_RESERVED_MASK (GENMASK(13, 0) | BIT(15))
+
+/*
+ * DVSEC Control2 (0x10) emulation masks.
+ *
+ * CXL_CTRL2_HW_BITS_MASK: bits 1 (Initiate_Cache_WBI) and 2
+ * (Initiate_CXL_Reset) always read 0 from hardware; they are write-only
+ * action triggers per CXL 4.0 section 8.1.3.8, Table 8-8. Forward these to
+ * the device to trigger the hardware action; clear them from the vconfig
+ * shadow so that subsequent guest reads return 0 as hardware requires.
+ *
+ * NOTE: bit 0 (Disable_Caching) and bit 3 (CXL_Reset_Mem_Clr_Enable) are
+ * ordinary RW fields; they must be preserved in vconfig, not forwarded.
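+ *
+ * Illustrative CONTROL2 write (value hypothetical): a guest write of
+ * 0x0003 forwards bit 1 (Initiate_Cache_WBI) to hardware but keeps only
+ * bit 0 (Disable_Caching) in the vconfig shadow, so a subsequent guest
+ * read returns 0x0001.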
+ */ +#define CXL_CTRL2_RESERVED_MASK GENMASK(15, 6) +#define CXL_CTRL2_HW_BITS_MASK (BIT(1) | BIT(2)) +/* bit 4 is RsvdP if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_CTRL2_VOLATILE_HDM_REV_MASK CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM +/* bit 5 is RsvdP if Cap2.Mod_Completion_Capable=0 */ +#define CXL_CTRL2_MODIFIED_COMP_REV_MASK CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE + +/* DVSEC Lock (0x14): bits 15:1 are RsvdP */ +#define CXL_LOCK_RESERVED_MASK GENMASK(15, 1) + +/* DVSEC Range Base Low: bits 27:0 are reserved per Tables 8-15/8-19 */ +#define CXL_BASE_LO_RESERVED_MASK CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK + +int vfio_cxl_setup_virt_regs(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl, + void __iomem *cap_base); +void vfio_cxl_clean_virt_regs(struct vfio_pci_cxl_state *cxl); +void vfio_cxl_reinit_comp_regs(struct vfio_pci_cxl_state *cxl); +resource_size_t +vfio_cxl_read_committed_decoder_size(struct vfio_pci_core_device *vdev, + struct vfio_pci_cxl_state *cxl); +int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl, + resource_size_t size); +void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl); + +__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off); + +#endif /* __LINUX_VFIO_CXL_PRIV_H */ diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fde33f54e99ec..6f880c247d61c 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) return 0; } +static void qm_xqc_reg_offsets(struct hisi_qm *qm, + u32 *eqc_addr, u32 *aeqc_addr) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + container_of(qm, struct hisi_acc_vf_core_device, vf_qm); + + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) { + *eqc_addr = QM_EQC_VF_DW0; + *aeqc_addr = QM_AEQC_VF_DW0; + } else { + *eqc_addr = QM_EQC_PF_DW0; + *aeqc_addr = QM_AEQC_PF_DW0; + } +} + static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_AEQC_DW\n"); return ret; @@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; /* Check VF state */ @@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = 
qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_AEQC_DW\n"); return ret; @@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct pci_dev *vf_dev = vdev->pdev; + u32 val; - /* - * ACC VF dev BAR2 region consists of both functional register space - * and migration control register space. For migration to work, we - * need access to both. Hence, we map the entire BAR2 region here. - * But unnecessarily exposing the migration BAR region to the Guest - * has the potential to prevent/corrupt the Guest migration. Hence, - * we restrict access to the migration control space from - * Guest(Please see mmap/ioctl/read/write override functions). - * - * Please note that it is OK to expose the entire VF BAR if migration - * is not supported or required as this cannot affect the ACC PF - * configurations. - * - * Also the HiSilicon ACC VF devices supported by this driver on - * HiSilicon hardware platforms are integrated end point devices - * and the platform lacks the capability to perform any PCIe P2P - * between these devices. - */ + val = readl(pf_qm->io_base + QM_MIG_REGION_SEL); + if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN)) + hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL; + else + hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL; - vf_qm->io_base = - ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), - pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); - if (!vf_qm->io_base) - return -EIO; + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) { + /* + * On hardware platforms greater than QM_HW_V3, the migration function + * register is placed in the BAR2 configuration region of the PF, + * and each VF device occupies 8KB of configuration space. + */ + vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET + + hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE; + } else { + /* + * ACC VF dev BAR2 region consists of both functional register space + * and migration control register space. For migration to work, we + * need access to both. Hence, we map the entire BAR2 region here. + * But unnecessarily exposing the migration BAR region to the Guest + * has the potential to prevent/corrupt the Guest migration. Hence, + * we restrict access to the migration control space from + * Guest(Please see mmap/ioctl/read/write override functions). + * + * Please note that it is OK to expose the entire VF BAR if migration + * is not supported or required as this cannot affect the ACC PF + * configurations. + * + * Also the HiSilicon ACC VF devices supported by this driver on + * HiSilicon hardware platforms are integrated end point devices + * and the platform lacks the capability to perform any PCIe P2P + * between these devices. + */ + vf_qm->io_base = + ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), + pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); + if (!vf_qm->io_base) + return -EIO; + } vf_qm->fun_type = QM_HW_VF; + vf_qm->ver = pf_qm->ver; vf_qm->pdev = vf_dev; mutex_init(&vf_qm->mailbox_lock); @@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev) return !IS_ERR(pf_qm) ? 
pf_qm : NULL;
 }
 
+static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev,
+					unsigned int index)
+{
+	struct hisi_acc_vf_core_device *hisi_acc_vdev =
+		hisi_acc_drvdata(vdev->pdev);
+
+	/*
+	 * In the old HW_ACC_MIG_VF_CTRL mode, the ACC VF device BAR2
+	 * region encompasses both functional register space and
+	 * migration control register space.
+	 * Only the functional region should be reported to the Guest.
+	 */
+	if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL)
+		return (pci_resource_len(vdev->pdev, index) >> 1);
+	/*
+	 * In the new HW_ACC_MIG_PF_CTRL mode, the migration control
+	 * register space has been moved to the PF device BAR2 region.
+	 * The VF device BAR2 is entirely functional register space.
+	 */
+	return pci_resource_len(vdev->pdev, index);
+}
+
 static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
 					size_t count, loff_t *ppos,
 					size_t *new_count)
@@ -1260,8 +1319,9 @@
 
 	if (index == VFIO_PCI_BAR2_REGION_INDEX) {
 		loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-		resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+		resource_size_t end;
 
+		end = hisi_acc_get_resource_len(vdev, index);
 		/* Check if access is for migration control region */
 		if (pos >= end)
 			return -EINVAL;
@@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev,
 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
 	if (index == VFIO_PCI_BAR2_REGION_INDEX) {
 		u64 req_len, pgoff, req_start;
-		resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+		resource_size_t end;
 
+		end = hisi_acc_get_resource_len(vdev, index);
 		req_len = vma->vm_end - vma->vm_start;
 		pgoff = vma->vm_pgoff &
 			((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
@@ -1324,43 +1385,23 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev,
 	return vfio_pci_core_read(core_vdev, buf, new_count, ppos);
 }
 
-static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
-				    unsigned long arg)
+static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev,
+					  struct vfio_region_info *info,
+					  struct vfio_info_cap *caps)
 {
-	if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
-		struct vfio_pci_core_device *vdev =
-			container_of(core_vdev, struct vfio_pci_core_device, vdev);
-		struct pci_dev *pdev = vdev->pdev;
-		struct vfio_region_info info;
-		unsigned long minsz;
-
-		minsz = offsetofend(struct vfio_region_info, offset);
-
-		if (copy_from_user(&info, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (info.argsz < minsz)
-			return -EINVAL;
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
 
-		if (info.index == VFIO_PCI_BAR2_REGION_INDEX) {
-			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+	if (info->index != VFIO_PCI_BAR2_REGION_INDEX)
+		return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
 
-			/*
-			 * ACC VF dev BAR2 region consists of both functional
-			 * register space and migration control register space.
-			 * Report only the functional region to Guest.
-			 */
-			info.size = pci_resource_len(pdev, info.index) / 2;
+	info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
 
-			info.flags = VFIO_REGION_INFO_FLAG_READ |
-					VFIO_REGION_INFO_FLAG_WRITE |
-					VFIO_REGION_INFO_FLAG_MMAP;
+	info->size = hisi_acc_get_resource_len(vdev, info->index);
 
-			return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0; - } - } - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1521,7 +1562,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; - iounmap(vf_qm->io_base); + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + iounmap(vf_qm->io_base); mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } @@ -1557,7 +1599,8 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, - .ioctl = hisi_acc_vfio_pci_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, @@ -1577,6 +1620,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 91002ceeebc18..cd55eba64dfb2 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -50,8 +50,10 @@ #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 -#define QM_EQC_DW0 0X8000 -#define QM_AEQC_DW0 0X8020 +#define QM_EQC_VF_DW0 0X8000 +#define QM_AEQC_VF_DW0 0X8020 +#define QM_EQC_PF_DW0 0x1c00 +#define QM_AEQC_PF_DW0 0x1c20 #define ACC_DRV_MAJOR_VER 1 #define ACC_DRV_MINOR_VER 0 @@ -59,6 +61,22 @@ #define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC #define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE +#define QM_MIG_REGION_OFFSET 0x180000 +#define QM_MIG_REGION_SIZE 0x2000 + +/** + * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the latter 32KB of the VF's BAR2. + * The Guest is only provided with the first 32KB of the VF's BAR2. + * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the PF's BAR2, and the entire 64KB + * of the VF's BAR2 is allocated to the Guest. + */ +enum hw_drv_mode { + HW_ACC_MIG_VF_CTRL = 0, + HW_ACC_MIG_PF_CTRL, +}; + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + enum hw_drv_mode drv_mode; /* * vf_qm_state represents the QM_VF_STATE register value. 
* It is set by Guest driver for the ACC VF dev indicating diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ec47e736a8e5..9c5970411d07a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a9c54ff8498ab..69dd32632ee6f 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -489,35 +489,25 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static long -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned long arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_core_ioctl(core_vdev, - VFIO_DEVICE_GET_REGION_INFO, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -536,49 +526,28 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. - * Given that the memory as a BAR and may not be + * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user((void __user *)arg, &info, minsz) ? 
- -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); case VFIO_DEVICE_IOEVENTFD: return -ENOTTY; case VFIO_DEVICE_RESET: @@ -1002,6 +971,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -1022,6 +992,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f3ccb0008f675..be103c74e9695 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index a19b68043eb2e..8fbdf7c6d666e 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2f..96d1f096eafa6 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -60,6 +60,12 @@ static bool disable_denylist; module_param(disable_denylist, bool, 0444); MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) +static bool disable_cxl; +module_param(disable_cxl, bool, 0444); +MODULE_PARM_DESC(disable_cxl, "Disable CXL Type-2 extensions for all devices bound to vfio-pci. Variant drivers may instead set vdev->disable_cxl in their probe for per-device control without needing this parameter."); +#endif + static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) { switch (pdev->vendor) { @@ -120,6 +126,29 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) } } + if (vdev->cxl) { + /* + * pci_config_map and vconfig are valid now (allocated by + * vfio_config_init() inside vfio_pci_core_enable() above). 
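For region capabilities such as the sparse-mmap one built above, userspace consumes the chain by walking cap headers; a sketch, assuming the vfio_region_info buffer was already fetched with a large enough argsz::

    struct vfio_info_cap_header *hdr;
    __u32 off = info->cap_offset;	/* 0 when no chain is present */

    while (off) {
    	hdr = (struct vfio_info_cap_header *)((char *)info + off);
    	if (hdr->id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) {
    		struct vfio_region_info_cap_sparse_mmap *sparse =
    			(struct vfio_region_info_cap_sparse_mmap *)hdr;
    		/* sparse->nr_areas entries of {offset, size} follow */
    		break;
    	}
    	off = hdr->next;
    }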
+ */ + vfio_cxl_setup_dvsec_perms(vdev); + + ret = vfio_cxl_register_cxl_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to setup CXL region\n"); + vfio_pci_core_disable(vdev); + return ret; + } + + ret = vfio_cxl_register_comp_regs_region(vdev); + if (ret) { + pci_warn(pdev, "Failed to register COMP_REGS region\n"); + vfio_cxl_unregister_cxl_region(vdev); + vfio_pci_core_disable(vdev); + return ret; + } + } + vfio_pci_core_finish_enable(vdev); return 0; @@ -132,6 +161,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -161,6 +191,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + vdev->disable_cxl = disable_cxl; +#endif ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 4abd4f2719958..548e9fbee5274 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -270,9 +270,9 @@ static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, } /* Raw access skips any kind of virtualization */ -static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 val) +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) { int ret; @@ -283,9 +283,9 @@ static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, return count; } -static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, - int count, struct perm_bits *perm, - int offset, __le32 *val) +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) { int ret; @@ -901,7 +901,9 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); up_write(&vdev->memory_lock); } } @@ -983,7 +985,9 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_cxl_prepare_reset(vdev); pci_try_reset_function(vdev->pdev); + vfio_cxl_finish_reset(vdev); up_write(&vdev->memory_lock); } } @@ -1071,6 +1075,49 @@ static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) return 0; } +/* + * vfio_pci_dvsec_dispatch_read - per-device DVSEC read dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn at module init. + * Calls vdev->dvsec_readfn when a shadow-read handler has been registered + * (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices), otherwise + * continue to vfio_raw_config_read for hardware pass-through. + * + * This indirection allows per-device DVSEC reads from vconfig shadow + * without touching the global ecap_perms[] table. 
+ */ +static int vfio_pci_dvsec_dispatch_read(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 *val) +{ + if (vdev->dvsec_readfn) + return vdev->dvsec_readfn(vdev, pos, count, perm, offset, val); + return vfio_raw_config_read(vdev, pos, count, perm, offset, val); +} + +/* + * vfio_pci_dvsec_dispatch_write - per-device DVSEC write dispatcher. + * + * Installed as ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn at module init. + * Calls vdev->dvsec_writefn when a handler has been registered for this + * device (e.g. by vfio_cxl_setup_dvsec_perms() for CXL Type-2 devices), + * otherwise proceed to vfio_raw_config_write so that non-CXL devices + * with a DVSEC capability continue to pass writes to hardware. + * + * This indirection allows per-device DVSEC handlers to be registered + * without touching the global ecap_perms[] table. + */ +static int vfio_pci_dvsec_dispatch_write(struct vfio_pci_core_device *vdev, + int pos, int count, + struct perm_bits *perm, + int offset, __le32 val) +{ + if (vdev->dvsec_writefn) + return vdev->dvsec_writefn(vdev, pos, count, perm, offset, val); + return vfio_raw_config_write(vdev, pos, count, perm, offset, val); +} + /* * Initialize the shared permission tables */ @@ -1107,7 +1154,8 @@ int __init vfio_pci_init_perm_bits(void) ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; - ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_raw_config_write; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].readfn = vfio_pci_dvsec_dispatch_read; + ecap_perms[PCI_EXT_CAP_ID_DVSEC].writefn = vfio_pci_dvsec_dispatch_write; if (ret) vfio_pci_uninit_perm_bits(); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9339920793a7c..9e8febe33d2cc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -592,7 +592,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) struct pci_dev *pdev = vdev->pdev; struct vfio_pci_dummy_resource *dummy_res, *tmp; struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; - int i, bar; + int i, bar, bars; /* For needs_reset */ lockdep_assert_held(&vdev->vdev.dev_set->lock); @@ -651,8 +651,10 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) bar = i + PCI_STD_RESOURCES; if (!vdev->barmap[bar]) continue; + bars = (vdev->cxl && i == vfio_cxl_get_component_reg_bar(vdev)) ? + 0 : (1 << bar); pci_iounmap(pdev, vdev->barmap[bar]); - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); vdev->barmap[bar] = NULL; } @@ -988,6 +990,13 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; + if (vdev->cxl) { + ret = vfio_cxl_get_info(vdev, &caps); + if (ret) + return ret; + info.flags |= VFIO_DEVICE_FLAGS_CXL; + } + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; @@ -1024,42 +1033,42 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? 
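A variant driver hooks into this dispatch by setting the per-device function pointers; a hypothetical shadow-read handler, sketched under the assumption that reads can be served from the vconfig copy (the my_cxl_* names are illustrative, not the in-tree vfio_cxl code)::

    static int my_cxl_dvsec_read(struct vfio_pci_core_device *vdev, int pos,
    			     int count, struct perm_bits *perm,
    			     int offset, __le32 *val)
    {
    	/* pos is the absolute config-space offset; serve the read
    	 * from the vconfig shadow populated by vfio_config_init() */
    	memcpy(val, vdev->vconfig + pos, count);
    	return count;
    }

    /* installed once per device from the variant driver's init path: */
    static void my_cxl_install_dvsec_handlers(struct vfio_pci_core_device *vdev)
    {
    	vdev->dvsec_readfn = my_cxl_dvsec_read;
    }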
-EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; + if (vdev->cxl) { + ret = vfio_cxl_get_region_info(vdev, info, caps); + if (ret != -ENOTTY) + return ret; + } - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1071,9 +1080,9 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1083,16 +1092,17 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. 
*/ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1101,10 +1111,10 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1113,53 +1123,36 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1244,6 +1237,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_zap_and_down_write_memory_lock(vdev); + /* Zap CXL DPA region PTEs before hardware reset clears HDM state */ + vfio_cxl_prepare_reset(vdev); + /* * This function can be invoked while the power state is non-D0. If * pci_try_reset_function() has been called while the power state is @@ -1256,6 +1252,13 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_try_reset_function(vdev->pdev); + + /* + * finish_reset checks the COMMITTED bit from hardware + * and only brings the region back if it is actually set. 
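The COMMITTED re-check described in this comment could look roughly like the following sketch (hdm_base and the region_active flag are illustrative stand-ins for the private vfio_pci_cxl_state layout)::

    static void cxl_finish_reset_sketch(void __iomem *hdm_base,
    				    bool *region_active)
    {
    	u32 ctrl = readl(hdm_base + CXL_HDM_DECODER0_CTRL_OFFSET(0));

    	/* re-arm the DPA region only once hardware reports COMMITTED */
    	*region_active = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED);
    }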
+ */ + vfio_cxl_finish_reset(vdev); + up_write(&vdev->memory_lock); return ret; @@ -1485,8 +1488,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: @@ -1773,6 +1774,18 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma if (req_start + req_len > phys_len) return -EINVAL; + /* + * CXL devices: mmap is permitted for the GPU/accelerator register + * windows listed in the sparse-mmap capability. Block any request + * that overlaps the CXL component register block + * [comp_reg_offset, comp_reg_offset + comp_reg_size); those registers + * must be accessed exclusively through the COMP_REGS device region so + * that the emulation layer (notify_change) intercepts every write. + */ + if (vdev->cxl && index == vfio_cxl_get_component_reg_bar(vdev) && + vfio_cxl_mmap_overlaps_comp_regs(vdev, req_start, req_len)) + return -EINVAL; + /* * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. @@ -2192,6 +2205,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (ret) goto out_vf; + vfio_pci_cxl_detect_and_init(vdev); + vfio_pci_probe_power_state(vdev); /* @@ -2235,6 +2250,8 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vf_uninit(vdev); vfio_pci_vga_uninit(vdev); + vfio_pci_cxl_cleanup(vdev); + if (!disable_idle_d3) pm_runtime_get_noresume(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index c3eb839a3c705..625d42c4ea336 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -37,6 +37,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); + +int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); @@ -110,4 +118,66 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#if IS_ENABLED(CONFIG_VFIO_CXL_CORE) + +void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev); +void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev); +bool vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev); +void vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev); +void vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev); +void vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev); +void vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev); +int vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +int vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + 
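The component-register overlap test used in vfio_pci_core_mmap() above reduces to a standard half-open interval intersection; a plausible shape of vfio_cxl_mmap_overlaps_comp_regs(), sketched here (the in-tree body may differ)::

    static bool overlaps(u64 req_start, u64 req_len,
    		     u64 comp_off, u64 comp_size)
    {
    	/* [req_start, req_start + req_len) vs
    	 * [comp_off, comp_off + comp_size) */
    	return req_start < comp_off + comp_size &&
    	       comp_off < req_start + req_len;
    }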
struct vfio_region_info *info, + struct vfio_info_cap *caps); +u8 vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev); +bool vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len); + +#else + +static inline void +vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { } +static inline bool +vfio_cxl_reset_capable(struct vfio_pci_core_device *vdev) +{ return false; } +static inline void +vfio_cxl_prepare_reset(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_finish_reset(struct vfio_pci_core_device *vdev) { } +static inline void +vfio_cxl_setup_dvsec_perms(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_cxl_region(struct vfio_pci_core_device *vdev) +{ return 0; } +static inline void +vfio_cxl_unregister_cxl_region(struct vfio_pci_core_device *vdev) { } +static inline int +vfio_cxl_register_comp_regs_region(struct vfio_pci_core_device *vdev) +{ return 0; } +static inline int +vfio_cxl_get_info(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline int +vfio_cxl_get_region_info(struct vfio_pci_core_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ return -ENOTTY; } +static inline u8 +vfio_cxl_get_component_reg_bar(struct vfio_pci_core_device *vdev) +{ return U8_MAX; } +static inline bool +vfio_cxl_mmap_overlaps_comp_regs(struct vfio_pci_core_device *vdev, + u64 req_start, u64 req_len) +{ return false; } + +#endif /* CONFIG_VFIO_CXL_CORE */ + #endif diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 25380b7dfe18a..d816d06ca8c18 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -201,19 +201,29 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { struct pci_dev *pdev = vdev->pdev; - int ret; + int ret, bars; void __iomem *io; if (vdev->barmap[bar]) return 0; - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + /* + * The CXL component register BAR cannot be claimed exclusively: the + * CXL subsystem holds persistent sub-range iomem claims during HDM + * decoder setup. pci_request_selected_regions() for the full BAR + * fails with EBUSY. Pass bars=0 to make the request a no-op and map + * directly via pci_iomap(). + */ + bars = (vdev->cxl && bar == vfio_cxl_get_component_reg_bar(vdev)) ? 
+ 0 : (1 << bar); + + ret = pci_request_selected_regions(pdev, bars, "vfio"); if (ret) return ret; io = pci_iomap(pdev, bar, 0); if (!io) { - pci_release_selected_regions(pdev, 1 << bar); + pci_release_selected_regions(pdev, bars); return -ENOMEM; } diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index c7d7e27af386e..cb3d5e57d3a3e 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -109,10 +109,9 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 832af5ba267c4..1ed349a556291 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,41 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; - struct vfio_region_info info = {}; - if (copy_from_user(&info, uarg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(uarg, &info, minsz) ? 
-EFAULT : 0; - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 8084f3e36a9f7..d2e5cbca13c85 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -108,7 +109,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, - .ioctl = virtiovf_vfio_pci_core_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -130,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index ff8ff8480968c..a28be7d11ce03 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -113,6 +113,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 512533501eb7f..a4d3ace3e02dd 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3bf1043cd7957..c2990b7e900fa 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -272,6 +272,24 @@ int vfio_platform_open_device(struct vfio_device *core_vdev) } EXPORT_SYMBOL_GPL(vfio_platform_open_device); +int 
vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + + if (info->index >= vdev->num_regions) + return -EINVAL; + + /* map offset to the physical address */ + info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); + long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -300,28 +318,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 8d8fab5168490..05084212a76eb 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -85,6 +85,9 @@ int vfio_platform_open_device(struct vfio_device *core_vdev); void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 715368076a1fe..f5c8939905252 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1258,6 +1258,51 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)){ + ret = -EFAULT; + goto out_free; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file 
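With copy_from_user(), argsz validation, and capability-chain marshalling now centralized in vfio_get_region_info(), the userspace contract is unchanged; the usual two-pass query still works (sketch; assumes <linux/vfio.h>, <sys/ioctl.h>, err.h and an open device_fd)::

    struct vfio_region_info hdr = { .argsz = sizeof(hdr), .index = index };

    if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &hdr))
    	err(1, "VFIO_DEVICE_GET_REGION_INFO");

    if (hdr.argsz > sizeof(hdr)) {	/* capability chain did not fit */
    	struct vfio_region_info *info = calloc(1, hdr.argsz);

    	info->argsz = hdr.argsz;
    	info->index = index;
    	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
    		err(1, "VFIO_DEVICE_GET_REGION_INFO");
    	/* chain starts at (char *)info + info->cap_offset when
    	 * VFIO_REGION_INFO_FLAG_CAPS is set */
    }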
*filep, unsigned int cmd, unsigned long arg) { @@ -1295,6 +1340,10 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + ret = vfio_get_region_info(device, uptr); + break; + default: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h index 7d0b09ff57681..687329b18ae62 100644 --- a/include/cxl/cxl.h +++ b/include/cxl/cxl.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * enum cxl_devtype - delineate type-2 from a generic type-3 device @@ -76,48 +77,6 @@ struct cxl_regs { #define CXL_CM_CAP_CAP_ID_HDM 0x5 #define CXL_CM_CAP_CAP_HDM_VERSION 1 -/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ -#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K - -/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers */ -#define CXL_CM_OFFSET 0x1000 -#define CXL_CM_CAP_HDR_OFFSET 0x0 -#define CXL_CM_CAP_HDR_ID_MASK GENMASK(15, 0) -#define CM_CAP_HDR_CAP_ID 1 -#define CXL_CM_CAP_HDR_VERSION_MASK GENMASK(19, 16) -#define CM_CAP_HDR_CAP_VERSION 1 -#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK GENMASK(23, 20) -#define CM_CAP_HDR_CACHE_MEM_VERSION 1 -#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24) -#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20) - -/* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */ -#define CXL_HDM_DECODER_CAP_OFFSET 0x0 -#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) -#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) -#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11) -#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12) -#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 -#define CXL_HDM_DECODER_ENABLE BIT(1) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) -#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) -#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) -#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) -#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) -#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) -#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) -#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12) -#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) -#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) -#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) -#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) - /* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */ #define CXL_DECODER_MIN_GRANULARITY 256 #define CXL_DECODER_MAX_ENCODED_IG 6 @@ -134,6 +93,7 @@ struct cxl_reg_map { int id; unsigned long offset; unsigned long size; + u8 count; }; struct cxl_component_reg_map { @@ -158,9 +118,16 @@ struct cxl_pmu_reg_map { * @resource: physical resource base of the register block * @max_size: maximum mapping size to perform register search * @reg_type: see enum cxl_regloc_type + * @bar_index: PCI BAR index (0-5) when regblock is BAR-backed; 0xFF otherwise + * @bar_offset: offset within the BAR; only valid when bar_index <= 5 * @component_map: cxl_reg_map for component registers * @device_map: cxl_reg_maps for device registers * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units + * + * When the register block 
is described by the Register Locator DVSEC with + * a BAR Indicator (BIR 0-5), bar_index and bar_offset are set so callers can + * use pci_iomap(pdev, bar_index, size) and base + bar_offset instead of + * ioremap(resource). */ struct cxl_register_map { struct device *host; @@ -168,6 +135,8 @@ struct cxl_register_map { resource_size_t resource; resource_size_t max_size; u8 reg_type; + u8 bar_index; + resource_size_t bar_offset; union { struct cxl_component_reg_map component_map; struct cxl_device_reg_map device_map; @@ -338,4 +307,19 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); struct cxl_region *cxl_create_region(struct cxl_root_decoder *cxlrd, struct cxl_endpoint_decoder **cxled, int ways); + +#ifdef CONFIG_CXL_BUS + +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size); + +#else + +static inline +int cxl_get_hdm_info(struct cxl_dev_state *cxlds, u8 *count, + resource_size_t *offset, resource_size_t *size) +{ return -EOPNOTSUPP; } + +#endif /* CONFIG_CXL_BUS */ + #endif /* __CXL_CXL_H__ */ diff --git a/include/cxl/pci.h b/include/cxl/pci.h index edbf980c283f1..dd1136be250d0 100644 --- a/include/cxl/pci.h +++ b/include/cxl/pci.h @@ -16,10 +16,19 @@ enum cxl_regloc_type { struct pci_dev; struct cxl_register_map; +struct cxl_component_reg_map; +struct cxl_dev_state; int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); +void cxl_probe_component_regs(struct device *dev, void __iomem *base, + struct cxl_component_reg_map *map); +int cxl_await_range_active(struct cxl_dev_state *cxlds); +int cxl_regblock_get_bar_info(const struct cxl_register_map *map, u8 *bar_index, + resource_size_t *bar_offset); +int cxl_dev_reset(struct pci_dev *pdev, int dvsec, bool mem_clr_en); +bool pci_cxl_reset_capable(struct pci_dev *pdev); int cxl_setup_regs(struct cxl_register_map *map); #endif diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 0c4c84b8c3be9..182fd7e70f92a 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -99,6 +99,9 @@ #define QM_DEV_ALG_MAX_LEN 256 +#define QM_MIG_REGION_SEL 0x100198 +#define QM_MIG_REGION_EN BIT(0) + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee5..8e1ddb48b9b54 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -21,6 +21,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; struct iommufd_access; +struct vfio_info_cap; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -132,6 +133,9 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); + int (*get_region_info_caps)(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 46ab6075ab75a..39b2b354143af 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -27,6 +27,8 @@ struct vfio_pci_core_device; struct vfio_pci_region; +struct vfio_pci_cxl_state; +struct 
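A consumer of the new cxl_get_hdm_info() helper might look like the sketch below; the meaning of offset/size as the HDM decoder block's location within the component registers is assumed from context, and cxlds acquisition plus error handling are elided::

    static int probe_hdm_layout(struct cxl_dev_state *cxlds)
    {
    	resource_size_t offset, size;
    	u8 count;
    	int rc;

    	/* -EOPNOTSUPP comes from the static-inline stub when
    	 * CONFIG_CXL_BUS is disabled */
    	rc = cxl_get_hdm_info(cxlds, &count, &offset, &size);
    	if (rc)
    		return rc;

    	/* 'count' decoders at 'offset', spanning 'size' */
    	return 0;
    }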
perm_bits; struct vfio_pci_eventfd { struct eventfd_ctx *ctx; @@ -86,6 +88,7 @@ struct vfio_pci_core_device { bool needs_pm_restore:1; bool pm_intx_masked:1; bool pm_runtime_engaged:1; + bool disable_cxl:1; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; @@ -96,6 +99,13 @@ struct vfio_pci_core_device { struct mutex ioeventfds_lock; struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; + struct vfio_pci_cxl_state *cxl; + int (*dvsec_readfn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val); + int (*dvsec_writefn)(struct vfio_pci_core_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val); struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; @@ -128,6 +138,9 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, diff --git a/include/uapi/cxl/cxl_regs.h b/include/uapi/cxl/cxl_regs.h new file mode 100644 index 0000000000000..e9746e75e09ae --- /dev/null +++ b/include/uapi/cxl/cxl_regs.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * CXL Standard defines + * + * Hardware register offsets and bit-field masks for the CXL Component + * Register block, as defined by the CXL Specification r4.0. 
+ */ + +#ifndef _UAPI_CXL_REGS_H_ +#define _UAPI_CXL_REGS_H_ + +#include <linux/const.h> /* _BITUL(), _BITULL() */ +#include <linux/bits.h> /* __GENMASK() */ + +/* CXL 4.0 8.2.3 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE 0x00010000 + +/* CXL 4.0 8.2.4 CXL.cache and CXL.mem Registers */ +#define CXL_CM_OFFSET 0x1000 +#define CXL_CM_CAP_HDR_OFFSET 0x0 +#define CXL_CM_CAP_HDR_ID_MASK __GENMASK(15, 0) +#define CM_CAP_HDR_CAP_ID 1 +#define CXL_CM_CAP_HDR_VERSION_MASK __GENMASK(19, 16) +#define CM_CAP_HDR_CAP_VERSION 1 +#define CXL_CM_CAP_HDR_CACHE_MEM_VERSION_MASK __GENMASK(23, 20) +#define CM_CAP_HDR_CACHE_MEM_VERSION 1 +#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK __GENMASK(31, 24) +#define CXL_CM_CAP_PTR_MASK __GENMASK(31, 20) + +/* HDM decoders CXL 4.0 8.2.4.20 CXL HDM Decoder Capability Structure */ +#define CXL_HDM_DECODER_CAP_OFFSET 0x0 +#define CXL_HDM_DECODER_COUNT_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER_TARGET_COUNT_MASK __GENMASK(7, 4) +#define CXL_HDM_DECODER_INTERLEAVE_11_8 _BITUL(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 _BITUL(9) +#define CXL_HDM_DECODER_POISON_ON_DECODE_ERR _BITUL(10) +#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY _BITUL(11) +#define CXL_HDM_DECODER_INTERLEAVE_16_WAY _BITUL(12) +#define CXL_HDM_DECODER_UIO_CAPABLE _BITUL(13) +#define CXL_HDM_DECODER_UIO_COUNT_MASK __GENMASK(19, 16) +#define CXL_HDM_DECODER_MEMDATA_NXM _BITUL(20) +#define CXL_HDM_DECODER_COHERENCY_MODELS_MASK __GENMASK(22, 21) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE _BITUL(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK __GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK __GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK _BITUL(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT _BITUL(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED _BITUL(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR _BITUL(11) +#define CXL_HDM_DECODER0_CTRL_HOSTONLY _BITUL(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) + +/* + * CXL r4.0 8.1.3: DVSEC for CXL Devices + * + * Register offsets are relative to the DVSEC capability base address, + * as discovered via PCI_EXT_CAP_ID_DVSEC with DVSEC ID 0x0. + * All registers in this section are 16-bit wide.
+ */ + +/* DVSEC register offsets */ +#define CXL_DVSEC_CAPABILITY_OFFSET 0x0a +#define CXL_DVSEC_CONTROL_OFFSET 0x0c +#define CXL_DVSEC_STATUS_OFFSET 0x0e +#define CXL_DVSEC_CONTROL2_OFFSET 0x10 +#define CXL_DVSEC_STATUS2_OFFSET 0x12 +#define CXL_DVSEC_LOCK_OFFSET 0x14 +#define CXL_DVSEC_CAPABILITY2_OFFSET 0x16 +#define CXL_DVSEC_RANGE1_SIZE_HIGH_OFFSET 0x18 +#define CXL_DVSEC_RANGE1_SIZE_LOW_OFFSET 0x1c +#define CXL_DVSEC_RANGE1_BASE_HIGH_OFFSET 0x20 +#define CXL_DVSEC_RANGE1_BASE_LOW_OFFSET 0x24 +#define CXL_DVSEC_RANGE2_SIZE_HIGH_OFFSET 0x28 +#define CXL_DVSEC_RANGE2_SIZE_LOW_OFFSET 0x2c +#define CXL_DVSEC_RANGE2_BASE_HIGH_OFFSET 0x30 +#define CXL_DVSEC_RANGE2_BASE_LOW_OFFSET 0x34 +#define CXL_DVSEC_CAPABILITY3_OFFSET 0x38 + +/* DVSEC Range Base Low registers: bits [27:0] are reserved */ +#define CXL_DVSEC_RANGE_BASE_LOW_RSVD_MASK __GENMASK(27, 0) + +/* CXL r4.0 8.1.3.1 Table 8-5 DVSEC CXL Capability (offset 0x0A) */ +#define CXL_DVSEC_CAP_CACHE_CAPABLE _BITUL(0) +#define CXL_DVSEC_CAP_IO_CAPABLE _BITUL(1) +#define CXL_DVSEC_CAP_MEM_CAPABLE _BITUL(2) +#define CXL_DVSEC_CAP_MEM_HW_INIT_MODE _BITUL(3) +#define CXL_DVSEC_CAP_HDM_COUNT_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP_CACHE_WBI_CAPABLE _BITUL(6) +#define CXL_DVSEC_CAP_CXL_RESET_CAPABLE _BITUL(7) +#define CXL_DVSEC_CAP_CXL_RESET_TIMEOUT_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CAP_CXL_RESET_MEM_CLR_CAPABLE _BITUL(11) +#define CXL_DVSEC_CAP_TSP_CAPABLE _BITUL(12) +#define CXL_DVSEC_CAP_MLD_CAPABLE _BITUL(13) +#define CXL_DVSEC_CAP_VIRAL_CAPABLE _BITUL(14) +#define CXL_DVSEC_CAP_PM_INIT_REPORTING_CAPABLE _BITUL(15) + +/* CXL r4.0 8.1.3.2 Table 8-6 DVSEC CXL Control (offset 0x0C) */ +#define CXL_DVSEC_CTRL_CACHE_ENABLE _BITUL(0) +#define CXL_DVSEC_CTRL_IO_ENABLE _BITUL(1) +#define CXL_DVSEC_CTRL_MEM_ENABLE _BITUL(2) +#define CXL_DVSEC_CTRL_CACHE_SF_COVERAGE_MASK __GENMASK(7, 3) +#define CXL_DVSEC_CTRL_CACHE_SF_GRANULARITY_MASK __GENMASK(10, 8) +#define CXL_DVSEC_CTRL_CACHE_CLEAN_EVICTION _BITUL(11) +#define CXL_DVSEC_CTRL_P2P_MEM_ENABLE _BITUL(12) +/* bit 13: RsvdP */ +#define CXL_DVSEC_CTRL_VIRAL_ENABLE _BITUL(14) +/* bit 15: RsvdP */ + +/* CXL r4.0 8.1.3.3 Table 8-7 DVSEC CXL Status (offset 0x0E) */ +/* bits 13:0 = RsvdZ */ +#define CXL_DVSEC_STATUS_VIRAL_STATUS _BITUL(14) +/* bit 15 = RsvdZ */ + +/* CXL r4.0 8.1.3.4 Table 8-8 DVSEC CXL Control2 (offset 0x10) */ +#define CXL_DVSEC_CTRL2_DISABLE_CACHING _BITUL(0) +#define CXL_DVSEC_CTRL2_INITIATE_CACHE_WBI _BITUL(1) +#define CXL_DVSEC_CTRL2_INITIATE_CXL_RESET _BITUL(2) +#define CXL_DVSEC_CTRL2_CXL_RESET_MEM_CLR_ENABLE _BITUL(3) +#define CXL_DVSEC_CTRL2_DESIRED_VOLATILE_HDM _BITUL(4) +#define CXL_DVSEC_CTRL2_MOD_COMPLETION_ENABLE _BITUL(5) +/* bits 15:6 = RsvdP */ + +/* CXL r4.0 8.1.3.5 Table 8-9 DVSEC CXL Status2 (offset 0x12) */ +#define CXL_DVSEC_STATUS2_CACHE_INVALID _BITUL(0) +#define CXL_DVSEC_STATUS2_CXL_RESET_COMPLETE _BITUL(1) +#define CXL_DVSEC_STATUS2_CXL_RESET_ERROR _BITUL(2) +/* RW1CS; RsvdZ if Cap3.Volatile_HDM_Configurability=0 */ +#define CXL_DVSEC_STATUS2_VOLATILE_HDM_PRES_ERROR _BITUL(3) +/* bits 14:4 = RsvdZ */ +#define CXL_DVSEC_STATUS2_PM_INIT_COMPLETION _BITUL(15) + +/* CXL r4.0 8.1.3.6 Table 8-10 DVSEC CXL Lock (offset 0x14) */ +#define CXL_DVSEC_LOCK_CONFIG_LOCK _BITUL(0) +/* bits 15:1 = RsvdP */ + +/* CXL r4.0 8.1.3.7 Table 8-11 DVSEC CXL Capability2 (offset 0x16) */ +#define CXL_DVSEC_CAP2_CACHE_SIZE_UNIT_MASK __GENMASK(3, 0) +#define CXL_DVSEC_CAP2_FALLBACK_CAPABILITY_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CAP2_MOD_COMPLETION_CAPABLE _BITUL(6) +#define
CXL_DVSEC_CAP2_NO_CLEAN_WRITEBACK _BITUL(7) +#define CXL_DVSEC_CAP2_CACHE_SIZE_MASK __GENMASK(15, 8) + +/* CXL r4.0 8.1.3.14 Table 8-20 DVSEC CXL Capability3 (offset 0x38) */ +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_COLD_RESET _BITUL(0) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_WARM_RESET _BITUL(1) +#define CXL_DVSEC_CAP3_DEFAULT_VOLATILE_HDM_HOT_RESET _BITUL(2) +#define CXL_DVSEC_CAP3_VOLATILE_HDM_CONFIGURABILITY _BITUL(3) +#define CXL_DVSEC_CAP3_P2P_MEM_CAPABLE _BITUL(4) +/* bits 15:5 = RsvdP */ + +#endif /* _UAPI_CXL_REGS_H_ */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009baf..8394efb153e75 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -214,6 +214,16 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ #define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ +/* + * Vendor-specific CXL device with CXL.mem capability (HDM-D or HDM-DB + * decoder, PCI class code != PCI_CLASS_MEMORY_CXL). Covers CXL Type-2 + * accelerators and non-class-code Type-3 variants. When set, + * VFIO_DEVICE_FLAGS_PCI is also set (same device is a PCI device). The + * capability chain (VFIO_DEVICE_FLAGS_CAPS) contains VFIO_DEVICE_INFO_CAP_CXL + * describing HDM decoders, region indices, decoder layout, and CXL-specific + * options. + */ +#define VFIO_DEVICE_FLAGS_CXL (1 << 9) /* Device supports CXL */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ @@ -256,6 +266,70 @@ struct vfio_device_info_cap_pci_atomic_comp { __u32 reserved; }; +/* + * VFIO_DEVICE_INFO_CAP_CXL - CXL Type-2 device capability + * + * Present in the device info capability chain when VFIO_DEVICE_FLAGS_CXL + * is set. Describes Host Managed Device Memory (HDM) layout and CXL + * memory options so that userspace (e.g. QEMU) can expose the CXL region + * and component registers correctly to the guest. + * + * The HDM decoder count and HDM decoder block offset within the COMP_REGS + * region are derivable from the COMP_REGS region itself. + * + * To find the HDM decoder block offset (hdm_decoder_offset), traverse the CXL + * Capability Array starting at COMP_REGS region offset 0: + * - Dword 0 bits[31:24] (CXL_CM_CAP_HDR_ARRAY_SIZE_MASK): number of + * capability entries. + * - Each subsequent dword at offset (cap * 4): bits[15:0] = cap ID + * (CXL_CM_CAP_HDR_ID_MASK), bits[31:20] = byte offset from COMP_REGS + * start to that capability's register block (CXL_CM_CAP_PTR_MASK). + * - Locate the entry with cap ID == CXL_CM_CAP_CAP_ID_HDM (0x5); the + * extracted bits[31:20] value is directly the byte offset + * hdm_decoder_offset (no further scaling required). + * + * To find the HDM decoder count, pread the HDM Decoder Capability register + * at hdm_decoder_offset + CXL_HDM_DECODER_CAP_OFFSET within the + * COMP_REGS region; bits[3:0] (CXL_HDM_DECODER_COUNT_MASK) encode the count + * using the formula: count = (field == 0) ? 1 : field * 2. + */ +#define VFIO_DEVICE_INFO_CAP_CXL 6 +struct vfio_device_info_cap_cxl { + struct vfio_info_cap_header header; + __u8 hdm_regs_bar_index; /* PCI BAR containing HDM registers */ + __u8 reserved[3]; + __u32 flags; +/* Decoder was committed by host firmware/BIOS */ +#define VFIO_CXL_CAP_FIRMWARE_COMMITTED (1 << 0) +/* + * Device implements an HDM-DB decoder (CXL.cache + CXL.mem). 
Reflects + * the Cache_Capable bit (bit 0) in the CXL DVSEC Capability register. + * + * When clear: HDM-D decoder (CXL.mem only, no CXL.cache). FLR does not + * require a Write-Back Invalidation (WBI) sequence; the device holds no + * coherent copies of host memory. + * + * When set: HDM-DB decoder (CXL 3.0+). The kernel driver does not + * perform Write-Back Invalidation (WBI) automatically. The VMM must + * issue a WBI sequence before asserting FLR to flush dirty device cache + * lines and prevent coherency violations, and should advertise + * Back-Invalidation support in the virtual CXL topology. + */ +#define VFIO_CXL_CAP_CACHE_CAPABLE (1 << 1) + /* + * Byte offset within the BAR to the CXL.mem register area start + * (= comp_reg_offset + CXL_CM_OFFSET). This is where the CXL + * Capability Array Header lives. + */ + __u64 hdm_regs_offset; + /* + * Region indices for the two CXL VFIO device regions. + * Avoids forcing userspace to scan all regions by type/subtype. + */ + __u32 dpa_region_index; /* VFIO_REGION_SUBTYPE_CXL */ + __u32 comp_regs_region_index; /* VFIO_REGION_SUBTYPE_CXL_COMP_REGS */ +}; + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) @@ -369,6 +443,18 @@ struct vfio_region_info_cap_type { */ #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) +/* 1e98 vendor PCI sub-types (CXL Consortium) */ +/* + * CXL memory region. Use with region type + * (PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE). + * DPA memory region (fault+zap mmap) + */ +#define VFIO_REGION_SUBTYPE_CXL (1) +/* + * HDM decoder register emulation region (read/write only, no mmap). + */ +#define VFIO_REGION_SUBTYPE_CXL_COMP_REGS (2) + /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 18623ba666e33..64ea19253ee3a 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -143,11 +143,6 @@ static struct mdev_parent mbochs_parent; static atomic_t mbochs_avail_mbytes; static const struct vfio_device_ops mbochs_dev_ops; -struct vfio_region_info_ext { - struct vfio_region_info base; - struct vfio_region_info_cap_type type; -}; - struct mbochs_mode { u32 drm_format; u32 bytepp; @@ -1033,10 +1028,12 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) return 0; } -static int mbochs_get_region_info(struct mdev_state *mdev_state, - struct vfio_region_info_ext *ext) +static int mbochs_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *region_info, + struct vfio_info_cap *caps) { - struct vfio_region_info *region_info = &ext->base; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); if (region_info->index >= MBOCHS_NUM_REGIONS) return -EINVAL; @@ -1061,20 +1058,23 @@ static int mbochs_get_region_info(struct mdev_state *mdev_state, region_info->flags = (VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE); break; - case MBOCHS_EDID_REGION_INDEX: - ext->base.argsz = sizeof(*ext); - ext->base.offset = MBOCHS_EDID_OFFSET; - ext->base.size = MBOCHS_EDID_SIZE; - ext->base.flags = (VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_CAPS); - ext->base.cap_offset = offsetof(typeof(*ext), type); - ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE; - ext->type.header.version = 1; - ext->type.header.next = 0; - ext->type.type = VFIO_REGION_TYPE_GFX; - ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID; - break; + case MBOCHS_EDID_REGION_INDEX: { + struct 
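The derivation spelled out in the comment above, as a userspace sketch; comp_regs_off is assumed to be the COMP_REGS region offset obtained from VFIO_DEVICE_GET_REGION_INFO, and error handling is elided::

    __u32 hdr, cap, dcap, hdm_off = 0;
    int nr_caps, field, decoders;

    pread(device_fd, &hdr, sizeof(hdr), comp_regs_off);
    nr_caps = (hdr & CXL_CM_CAP_HDR_ARRAY_SIZE_MASK) >> 24;

    for (int i = 1; i <= nr_caps; i++) {
    	pread(device_fd, &cap, sizeof(cap), comp_regs_off + i * 4);
    	if ((cap & CXL_CM_CAP_HDR_ID_MASK) == 0x5) {	/* HDM decoder */
    		hdm_off = (cap & CXL_CM_CAP_PTR_MASK) >> 20;
    		break;
    	}
    }

    pread(device_fd, &dcap, sizeof(dcap),
          comp_regs_off + hdm_off + CXL_HDM_DECODER_CAP_OFFSET);
    field = dcap & CXL_HDM_DECODER_COUNT_MASK;
    decoders = field ? field * 2 : 1;	/* formula from the comment above */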
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index 18623ba666e33..64ea19253ee3a 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -143,11 +143,6 @@ static struct mdev_parent mbochs_parent;
 static atomic_t mbochs_avail_mbytes;
 static const struct vfio_device_ops mbochs_dev_ops;
 
-struct vfio_region_info_ext {
-	struct vfio_region_info base;
-	struct vfio_region_info_cap_type type;
-};
-
 struct mbochs_mode {
 	u32 drm_format;
 	u32 bytepp;
@@ -1033,10 +1028,12 @@ static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
 	return 0;
 }
 
-static int mbochs_get_region_info(struct mdev_state *mdev_state,
-				  struct vfio_region_info_ext *ext)
+static int mbochs_ioctl_get_region_info(struct vfio_device *vdev,
+					struct vfio_region_info *region_info,
+					struct vfio_info_cap *caps)
 {
-	struct vfio_region_info *region_info = &ext->base;
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
 
 	if (region_info->index >= MBOCHS_NUM_REGIONS)
 		return -EINVAL;
@@ -1061,20 +1058,23 @@ static int mbochs_get_region_info(struct mdev_state *mdev_state,
 		region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
 				      VFIO_REGION_INFO_FLAG_WRITE);
 		break;
-	case MBOCHS_EDID_REGION_INDEX:
-		ext->base.argsz = sizeof(*ext);
-		ext->base.offset = MBOCHS_EDID_OFFSET;
-		ext->base.size = MBOCHS_EDID_SIZE;
-		ext->base.flags = (VFIO_REGION_INFO_FLAG_READ |
-				   VFIO_REGION_INFO_FLAG_WRITE |
-				   VFIO_REGION_INFO_FLAG_CAPS);
-		ext->base.cap_offset = offsetof(typeof(*ext), type);
-		ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE;
-		ext->type.header.version = 1;
-		ext->type.header.next = 0;
-		ext->type.type = VFIO_REGION_TYPE_GFX;
-		ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID;
-		break;
+	case MBOCHS_EDID_REGION_INDEX: {
+		struct vfio_region_info_cap_type cap_type = {
+			.header.id = VFIO_REGION_INFO_CAP_TYPE,
+			.header.version = 1,
+			.type = VFIO_REGION_TYPE_GFX,
+			.subtype = VFIO_REGION_SUBTYPE_GFX_EDID,
+		};
+
+		region_info->offset = MBOCHS_EDID_OFFSET;
+		region_info->size = MBOCHS_EDID_SIZE;
+		region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
+				      VFIO_REGION_INFO_FLAG_WRITE |
+				      VFIO_REGION_INFO_FLAG_CAPS);
+
+		return vfio_info_add_capability(caps, &cap_type.header,
+						sizeof(cap_type));
+	}
 	default:
 		region_info->size = 0;
 		region_info->offset = 0;
@@ -1191,7 +1191,7 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd,
 	struct mdev_state *mdev_state =
 		container_of(vdev, struct mdev_state, vdev);
 	int ret = 0;
-	unsigned long minsz, outsz;
+	unsigned long minsz;
 
 	switch (cmd) {
 	case VFIO_DEVICE_GET_INFO:
@@ -1215,30 +1215,6 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd,
 		return 0;
 	}
-	case VFIO_DEVICE_GET_REGION_INFO:
-	{
-		struct vfio_region_info_ext info;
-
-		minsz = offsetofend(typeof(info), base.offset);
-
-		if (copy_from_user(&info, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		outsz = info.base.argsz;
-		if (outsz < minsz)
-			return -EINVAL;
-		if (outsz > sizeof(info))
-			return -EINVAL;
-
-		ret = mbochs_get_region_info(mdev_state, &info);
-		if (ret)
-			return ret;
-
-		if (copy_to_user((void __user *)arg, &info, outsz))
-			return -EFAULT;
-
-		return 0;
-	}
 	case VFIO_DEVICE_GET_IRQ_INFO:
 	{
@@ -1376,6 +1352,7 @@ static const struct vfio_device_ops mbochs_dev_ops = {
 	.read = mbochs_read,
 	.write = mbochs_write,
 	.ioctl = mbochs_ioctl,
+	.get_region_info_caps = mbochs_ioctl_get_region_info,
 	.mmap = mbochs_mmap,
 	.bind_iommufd = vfio_iommufd_emulated_bind,
 	.unbind_iommufd = vfio_iommufd_emulated_unbind,
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index 8104831ae125b..0759bd68edca0 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -435,10 +435,13 @@ static int mdpy_mmap(struct vfio_device *vdev, struct vm_area_struct *vma)
 	return remap_vmalloc_range(vma, mdev_state->memblk, 0);
 }
 
-static int mdpy_get_region_info(struct mdev_state *mdev_state,
-				struct vfio_region_info *region_info,
-				u16 *cap_type_id, void **cap_type)
+static int mdpy_ioctl_get_region_info(struct vfio_device *vdev,
+				      struct vfio_region_info *region_info,
+				      struct vfio_info_cap *caps)
 {
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
+
 	if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
 	    region_info->index != MDPY_DISPLAY_REGION)
 		return -EINVAL;
@@ -544,30 +547,6 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd,
 		return 0;
 	}
-	case VFIO_DEVICE_GET_REGION_INFO:
-	{
-		struct vfio_region_info info;
-		u16 cap_type_id = 0;
-		void *cap_type = NULL;
-
-		minsz = offsetofend(struct vfio_region_info, offset);
-
-		if (copy_from_user(&info, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (info.argsz < minsz)
-			return -EINVAL;
-
-		ret = mdpy_get_region_info(mdev_state, &info, &cap_type_id,
-					   &cap_type);
-		if (ret)
-			return ret;
-
-		if (copy_to_user((void __user *)arg, &info, minsz))
-			return -EFAULT;
-
-		return 0;
-	}
 	case VFIO_DEVICE_GET_IRQ_INFO:
 	{
@@ -665,6 +644,7 @@ static const struct vfio_device_ops mdpy_dev_ops = {
 	.read = mdpy_read,
 	.write = mdpy_write,
 	.ioctl = mdpy_ioctl,
+	.get_region_info_caps = mdpy_ioctl_get_region_info,
 	.mmap = mdpy_mmap,
 	.bind_iommufd = vfio_iommufd_emulated_bind,
 	.unbind_iommufd = vfio_iommufd_emulated_unbind,
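The sample conversions above hand capability assembly to the core through
``vfio_info_add_capability()`` and the new ``get_region_info_caps`` op,
instead of each ioctl handler copying a fixed-size struct. Userspace is
unaffected: it still uses the standard two-call ``argsz`` pattern, where the
first ``VFIO_DEVICE_GET_REGION_INFO`` reports the size needed for the
capability chain and the second call fills it. A sketch of that pattern
(helper names ours, error handling elided)::

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Query one region, growing the buffer if the kernel appended caps. */
    static struct vfio_region_info *get_region_info(int device_fd, uint32_t index)
    {
    	struct vfio_region_info *info = calloc(1, sizeof(*info));

    	info->argsz = sizeof(*info);
    	info->index = index;
    	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info);

    	if (info->argsz > sizeof(*info)) {
    		/* Kernel reported a larger size: caps follow the struct. */
    		uint32_t argsz = info->argsz;

    		info = realloc(info, argsz);
    		memset(info, 0, argsz);
    		info->argsz = argsz;
    		info->index = index;
    		ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info);
    	}
    	return info;
    }

    /*
     * Walk the appended capability chain. Only valid when info->flags has
     * VFIO_REGION_INFO_FLAG_CAPS set and cap_offset is non-zero.
     */
    static struct vfio_info_cap_header *
    find_region_cap(struct vfio_region_info *info, uint16_t id)
    {
    	size_t off = info->cap_offset;

    	while (off) {
    		struct vfio_info_cap_header *hdr =
    			(void *)((char *)info + off);

    		if (hdr->id == id)
    			return hdr;
    		off = hdr->next;	/* 0 terminates the chain */
    	}
    	return NULL;
    }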
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index 59eefe2fed10c..3e029d0cba1ea 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -1717,10 +1717,12 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
 	return ret;
 }
 
-static int mtty_get_region_info(struct mdev_state *mdev_state,
-				struct vfio_region_info *region_info,
-				u16 *cap_type_id, void **cap_type)
+static int mtty_ioctl_get_region_info(struct vfio_device *vdev,
+				      struct vfio_region_info *region_info,
+				      struct vfio_info_cap *caps)
 {
+	struct mdev_state *mdev_state =
+		container_of(vdev, struct mdev_state, vdev);
 	unsigned int size = 0;
 	u32 bar_index;
@@ -1817,30 +1819,6 @@ static long mtty_ioctl(struct vfio_device *vdev, unsigned int cmd,
 		return 0;
 	}
-	case VFIO_DEVICE_GET_REGION_INFO:
-	{
-		struct vfio_region_info info;
-		u16 cap_type_id = 0;
-		void *cap_type = NULL;
-
-		minsz = offsetofend(struct vfio_region_info, offset);
-
-		if (copy_from_user(&info, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (info.argsz < minsz)
-			return -EINVAL;
-
-		ret = mtty_get_region_info(mdev_state, &info, &cap_type_id,
-					   &cap_type);
-		if (ret)
-			return ret;
-
-		if (copy_to_user((void __user *)arg, &info, minsz))
-			return -EFAULT;
-
-		return 0;
-	}
 	case VFIO_DEVICE_GET_IRQ_INFO:
 	{
@@ -1949,6 +1927,7 @@ static const struct vfio_device_ops mtty_dev_ops = {
 	.read = mtty_read,
 	.write = mtty_write,
 	.ioctl = mtty_ioctl,
+	.get_region_info_caps = mtty_ioctl_get_region_info,
 	.bind_iommufd = vfio_iommufd_emulated_bind,
 	.unbind_iommufd = vfio_iommufd_emulated_unbind,
 	.attach_ioas = vfio_iommufd_emulated_attach_ioas,
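Putting the pieces together for the CXL case: locate
``VFIO_DEVICE_INFO_CAP_CXL`` in the device info chain, then map the DPA region
it names. A sketch reusing ``get_region_info()`` from the previous snippet;
illustrative only, with error handling and cleanup elided::

    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/vfio.h>

    /* Find the CXL device info cap and mmap the DPA region it points at. */
    static void *map_dpa_region(int device_fd, size_t *size_out)
    {
    	struct vfio_device_info *info = calloc(1, sizeof(*info));
    	struct vfio_device_info_cap_cxl *cxl = NULL;
    	struct vfio_region_info *region;
    	__u32 off;
    	void *dpa;

    	/* Two-call argsz pattern: the first call reports the full size. */
    	info->argsz = sizeof(*info);
    	ioctl(device_fd, VFIO_DEVICE_GET_INFO, info);
    	info = realloc(info, info->argsz);	/* argsz was written back */
    	ioctl(device_fd, VFIO_DEVICE_GET_INFO, info);

    	if (!(info->flags & VFIO_DEVICE_FLAGS_CXL) ||
    	    !(info->flags & VFIO_DEVICE_FLAGS_CAPS))
    		return NULL;

    	for (off = info->cap_offset; off && !cxl; ) {
    		struct vfio_info_cap_header *hdr =
    			(void *)((char *)info + off);

    		if (hdr->id == VFIO_DEVICE_INFO_CAP_CXL)
    			cxl = (void *)hdr;
    		off = hdr->next;
    	}
    	if (!cxl)
    		return NULL;

    	/* The cap hands us the region index; offset/size come from region info. */
    	region = get_region_info(device_fd, cxl->dpa_region_index);
    	*size_out = region->size;
    	dpa = mmap(NULL, region->size, PROT_READ | PROT_WRITE, MAP_SHARED,
    		   device_fd, region->offset);
    	return dpa == MAP_FAILED ? NULL : dpa;
    }

As the documentation notes, pages are populated lazily on first touch, and the
mapping survives FLR once the decoder reports ``COMMITTED`` again.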