diff --git a/Kbuild b/Kbuild index 94f4589..91ad5c4 100644 --- a/Kbuild +++ b/Kbuild @@ -13,6 +13,7 @@ neuron-objs += neuron_fw_io.o neuron-objs += neuron_dmabuf.o neuron-objs += neuron_log.o neuron-objs += neuron_power.o +neuron-objs += neuron_test.o neuron-objs += vc/neuron_dhal_vc.o neuron-objs += v2/notific.o v2/neuron_dhal_v2.o neuron-objs += v3/notific.o v3/neuron_dhal_v3.o v3/neuron_pelect.o diff --git a/Kbuild.tpl b/Kbuild.tpl new file mode 100644 index 0000000..992b47c --- /dev/null +++ b/Kbuild.tpl @@ -0,0 +1,23 @@ +obj-m += neuron.o + +neuron-objs := neuron_arch.o neuron_dhal.o +neuron-objs += neuron_reg_access.o +neuron-objs += neuron_module.o neuron_pci.o neuron_mempool.o neuron_dma.o neuron_ring.o neuron_ds.o +neuron-objs += neuron_core.o neuron_crwl.o neuron_cdev.o neuron_topsp.o neuron_pid.o +neuron-objs += neuron_reset.o neuron_cinit.o neuron_mmap.o neuron_p2p.o +neuron-objs += neuron_nq.o +neuron-objs += neuron_mc_handle.o +neuron-objs += neuron_metrics.o neuron_sysfs_metrics.o +neuron-objs += udma/udma_iofic.o udma/udma_m2m.o udma/udma_main.o +neuron-objs += neuron_fw_io.o +neuron-objs += neuron_dmabuf.o +neuron-objs += neuron_log.o +neuron-objs += neuron_power.o +neuron-objs += vc/neuron_dhal_vc.o +neuron-objs += v2/notific.o v2/neuron_dhal_v2.o +neuron-objs += v3/notific.o v3/neuron_dhal_v3.o v3/neuron_pelect.o +neuron-objs += v4/neuron_dhal_v4.o +{extra-objs} +ccflags-y += -O3 -Wall -Werror -Wno-declaration-after-statement -Wunused-macros -Wunused-local-typedefs +ccflags-y += -I$(src)/ +ccflags-y += $(call cc-option,-march=armv8.2-a) diff --git a/aws-neuronx-dkms-mkdeb/debian/postinst b/aws-neuronx-dkms-mkdeb/debian/postinst index c4ad0da..e189062 100755 --- a/aws-neuronx-dkms-mkdeb/debian/postinst +++ b/aws-neuronx-dkms-mkdeb/debian/postinst @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/sh # Copyright (C) 2002-2005 Flavio Stanchina # Copyright (C) 2005-2006 Aric Cyr # Copyright (C) 2007 Mario Limonciello diff --git 
a/aws-neuronx-dkms-mkdeb/debian/prerm b/aws-neuronx-dkms-mkdeb/debian/prerm index 654f5fc..61b816d 100755 --- a/aws-neuronx-dkms-mkdeb/debian/prerm +++ b/aws-neuronx-dkms-mkdeb/debian/prerm @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/sh NAME=MODULE_NAME VERSION=MODULE_VERSION diff --git a/aws-neuronx-dkms-mkrpm.spec b/aws-neuronx-dkms-mkrpm.spec index a01839a..c9df3b0 100644 --- a/aws-neuronx-dkms-mkrpm.spec +++ b/aws-neuronx-dkms-mkrpm.spec @@ -7,6 +7,10 @@ %{?!_srcdir: %define _srcdir %_prefix/src} %{?!_datarootdir: %define _datarootdir %{_datadir}} +%define _source_filedigest_algorithm 10 +%define _binary_filedigest_algorithm 10 +%define _binary_payload w7.xzdio + Summary: %{module_name} %{version} dkms package Name: aws-neuronx-dkms Version: %{version} @@ -84,6 +88,9 @@ exit 0 %files %defattr(-,root,root) %{_srcdir} +%attr(755,root,root) %{_srcdir}/%{module_name}-%{version}/preinstall +%attr(755,root,root) %{_srcdir}/%{module_name}-%{version}/postinstall +%attr(755,root,root) %{_srcdir}/%{module_name}-%{version}/postremove %{_datarootdir}/%{module_name}/ %changelog diff --git a/aws-neuronx-dkms-mkrpm/post_install.sh b/aws-neuronx-dkms-mkrpm/post_install.sh new file mode 100644 index 0000000..af4f8f9 --- /dev/null +++ b/aws-neuronx-dkms-mkrpm/post_install.sh @@ -0,0 +1,12 @@ +for POSTINST in /usr/lib/dkms/common.postinst /usr/share/MODULE_NAME/postinst; do + if [ -f $POSTINST ]; then + $POSTINST MODULE_NAME MODULE_VERSION /usr/share/MODULE_NAME + exit $? + fi + echo "WARNING: $POSTINST does not exist." +done +echo -e "ERROR: DKMS version is too old and MODULE_NAME was not" +echo -e "built with legacy DKMS support." +echo -e "You must either rebuild MODULE_NAME with legacy postinst" +echo -e "support or upgrade DKMS to a more current version." 
+exit 1 diff --git a/aws-neuronx-dkms-mkrpm/pre_uninstall.sh b/aws-neuronx-dkms-mkrpm/pre_uninstall.sh new file mode 100644 index 0000000..8e171a9 --- /dev/null +++ b/aws-neuronx-dkms-mkrpm/pre_uninstall.sh @@ -0,0 +1,12 @@ +echo -e +echo -e "Uninstall of MODULE_NAME module (version MODULE_VERSION) beginning:" +if lsmod | grep -q "^neuron "; then + echo "Neuron module is currently loaded. Attempting to unload..." + if ! rmmod neuron 2>/dev/null; then + echo "ERROR: Cannot unload neuron module - it is currently in use." + echo "Please stop all processes using the neuron module before uninstalling." + exit 1 + fi +fi +dkms remove -m MODULE_NAME -v MODULE_VERSION --all --rpm_safe_upgrade +exit 0 diff --git a/dkms.conf b/dkms.conf index ea60afc..f5b504e 100644 --- a/dkms.conf +++ b/dkms.conf @@ -1,5 +1,5 @@ PACKAGE_NAME=aws-neuronx -PACKAGE_VERSION=2.27.4.0 +PACKAGE_VERSION=2.28.0.0 BUILT_MODULE_NAME[0]="neuron" MAKE[0]="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build" CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean" diff --git a/neuron_arch.h b/neuron_arch.h index c27e5b9..831679d 100644 --- a/neuron_arch.h +++ b/neuron_arch.h @@ -24,6 +24,11 @@ enum neuron_platform_type { NEURON_PLATFORM_TYPE_INVALID, }; +enum neuron_platform_operation_type { + NEURON_PLATFORM_OP_TYPE_DEVOPEN = 0, + NEURON_PLATFORM_OP_TYPE_EXEC = 1, +}; + /** * narch_init() - Set neuron devices architecture and revision. 
* diff --git a/neuron_cdev.c b/neuron_cdev.c index 4dfbfe7..dbc12af 100644 --- a/neuron_cdev.c +++ b/neuron_cdev.c @@ -149,7 +149,7 @@ static int ncdev_dma_queue_init(struct neuron_device *nd, void *param) else rxc_mc = NULL; ret = ndmar_queue_init(nd, arg.eng_id, arg.qid, arg.tx_desc_count, arg.rx_desc_count, tx_mc, - rx_mc, rxc_mc, arg.axi_port, false); + rx_mc, rxc_mc, false); return ret; } @@ -173,7 +173,7 @@ static int ncdev_dma_queue_init_batch_entry(struct neuron_device *nd, struct neu else rxc_mc = NULL; ret = ndmar_queue_init(nd, arg->eng_id, arg->qid, arg->tx_desc_count, arg->rx_desc_count, tx_mc, - rx_mc, rxc_mc, arg->axi_port, false); + rx_mc, rxc_mc, false); return ret; } @@ -458,6 +458,7 @@ static int ncdev_mem_alloc_libnrt(struct neuron_device *nd, unsigned int cmd, vo static_assert(NEURON_IOCTL_MEM_ALLOC_V2 != NEURON_IOCTL_MEM_ALLOC_V2MT); static_assert(NEURON_IOCTL_MEM_ALLOC_V2 != NEURON_IOCTL_MEM_ALLOC_V2MT64); static_assert(NEURON_IOCTL_MEM_ALLOC_V2MT != NEURON_IOCTL_MEM_ALLOC_V2MT64); + static_assert(NEURON_IOCTL_MEM_ALLOC_V2MT64 != NEURON_IOCTL_MEM_ALLOC_V2MT64_PA); enum mem_location location; u64 mh; @@ -522,6 +523,53 @@ static int ncdev_mem_alloc_libnrt(struct neuron_device *nd, unsigned int cmd, vo nc_id = mem_alloc_arg.nc_id; mem_type = mem_alloc_arg.mem_type; mem_handle = mem_alloc_arg.mem_handle; + } else if (cmd == NEURON_IOCTL_MEM_ALLOC_V2MT64_PA) { + /* Extended alloc that also returns PA in a single ioctl */ + struct neuron_ioctl_mem_alloc_v2_mem_type64_pa mem_alloc_arg; + ret = neuron_copy_from_user(__func__, &mem_alloc_arg, (struct neuron_ioctl_mem_alloc_v2_mem_type64_pa *)param, + sizeof(mem_alloc_arg)); + if (ret) + return ret; + + size = mem_alloc_arg.size; + align = mem_alloc_arg.align; + host_memory = mem_alloc_arg.host_memory; + dram_channel = mem_alloc_arg.dram_channel; + dram_region = mem_alloc_arg.dram_region; + nc_id = mem_alloc_arg.nc_id; + mem_type = mem_alloc_arg.mem_type; + mem_handle = mem_alloc_arg.mem_handle; + + if 
(host_memory) + location = MEM_LOC_HOST; + else + location = MEM_LOC_DEVICE; + ret = mc_alloc_align(nd, MC_LIFESPAN_CUR_PROCESS, size, align, location, dram_channel, dram_region, nc_id, mem_type, &mc); + if (ret) + return ret; + + trace_ioctl_mem_alloc(nd, mc); + + ret = ncdev_mem_chunk_to_mem_handle(nd, mc, &mh); + if (!ret) + ret = copy_to_user(mem_handle, &mh, sizeof(mc)); + if (ret) { + mc_free(&mc); + return ret; + } + + /* Fill in PA and copy the struct back to userspace */ + if (mc->mem_location == MEM_LOC_HOST) + mem_alloc_arg.pa = mc->pa | ndhal->ndhal_address_map.pci_host_base; + else + mem_alloc_arg.pa = mc->pa; + + ret = copy_to_user(param, &mem_alloc_arg, sizeof(mem_alloc_arg)); + if (ret) { + mc_free(&mc); + return ret; + } + return 0; } else { return -EINVAL; } @@ -1204,7 +1252,12 @@ static int ncdev_mem_buf_zerocopy64(struct neuron_device *nd, unsigned int cmd, // simulation does not have bar4 mapped to the actual memory, don't do it if (use_bar4_wr) { u64 cpy_offset; - ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + offset, size, &cpy_offset); + ret = ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + offset, size, &cpy_offset); + if (unlikely(ret)) { + pr_err("Failed to map address 0x%llx to BAR4\n", mc->pa + offset); + return ret; + } + // copy from user is slow, try fast copy and fall back if fails pagefault_disable(); ret = __copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, buffer, size); @@ -1378,7 +1431,11 @@ static int ncdev_mem_buf_zerocopy64_batch(struct neuron_device *nd, void *param) const nrt_tensor_batch_op_t op = batch->ops_ptr[j]; u64 cpy_offset = 0; - ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + op.offset, op.size, &cpy_offset); + ret = ndhal->ndhal_mmap.mmap_get_bar4_offset(mc->pa + op.offset, op.size, &cpy_offset); + if (unlikely(ret)) { + pr_err("Failed to map address 0x%llx to BAR4\n", mc->pa + op.offset); + goto cleanup; + } // copy from user is slow, try fast copy and fall back if fails pagefault_disable(); ret = 
__copy_from_user_inatomic(nd->npdev.bar4 + cpy_offset, op.buffer, op.size); @@ -1732,7 +1789,6 @@ static long ncdev_nc_reset(struct neuron_device *nd, void *param) if (ret) return ret; - ndmar_close_ncs(nd, arg.nc_map); arg.request_id = task_tgid_nr(current); ret = nr_start_ncs(nd, arg.nc_map, arg.request_id); if (ret) { @@ -1743,7 +1799,6 @@ static long ncdev_nc_reset(struct neuron_device *nd, void *param) static long ncdev_device_reset_deprecated(struct neuron_device *nd) { - ndmar_close(nd); nr_start(nd); return 0; } @@ -1913,7 +1968,8 @@ static long ncdev_driver_info(unsigned int cmd, void *param) NEURON_DRIVER_FEATURE_BATCH_DMAQ_INIT | NEURON_DRIVER_FEATURE_BIG_CORE_MAPS | NEURON_DRIVER_FEATURE_MEM_ALLOC_TYPE | NEURON_DRIVER_FEATURE_HBM_SCRUB | NEURON_DRIVER_FEATURE_MEM_ALLOC64 | NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD | - NEURON_DRIVER_FEATURE_ZEROCOPY; + NEURON_DRIVER_FEATURE_ZEROCOPY | NEURON_DRIVER_FEATURE_PINNED_HOST_MEM | + NEURON_DRIVER_FEATURE_ALLOC_WITH_PA; return copy_to_user(param, &driver_info, sizeof(driver_info)); } @@ -2577,7 +2633,7 @@ static long ncdev_hbm_scrub_start(struct neuron_device *nd, void *param) { uint32_t eng_id = dma_engines[i]; uint32_t qid = 0; ret = ndmar_queue_init(nd, eng_id, qid, allocated_descs, allocated_descs, tx_mc[dma_engines[i]], - rx_mc[dma_engines[i]], NULL, arg.axi_port, true); + rx_mc[dma_engines[i]], NULL, true); if (ret) { pr_err("Failed to initialize DMA queue for engine %d for scrubbing nd%d HBM %d:\n", eng_id, nd->device_index, arg.hbm_index); goto scrub_init_fail; @@ -3108,13 +3164,14 @@ static int ncdev_get_async_h2d_dma_compl_queues(struct neuron_device *nd, void * return ret; } - /* TODO: start h2d kernel thread */ - if (arg.nc_id >= ndhal->ndhal_address_map.nc_per_device) { pr_err("nd%02d: invalid nc %u provided\n", nd->device_index, arg.nc_id); return -EINVAL; } + /* Set up the completion queue (CQ). The completion thread is created on-demand + * when the first async zero-copy request is submitted. 
+ */ memset(arg.compl_queue_info, 0, sizeof(arg.compl_queue_info)); eng_id = ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id(nd, arg.nc_id); @@ -3144,6 +3201,40 @@ static int ncdev_get_async_h2d_dma_compl_queues(struct neuron_device *nd, void * return ret; } +static int ncdev_host_mem_pin(void *param) +{ + struct neuron_ioctl_host_mem_pin arg; + int ret; + + ret = neuron_copy_from_user(__func__, &arg, param, sizeof(arg)); + if (ret) + return ret; + + arg.pa = ~0ULL; /* default: not contiguous */ + ret = ndma_pin_host_memory(arg.va, arg.size, &arg.pa); + if (ret) + return ret; + + /* Copy result (including pa) back to userspace */ + if (copy_to_user(param, &arg, sizeof(arg))) + return -EFAULT; + + return 0; +} + +static int ncdev_host_mem_unpin(void *param) +{ + struct neuron_ioctl_host_mem_unpin arg; + int ret; + + ret = neuron_copy_from_user(__func__, &arg, param, sizeof(arg)); + if (ret) + return ret; + + /* VA-based unregistration - requires exact VA match */ + return ndma_unpin_host_memory(arg.va); +} + inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsigned long param) { if ((cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK) || (cmd == NEURON_IOCTL_CRWL_NC_RANGE_MARK_EXT0)) { return ncdev_crwl_nc_range_mark(filep, cmd, (void *)param); @@ -3178,6 +3269,10 @@ inline static long ncdev_misc_ioctl(struct file *filep, unsigned int cmd, unsign return ncdev_pod_ctrl(filep, cmd, (void *)param); } else if (_IOC_NR(cmd) == _IOC_NR(NEURON_IOCTL_GET_VA_PLACEMENT)) { return ncdev_get_va_placement((void *)param); + } else if (cmd == NEURON_IOCTL_HOST_MEM_PIN) { + return ncdev_host_mem_pin((void*)param); + } else if (cmd == NEURON_IOCTL_HOST_MEM_UNPIN) { + return ncdev_host_mem_unpin((void*)param); } pr_err("invalid misc IOCTL %d (dir=%d, type=%d, nr=%d, size=%d)\n", cmd, _IOC_DIR(cmd), @@ -3408,7 +3503,7 @@ static int ncdev_open(struct inode *inode, struct file *filep) // wait for device init to complete. 
// TODO: implement some better wait system than schedule() - while (nd->device_state == NEURON_DEVICE_STATE_RESET) { + while ((nd->device_state == NEURON_DEVICE_STATE_RESET) || ndhal->ndhal_arch.narch_platform_ready(nd, NEURON_PLATFORM_OP_TYPE_DEVOPEN)) { schedule(); if (sigismember(&current->pending.signal, SIGTERM) || sigismember(&current->pending.signal, SIGKILL)) { mutex_lock(&dev->ncdev_lock); @@ -3449,6 +3544,24 @@ static inline int ncdev_misc_flush(struct file *filep) return 0; } +/* handle any per process cleanup when the process closes + * the last open handle to a Neuron device + */ +static void ncdev_handle_process_exit_if_last(void) +{ + // does the process still have open handles? + int j; + for (j = 0; j < MAX_NEURON_DEVICE_COUNT; j++) { + struct neuron_device *nd = neuron_pci_get_device(j); + if (nd && npid_is_attached(nd)) { + // not the last + return; + } + } + ndma_pinned_mem_cleanup_process(task_tgid_nr(current)); + // add more cleanup here if necessary +} + static int ncdev_flush(struct file *filep, fl_owner_t id) { struct ncdev *dev; @@ -3470,7 +3583,6 @@ static int ncdev_flush(struct file *filep, fl_owner_t id) if (attach_cnt == 1) { // If this proc exited in the middle of a reset, wait for the reset to be processed. nr_wait(nd, task_tgid_nr(current), true); - ndmar_handle_process_exit(nd, task_tgid_nr(current)); msleep(10); // TODO - confirm with HW dev, whether any delay needed after q reset.
ncrwl_release_current_process(nd); @@ -3492,6 +3604,9 @@ static int ncdev_flush(struct file *filep, fl_owner_t id) npid_detach(nd); mutex_unlock(&dev->ncdev_lock); + if (attach_cnt == 1) { + ncdev_handle_process_exit_if_last(); + } return 0; } @@ -3537,6 +3652,13 @@ static int ncdev_mmap(struct file *filep, struct vm_area_struct *vma) return nmmap_mem(nd, vma); } +static unsigned long ncdev_get_unmapped_area(struct file *filep, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return nmmap_get_unmapped_area(filep, addr, len, pgoff, flags); +} + static struct file_operations ncdev_fops = { .owner = THIS_MODULE, .open = ncdev_open, @@ -3544,18 +3666,26 @@ static struct file_operations ncdev_fops = { .release = ncdev_release, .unlocked_ioctl = ncdev_ioctl, .mmap = ncdev_mmap, + .get_unmapped_area = ncdev_get_unmapped_area, }; static ssize_t device_reset_show(struct device *dev, struct device_attribute *attr, char *buf) { int minor = MINOR(dev->devt); - return sprintf(buf, "%d\n", devnodes[minor].ndev->device_state); + if (minor >= NEURON_MAX_DEV_NODES) { + return -ENODEV; + } + return scnprintf(buf, PAGE_SIZE, "%d\n", devnodes[minor].ndev->device_state); } static ssize_t driver_reset_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + struct ncdev *devnode; int minor = MINOR(dev->devt); - struct ncdev *devnode = &devnodes[minor]; + if (minor >= NEURON_MAX_DEV_NODES) { + return -ENODEV; + } + devnode = &devnodes[minor]; mutex_lock(&devnode->ncdev_lock); if (devnode->open_count == 0) { // only trigger sysfs reset if the device is not opened by app @@ -3570,24 +3700,33 @@ static DEVICE_ATTR(reset, S_IWUSR | S_IRUSR, device_reset_show, driver_reset_sto static ssize_t neuron_core_count_show(struct device *dev, struct device_attribute *attr, char *buf) { - int ret = 0; - ret = sprintf(buf, "%d", ndhal->ndhal_address_map.nc_per_device); - return ret; + // We would prefer to have a newline here 
for consistency with other output but + // that breaks backward compatibility with the device plugin. They are fixing the + // plugin to handle if a newline is present, so in the future we can add it when + // older plugins are not a concern - https://tiny.amazon.com/k1ezuoub + return scnprintf(buf, PAGE_SIZE, "%d", ndhal->ndhal_address_map.nc_per_device); } -static DEVICE_ATTR(core_count, S_IRUSR, neuron_core_count_show, NULL); +static DEVICE_ATTR(core_count, S_IRUGO, neuron_core_count_show, NULL); #define CONNECTED_DEVICES_MAX_LEN 20 static ssize_t neuron_connected_devices_show(struct device *dev, struct device_attribute *attr, char *buf) { + int ret = 0; int i = 0; int offset = 0; - int minor = MINOR(dev->devt); // neuron device id - struct ncdev *devnode = &devnodes[minor]; - struct neuron_device *nd = devnode->ndev; + struct ncdev *devnode; + struct neuron_device *nd; u32 connected_devices[MAX_NEURON_DEVICE_COUNT]; int connected_device_count = 0; - int ret = ndhal->ndhal_fw_io.fw_io_topology(nd->fw_io_ctx, nd->pdev->device, minor, connected_devices, &connected_device_count); + + int minor = MINOR(dev->devt); // neuron device id + if (minor >= NEURON_MAX_DEV_NODES) { + return -ENODEV; + } + devnode = &devnodes[minor]; + nd = devnode->ndev; + ret = ndhal->ndhal_fw_io.fw_io_topology(nd->fw_io_ctx, nd->pdev->device, minor, connected_devices, &connected_device_count); if (ret) return ret; @@ -3610,27 +3749,53 @@ static ssize_t neuron_connected_devices_show(struct device *dev, struct device_a return offset; } -static DEVICE_ATTR(connected_devices, S_IRUSR, neuron_connected_devices_show, NULL); +static DEVICE_ATTR(connected_devices, S_IRUGO, neuron_connected_devices_show, NULL); static ssize_t fw_api_version_show(struct device *dev, struct device_attribute *attr, char *buf) -{ int fw_api_version; +{ + struct neuron_device *nd; + int fw_api_version; int minor = MINOR(dev->devt); - struct neuron_device *nd = devnodes[minor].ndev; + if (minor >= NEURON_MAX_DEV_NODES) { 
+ return -ENODEV; + } + nd = devnodes[minor].ndev; fw_io_api_version_read(nd->npdev.bar0, &fw_api_version); if (fw_api_version == 0xdeadbeef) { // the value is not readable during reset, try later - return sprintf(buf, "busy\n"); + return scnprintf(buf, PAGE_SIZE, "busy\n"); } - return sprintf(buf, "%u\n", fw_api_version); + return scnprintf(buf, PAGE_SIZE, "%u\n", fw_api_version); } static DEVICE_ATTR(fw_api_version, S_IRUGO, fw_api_version_show, NULL); +static ssize_t fw_build_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + int ret; + struct neuron_device *nd; + u32 fw_build; + int minor = MINOR(dev->devt); + if (minor >= NEURON_MAX_DEV_NODES) { + return -ENODEV; + } + nd = devnodes[minor].ndev; + + ret = fw_io_fw_build_read(nd->npdev.bar0, &fw_build); + if (ret || fw_build == 0xdeadbeef) { // the value is not readable during reset, try later + return scnprintf(buf, PAGE_SIZE, "busy\n"); + } + return scnprintf(buf, PAGE_SIZE, "%u\n", fw_build); +} + +static DEVICE_ATTR(fw_build, S_IRUGO, fw_build_show, NULL); + static struct attribute *attrs[] = { &dev_attr_reset.attr, &dev_attr_core_count.attr, &dev_attr_connected_devices.attr, &dev_attr_fw_api_version.attr, + &dev_attr_fw_build.attr, NULL, }; @@ -3837,6 +4002,22 @@ static ssize_t ncdev_class_ultraserver_mode_show(struct class *class, struct cla return ndhal->ndhal_npe.npe_class_ultraserver_mode_show_data(buf); } +static bool platform_device_initialization_inprogress(void) +{ + return total_neuron_devices == 0; +} + +static bool platform_device_initialization_successful(void) +{ + int i; + for (i = 0; i < total_neuron_devices; i++) { + if (neuron_devices[i] == NULL) { + return false; + } + } + return true; +} + #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 4, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) static ssize_t ncdev_class_hbm_7200_show(const struct class *class, const struct class_attribute *attr, char 
*buf) #else @@ -3845,10 +4026,15 @@ static ssize_t ncdev_class_hbm_7200_show(struct class *class, struct class_attri { int i; int supports_hbm_7200 = 1; - if (total_neuron_devices == 0) { + + if (platform_device_initialization_inprogress()) { return dhal_sysfs_emit(buf, "busy\n"); } + if (!platform_device_initialization_successful()) { + return dhal_sysfs_emit(buf, "init_error\n"); + } + for (i = 0; i < total_neuron_devices; i++) { if (neuron_devices[i]->supports_hbm_7200 == -1) { return dhal_sysfs_emit(buf, "busy\n"); @@ -3867,10 +4053,15 @@ static ssize_t ncdev_class_cur_perf_profile_show(struct class *class, struct cla { int i; int cur_perf_profile; - if (total_neuron_devices == 0) { + + if (platform_device_initialization_inprogress()) { return dhal_sysfs_emit(buf, "busy\n"); } + if (!platform_device_initialization_successful()) { + return dhal_sysfs_emit(buf, "init_error\n"); + } + cur_perf_profile = neuron_devices[0]->current_perf_profile; for (i = 1; i < total_neuron_devices; i++) { if (neuron_devices[i]->current_perf_profile != cur_perf_profile) { diff --git a/neuron_crwl.c b/neuron_crwl.c index 04a48e1..f528f1c 100644 --- a/neuron_crwl.c +++ b/neuron_crwl.c @@ -219,6 +219,7 @@ int ncrwl_nc_range_mark(u32 nc_count, u32 start_nc_index, u32 end_nc_index, ncrwl_range_mark_cnt++; } mutex_unlock(&ncrwl_range_lock); + pr_info("pid:%d claiming neuron cores: %02d-%02d", task_tgid_nr(current), i, i + nc_count - 1); return 0; } if (*max_range < range_len) @@ -232,9 +233,13 @@ int ncrwl_nc_range_mark(u32 nc_count, u32 start_nc_index, u32 end_nc_index, void ncrwl_nc_range_unmark(volatile long unsigned int *free_map) { int i; + int first = -1; + int last; mutex_lock(&ncrwl_range_lock); for (i = 0; i < MAX_NEURON_DEVICE_COUNT * MAX_NC_PER_DEVICE; i++) { if (test_bit(i, free_map) && ncrwl_range_pids[i] == task_tgid_nr(current)) { + first = (first == -1) ? 
i : first; + last = i; ncrwl_range_pids[i] = 0; ncrwl_range_mark_cnt--; } @@ -242,6 +247,9 @@ void ncrwl_nc_range_unmark(volatile long unsigned int *free_map) ndhal->ndhal_npe.npe_notify_mark(ncrwl_range_mark_cnt, false); } mutex_unlock(&ncrwl_range_lock); + if (first != -1) { + pr_info("pid:%d releasing neuron core in range: %02d-%02d", task_tgid_nr(current), first, last); + } } int ncrwl_nc_range_pid_get( uint32_t nc_index, pid_t *pid) diff --git a/neuron_device.h b/neuron_device.h index 8af4f29..fa67c5b 100644 --- a/neuron_device.h +++ b/neuron_device.h @@ -71,6 +71,7 @@ struct neuron_device { struct pci_dev *pdev; int device_index; volatile enum neuron_device_state device_state; // current state of this device + struct mutex lock; // serialize neuron_device access when device-wide exclusion is needed // all the processes that are opened this device struct neuron_attached_process attached_processes[NEURON_MAX_PROCESS_PER_DEVICE]; @@ -126,6 +127,9 @@ struct neuron_device { // volatile to prevent compiler optimizations since accessed by different threads // This is the true value per-device, instead of the global one in ndhal_perf used only for metrics volatile int current_perf_profile; + + // DMA completion thread for async IO + struct ndma_h2d_dma_cmpltn_thread dma_cmpltn_thread; }; #endif diff --git a/neuron_dhal.h b/neuron_dhal.h index dbce141..be38ead 100644 --- a/neuron_dhal.h +++ b/neuron_dhal.h @@ -26,6 +26,7 @@ struct ndhal_arch { int arch; enum neuron_platform_type platform_type; u32 server_id; + int (*narch_platform_ready) (struct neuron_device *nd, enum neuron_platform_operation_type platform_operation); }; struct ndhal_address_map { // addresses @@ -36,7 +37,6 @@ struct ndhal_address_map { uint64_t mmap_nc_sema_incr_offset; uint64_t mmap_nc_sema_decr_offset; uint64_t bar0_misc_ram_offset; - uint64_t port_1_base; // counts int nc_per_device; @@ -90,6 +90,8 @@ struct ndhal_mpset { struct ndhal_ndmar { uint32_t (*ndmar_get_h2t_eng_id) (struct 
neuron_device *nd, uint32_t nc_id); int (*ndmar_get_h2t_def_qid) (uint32_t nc_id); + int (*ndmar_ctx_queue_bit) (uint32_t h2d_eng_id, uint32_t qid); + void (*ndmar_ctx_queue_from_bit) (int bit, uint32_t *h2d_eng_id, uint32_t *qid); bool (*ndmar_is_h2t_def_q) (struct neuron_device *nd, uint32_t eng_id, uint32_t q_id); bool (*nr_init_h2t_eng) ( int nc_idx, uint32_t nc_map); bool (*ndmar_is_nx_ring) (uint32_t eng_id, uint32_t q_id); @@ -97,6 +99,7 @@ struct ndhal_ndmar { }; struct ndhal_fw_io { + u32 new_readless_read_min_api_version; int (*fw_io_topology) (struct fw_io_ctx *ctx, int pdev_index, int device_id, u32 *connected_device_ids, int *count); int (*fw_io_register_readless_read_region) (struct fw_io_ctx *ctx, void __iomem *bar0, u64 bar0_size, void __iomem *bar2, u64 bar2_size); int (*fw_io_read_csr_array) (void **addrs, u32 *values, u32 num_csrs, bool operational); @@ -131,6 +134,8 @@ struct ndhal_sysfs_metrics { int nc_id, int tensor_engine_attrs_info_tbl_cnt, const nsysfsmetric_attr_info_t *tensor_engine_attr_info_tbl); + + bool health_status_enabled; }; struct ndhal_pci { diff --git a/neuron_dma.c b/neuron_dma.c index 32e7d43..901048b 100644 --- a/neuron_dma.c +++ b/neuron_dma.c @@ -11,6 +11,9 @@ #include #include #include +#include +#include + #include "udma/udma.h" #include "neuron_trace.h" @@ -33,6 +36,37 @@ MODULE_PARM_DESC(zerocopy_trn1_override, "override zerocopy for trn1"); struct neuron_device; +/* data structures for explicit pin/unpin API */ + +/** + * struct neuron_pinned_mem - Tracks a pre-pinned host memory region + * @va: User virtual address that was pinned (lookup key) + * @size: Size of the pinned region in bytes + * @nr_pages: Number of pages pinned + * @pages: Array of pinned page pointers + * @rb_node: Red-black tree node for efficient VA lookup + * + * Process isolation is structural: each process has its own rbtree + * in the global hash table, so no pid field is needed here. 
+ */ +struct neuron_pinned_mem { + u64 va; /* lookup key - user virtual address */ + u64 size; + unsigned long nr_pages; + struct page **pages; + struct rb_node rb_node; /* for VA-based lookup */ +}; + +/* Per-process pinned memory state */ +struct neuron_pinned_mem_process { + pid_t pid; + struct rb_root root; /* rbtree of pinned regions for this process */ + struct mutex lock; /* protects this process's rbtree */ + struct kref refcount; /* lifetime management; freed when last ref drops */ + struct hlist_node hash_node; /* for hash table lookup */ +}; + + static void ndma_ack_completed_desc(struct ndma_eng *eng, struct ndma_ring *ring, u32 count) { struct udma_q *rxq, *txq; @@ -935,6 +969,10 @@ struct ndma_h2t_zcdma_context { struct page **page_list; // page structures tracking our pinned pages; // managed by page_list_pool in ctx queue enum ndma_zcdma_state state; // state of this transfer + struct neuron_pinned_mem_process *prepin_proc; // ref counted ptr to per process store of pinned memories + // when set indicates that 1/ the context uses pre-pinned mem and it should not be unpinned + // 2/ prevents process exit cleanup from unpinning the memory while used by the context + pid_t pid; // PID of the process that initiated the copy // Completion-related void *completion_ptr; // completion buffer pointer; @@ -946,20 +984,29 @@ struct ndma_h2t_zcdma_context { struct mm_struct *mm; // mm that owns the user buffers }; +static void ndma_pinned_mem_process_release(struct kref *kref); + static void ndma_zc_release_ctx(struct ndma_h2t_zcdma_context *ctx, u64 *nr_pinned_pages) { // do not free or set completion_ptr null. it is managed by completion_pool in ctx queue // do not free or set page_list null. 
it is managed by page_list_pool in ctx queue if (ctx->state >= NDMA_PINNED_UNSUBMITTED) { - if (ctx->direction) { - unpin_user_pages(ctx->page_list, ctx->nr_pages); + /* Only unpin if we pinned it ourselves (not pre-pinned memory) */ + if (!ctx->prepin_proc) { + if (ctx->direction) { + unpin_user_pages(ctx->page_list, ctx->nr_pages); + } else { + unpin_user_pages_dirty_lock(ctx->page_list, ctx->nr_pages, true); + } } else { - unpin_user_pages_dirty_lock(ctx->page_list, ctx->nr_pages, true); + kref_put(&ctx->prepin_proc->refcount, ndma_pinned_mem_process_release); } + *nr_pinned_pages -= ctx->nr_pages; } ctx->nr_pages = 0; + ctx->prepin_proc = NULL; if (ctx->mm) { mmput(ctx->mm); @@ -1300,7 +1347,7 @@ int ndma_ctx_queue_init(struct ndma_ctx_queue *queue) } // allocate page_list arrays in one contiguous pool, and let each entry point to its slice - queue->page_list_pool = kcalloc(capacity * NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); + queue->page_list_pool = kvcalloc(capacity * NDMA_ZC_PAGES_PER_XFER, sizeof(struct page *), GFP_KERNEL); if (!queue->page_list_pool) { pr_err("failed to allocate ctx queue page_list pool\n"); goto err; @@ -1321,7 +1368,7 @@ int ndma_ctx_queue_init(struct ndma_ctx_queue *queue) queue->completion_pool = NULL; } if (queue->page_list_pool) { - kfree(queue->page_list_pool); + kvfree(queue->page_list_pool); queue->page_list_pool = NULL; } if (queue->entries) { @@ -1333,6 +1380,10 @@ int ndma_ctx_queue_init(struct ndma_ctx_queue *queue) void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct ndma_ctx_queue *queue) { + int bit = ndhal->ndhal_ndmar.ndmar_ctx_queue_bit(eng->eng_id, ring->qid); + + atomic64_andnot(BIT_ULL(bit), &eng->nd->dma_cmpltn_thread.nonempty_ctxq_bitmap); + if (!queue) { return; } @@ -1346,7 +1397,7 @@ void ndma_ctx_queue_free(struct ndma_eng *eng, struct ndma_ring *ring, struct nd queue->completion_pool = NULL; } if (queue->page_list_pool) { - kfree(queue->page_list_pool); + 
kvfree(queue->page_list_pool); queue->page_list_pool = NULL; } memset(queue, 0, sizeof(*queue)); @@ -1533,6 +1584,8 @@ static bool ndma_zc_should_wait(struct ndma_eng *eng, return false; } +static bool ndma_pinned_mem_try_populate(pid_t pid, u64 va, u64 size, struct page **page_list, int nr_pages, struct neuron_pinned_mem_process **prepin_proc); + static int ndma_zerocopy_pin_pages(int nd_id, u32 nc_id, struct ndma_ctx_queue *ctx_queue, @@ -1540,6 +1593,16 @@ static int ndma_zerocopy_pin_pages(int nd_id, bool use_remote_pin) { int nr_pinned = 0; + struct neuron_pinned_mem_process *prepin_proc = NULL; + + /* Check if this VA range is in pre-pinned memory */ + if (ndma_pinned_mem_try_populate(dma_ctx->pid, (u64)dma_ctx->host_addr, dma_ctx->size, + dma_ctx->page_list, dma_ctx->nr_pages, &prepin_proc)) { + dma_ctx->prepin_proc = prepin_proc; + ctx_queue->nr_pinned_pages += dma_ctx->nr_pages; + dma_ctx->state = NDMA_PINNED_UNSUBMITTED; + return 0; + } if (use_remote_pin) { if (!dma_ctx->mm) { @@ -1627,6 +1690,13 @@ int ndma_zerocopy_submit(struct neuron_device *nd, return -ENOENT; } + if (async) { + ret = ndma_h2d_create_cmpltn_thread(nd); + if (ret) { + return ret; + } + } + mutex_lock(&ring->h2t_ring_lock); for (i = 0; i < num_ops; i++) { @@ -1685,6 +1755,7 @@ int ndma_zerocopy_submit(struct neuron_device *nd, cur_ctx->state = NDMA_UNPINNED; cur_ctx->nr_desc = 0; // Set by ndma_build_n_issue_zc_descs(). cur_ctx->mm = NULL; + cur_ctx->pid = task_tgid_nr(current); cur_ctx->sequence_num = sequence_num; /* Pin now if possible; otherwise capture mm for remote pinning (async only). 
*/ @@ -1770,25 +1841,33 @@ int ndma_zerocopy_submit(struct neuron_device *nd, if (ret) { ndma_ctx_queue_drain(eng, ring, ctx_queue); } + mutex_unlock(&ring->h2t_ring_lock); + + if (!ret && async) { + int bit = ndhal->ndhal_ndmar.ndmar_ctx_queue_bit(eng_id, qid); + atomic64_or(BIT_ULL(bit), &nd->dma_cmpltn_thread.nonempty_ctxq_bitmap); // set the bit for this queue + wake_up(&nd->dma_cmpltn_thread.wait_queue); + } + return ret; } /* The completion flow for completion, remote pinning, and submission. Async IO only */ -static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd, - struct ndma_eng *eng, - struct ndma_ring *ring, - bool *did_work) +static int ndma_zerocopy_complete(struct neuron_device *nd, + struct ndma_eng *eng, + struct ndma_ring *ring, + u64 *nonempty_ctxq_bitmap_copy) { int ret = 0; int err = 0; + bool did_work = false; struct ndma_ctx_queue *ctx_queue = NULL; u32 desc_threshold = NDMA_ZC_DESC_WAIT_THRESHOLD_LO; - if (!ring || !did_work) { + if (!ring) { return -EINVAL; } - *did_work = false; ctx_queue = &ring->dma_ctx_queue; @@ -1799,7 +1878,12 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd, if (ndma_ctx_queue_submitted_empty(ctx_queue)) { break; } - if (*did_work && !ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) { + /* + * Async completion must always retire at least one submitted context. + * Only fall back to the wait-throttling heuristic after we have made + * some forward progress in this pass. 
+ */ + if (did_work && !ndma_zc_should_wait(eng, ring, ctx_queue, &desc_threshold)) { break; } struct ndma_h2t_zcdma_context *submitted_ctx = ndma_ctx_queue_pop_submitted(ctx_queue); @@ -1815,8 +1899,7 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd, } ndma_zc_release_ctx(submitted_ctx, &ctx_queue->nr_pinned_pages); - - *did_work = true; + did_work = true; } /* 2) Submit pinned but unsubmitted contexts */ @@ -1836,8 +1919,7 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd, } else { ndma_ctx_queue_inc_first_pinned_unsubmitted(ctx_queue); } - - *did_work = true; + did_work = true; } /* 3) Remote pin unpinned contexts */ @@ -1857,10 +1939,478 @@ static __maybe_unused int ndma_zerocopy_complete(struct neuron_device *nd, } else { ndma_ctx_queue_inc_first_unpinned(ctx_queue); } - - *did_work = true; + did_work = true; } mutex_unlock(&ring->h2t_ring_lock); + + if (ndma_ctx_queue_is_empty(ctx_queue)) { + int bit = ndhal->ndhal_ndmar.ndmar_ctx_queue_bit(eng->eng_id, ring->qid); + *nonempty_ctxq_bitmap_copy &= ~BIT_ULL(bit); + } + return err; } + +static int ndma_h2d_cmpltn_thread_fn(void *arg) +{ + struct neuron_device *nd = (struct neuron_device *)arg; + int ret = 0; + + while (!kthread_should_stop() && !nd->dma_cmpltn_thread.stop) { + wait_event_interruptible(nd->dma_cmpltn_thread.wait_queue, + nd->dma_cmpltn_thread.stop || atomic64_read(&nd->dma_cmpltn_thread.nonempty_ctxq_bitmap) != 0); + if (kthread_should_stop() || nd->dma_cmpltn_thread.stop) { + break; + } + u64 bitmap = atomic64_xchg(&nd->dma_cmpltn_thread.nonempty_ctxq_bitmap, 0); + + while (bitmap) { + int bit = __ffs64(bitmap); + u32 eng_id; + u32 qid; + struct ndma_eng *eng; + struct ndma_ring *ring; + + ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit(bit, &eng_id, &qid); + + eng = &nd->ndma_engine[eng_id]; + ring = &eng->queues[qid].ring_info; + ret = ndma_zerocopy_complete(nd, eng, ring, &bitmap); + if (ret) { + pr_err("dma completion thread failed to 
process ctx queue for eng %d q %d: %d\n", eng_id, qid, ret); + } + } + } + + return ret; +} + +int ndma_h2d_create_cmpltn_thread(struct neuron_device *nd) +{ + int ret = 0; + struct task_struct *thread; + + if (READ_ONCE(nd->dma_cmpltn_thread.thread)) { + return 0; + } + + mutex_lock(&nd->lock); + + if (nd->dma_cmpltn_thread.thread) { + /* thread already created */ + goto out; + } + + nd->dma_cmpltn_thread.stop = false; + init_waitqueue_head(&nd->dma_cmpltn_thread.wait_queue); + atomic64_set(&nd->dma_cmpltn_thread.nonempty_ctxq_bitmap, 0); + thread = kthread_run(ndma_h2d_cmpltn_thread_fn, nd, "neuron dma cmpltn"); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + pr_err("h2d dma completion thread creation failed\n"); + goto out; + } + WRITE_ONCE(nd->dma_cmpltn_thread.thread, thread); + +out: + mutex_unlock(&nd->lock); + return ret; +} + +void ndma_h2d_stop_cmpltn_thread(struct neuron_device *nd) +{ + if (!nd->dma_cmpltn_thread.thread) { + return; + } + if (IS_ERR(nd->dma_cmpltn_thread.thread)) { + nd->dma_cmpltn_thread.thread = NULL; + return; + } + + nd->dma_cmpltn_thread.stop = true; + wake_up(&nd->dma_cmpltn_thread.wait_queue); + kthread_stop(nd->dma_cmpltn_thread.thread); + nd->dma_cmpltn_thread.thread = NULL; +} + +/* + * Pre-pinned host memory implementation + * Uses a global hash table keyed by PID, with each process having its own + * rbtree of pinned memory regions keyed by VA. + * Host memory is not device-specific — a process can pin via any device + * and the zerocopy path on any device will find the pre-pinned region. + */ + +/* 256 buckets: up to 16 devices × 16 processes per device */ +static DEFINE_HASHTABLE(pinned_mem_htable, 8); +static DEFINE_MUTEX(pinned_mem_htable_lock); /* protects hash table add/remove/lookup only */ + +/* + * Find or create per-process state and take a reference. + * Caller must hold pinned_mem_htable_lock; caller owns the returned ref. 
+ */ +static struct neuron_pinned_mem_process *ndma_pinned_mem_get_process_locked(pid_t pid) +{ + struct neuron_pinned_mem_process *proc; + + hash_for_each_possible(pinned_mem_htable, proc, hash_node, pid) { + if (proc->pid == pid) { + kref_get(&proc->refcount); + return proc; + } + } + + proc = kzalloc(sizeof(*proc), GFP_KERNEL); + if (!proc) + return NULL; + proc->pid = pid; + proc->root = RB_ROOT; + mutex_init(&proc->lock); + kref_init(&proc->refcount); /* hash table holds initial ref */ + hash_add(pinned_mem_htable, &proc->hash_node, pid); + kref_get(&proc->refcount); /* caller's operational ref */ + return proc; +} + +/* + * Find per-process state and take a reference. + * Caller must hold pinned_mem_htable_lock; caller owns the returned ref. + * Returns NULL if not found (no ref taken). + */ +static struct neuron_pinned_mem_process *ndma_pinned_mem_find_process_locked(pid_t pid) +{ + struct neuron_pinned_mem_process *proc; + + hash_for_each_possible(pinned_mem_htable, proc, hash_node, pid) { + if (proc->pid == pid) { + kref_get(&proc->refcount); + return proc; + } + } + return NULL; +} + +static void ndma_pinned_mem_free_entry(struct neuron_pinned_mem *entry) +{ + if (entry->pages) { + unpin_user_pages(entry->pages, entry->nr_pages); + kvfree(entry->pages); + } + kfree(entry); +} + +static void ndma_pinned_mem_destroy_tree(struct rb_root *root) +{ + struct rb_node *node; + + while ((node = rb_first(root)) != NULL) { + struct neuron_pinned_mem *entry = rb_entry(node, struct neuron_pinned_mem, rb_node); + rb_erase(node, root); + ndma_pinned_mem_free_entry(entry); + } +} + +void ndma_pinned_mem_destroy(void) +{ + struct neuron_pinned_mem_process *proc; + struct hlist_node *tmp; + int bkt; + + mutex_lock(&pinned_mem_htable_lock); + hash_for_each_safe(pinned_mem_htable, bkt, tmp, proc, hash_node) { + hash_del(&proc->hash_node); + ndma_pinned_mem_destroy_tree(&proc->root); + kfree(proc); + } + mutex_unlock(&pinned_mem_htable_lock); +} + +static void 
ndma_pinned_mem_process_release(struct kref *kref) +{ + struct neuron_pinned_mem_process *proc = + container_of(kref, struct neuron_pinned_mem_process, refcount); + mutex_lock(&proc->lock); // this is likely unnecessary because when we get here proc has been removed from the hash table + // on process exit and nobody can find this entry anymore + ndma_pinned_mem_destroy_tree(&proc->root); + mutex_unlock(&proc->lock); + kfree(proc); +} + +/* Find by exact VA match (for unpin) - caller must hold lock */ +static struct neuron_pinned_mem *ndma_pinned_mem_find_exact_locked(struct rb_root *root, u64 va) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct neuron_pinned_mem *entry = rb_entry(node, struct neuron_pinned_mem, rb_node); + + if (va < entry->va) + node = node->rb_left; + else if (va > entry->va) + node = node->rb_right; + else + return entry; /* exact match */ + } + return NULL; +} + +/* Find region containing VA range (for zerocopy) - caller must hold lock */ +static struct neuron_pinned_mem *ndma_pinned_mem_find_containing_locked(struct rb_root *root, u64 va, u64 size) +{ + struct rb_node *node = root->rb_node; + u64 va_end = va + size; + + while (node) { + struct neuron_pinned_mem *entry = rb_entry(node, struct neuron_pinned_mem, rb_node); + u64 entry_end = entry->va + entry->size; + + if (va_end <= entry->va) { + /* Range is entirely before this entry */ + node = node->rb_left; + } else if (va >= entry_end) { + /* Range is entirely after this entry */ + node = node->rb_right; + } else if (va >= entry->va && va_end <= entry_end) { + /* Range is fully contained within this entry */ + return entry; + } else { + /* Partial overlap - not supported, return NULL */ + return NULL; + } + } + return NULL; +} + +/* Insert into rbtree - caller must hold lock */ +static int ndma_pinned_mem_insert_locked(struct rb_root *root, struct neuron_pinned_mem *new) +{ + struct rb_node **link = &root->rb_node; + struct rb_node *parent = NULL; + u64 new_end = 
new->va + new->size; + + while (*link) { + struct neuron_pinned_mem *entry = rb_entry(*link, struct neuron_pinned_mem, rb_node); + u64 entry_end = entry->va + entry->size; + + parent = *link; + if (new->va < entry->va) { + /* Check for overlap */ + if (new_end > entry->va) + return -EEXIST; /* overlaps */ + link = &(*link)->rb_left; + } else if (new->va > entry->va) { + /* Check for overlap */ + if (new->va < entry_end) + return -EEXIST; /* overlaps */ + link = &(*link)->rb_right; + } else { + return -EEXIST; /* exact duplicate */ + } + } + + rb_link_node(&new->rb_node, parent, link); + rb_insert_color(&new->rb_node, root); + return 0; +} + +/** + * ndma_check_pages_contiguous() - Check if pinned pages are physically contiguous + * @pages: Array of pinned pages + * @nr_pages: Number of pages + * @offset: Byte offset within the first page + * + * Return: Physical address of the start of the region if all pages are + * contiguous, or ~0ULL if they are not. + */ +static u64 ndma_check_pages_contiguous(struct page **pages, unsigned long nr_pages, unsigned long offset) +{ + unsigned long i; + + for (i = 1; i < nr_pages; i++) { + if (page_to_phys(pages[i]) != page_to_phys(pages[i - 1]) + PAGE_SIZE) + return ~0ULL; + } + return (page_to_phys(pages[0]) + offset) | ndhal->ndhal_address_map.pci_host_base; +} + +int ndma_pin_host_memory(u64 va, u64 size, u64 *pa_out) +{ + struct neuron_pinned_mem *entry; + struct neuron_pinned_mem_process *proc; + unsigned long offset = va & (PAGE_SIZE - 1); + unsigned long nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); + int ret; + long pinned; + + if (va == 0 || size == 0) + return -EINVAL; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!entry->pages) { + ret = -ENOMEM; + goto err_free_entry; + } + + /* Try fast path first - doesn't require mmap_lock */ + pinned = pin_user_pages_fast(va & PAGE_MASK, nr_pages, 
FOLL_WRITE | FOLL_LONGTERM, entry->pages); + if (pinned < 0 || pinned < nr_pages) { + /* Fast path failed or incomplete - fall back to slow path */ + if (pinned > 0) + unpin_user_pages(entry->pages, pinned); + + /* Slow path with mmap_lock */ + mmap_read_lock(current->mm); +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 5, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 6))) + pinned = pin_user_pages(va & PAGE_MASK, nr_pages, FOLL_WRITE | FOLL_LONGTERM, entry->pages); +#else + pinned = pin_user_pages(va & PAGE_MASK, nr_pages, FOLL_WRITE | FOLL_LONGTERM, entry->pages, NULL); +#endif + mmap_read_unlock(current->mm); + + if (pinned < 0) { + pr_err("failed to pin pages: %ld\n", pinned); + ret = pinned; + goto err_free_pages; + } + if (pinned < nr_pages) { + pr_err("could not pin all pages: %ld/%lu\n", pinned, nr_pages); + unpin_user_pages(entry->pages, pinned); + ret = -EFAULT; + goto err_free_pages; + } + } + + entry->va = va; + entry->size = size; + entry->nr_pages = nr_pages; + RB_CLEAR_NODE(&entry->rb_node); + + mutex_lock(&pinned_mem_htable_lock); + proc = ndma_pinned_mem_get_process_locked(task_tgid_nr(current)); + mutex_unlock(&pinned_mem_htable_lock); + if (!proc) { + ret = -ENOMEM; + goto err_unpin; + } + // here and elsewhere, slightly non-obvious. + // we ref counting proc to make sure it's not deleted in the + // unlikely case the process is detached while we are here. Not + // possible to happen in this function because it's called from IOCTL + // but a general pattern is to 1/ lock the hashtable 2/ return ref counted + // proc entry, 3/ operate on the entry and 4/ decrement the count + // this is specifically relevant for async zerocopy case getting pinned pages + // from proc because it's running as an independent thread. 
+ mutex_lock(&proc->lock); + ret = ndma_pinned_mem_insert_locked(&proc->root, entry); + /* Report contiguous PA if all pinned pages are physically adjacent. Must be done under + * proc->lock: once inserted, a concurrent unpin of this VA may free entry after unlock. */ + if (!ret && pa_out) + *pa_out = ndma_check_pages_contiguous(entry->pages, nr_pages, offset); + mutex_unlock(&proc->lock); + kref_put(&proc->refcount, ndma_pinned_mem_process_release); + if (ret) { + pr_err("Failed to register, likely due to app failure to unpin previous mmap()\n"); + goto err_unpin; + } + + return 0; + +err_unpin: + unpin_user_pages(entry->pages, nr_pages); +err_free_pages: + kvfree(entry->pages); +err_free_entry: + kfree(entry); + return ret; +} + +int ndma_unpin_host_memory(u64 va) +{ + struct neuron_pinned_mem *entry; + struct neuron_pinned_mem_process *proc; + + mutex_lock(&pinned_mem_htable_lock); + proc = ndma_pinned_mem_find_process_locked(task_tgid_nr(current)); + mutex_unlock(&pinned_mem_htable_lock); + if (!proc) + return -ENOENT; + + mutex_lock(&proc->lock); + entry = ndma_pinned_mem_find_exact_locked(&proc->root, va); + if (!entry) { + mutex_unlock(&proc->lock); + kref_put(&proc->refcount, ndma_pinned_mem_process_release); + return -ENOENT; + } + + rb_erase(&entry->rb_node, &proc->root); + mutex_unlock(&proc->lock); + kref_put(&proc->refcount, ndma_pinned_mem_process_release); + + ndma_pinned_mem_free_entry(entry); + return 0; +} + +/* Used by zero-copy API to use pinned pages instead of pinning on demand + * the copy can run either as part of IOCTL or in async thread, it takes PID + * of the process that pinned the pages. 
+ */ +static bool ndma_pinned_mem_try_populate(pid_t pid, u64 va, u64 size, struct page **page_list, int nr_pages, struct neuron_pinned_mem_process **prepin_proc) +{ + struct neuron_pinned_mem_process *proc; + struct neuron_pinned_mem *entry; + bool found = false; + + *prepin_proc = NULL; + + mutex_lock(&pinned_mem_htable_lock); + proc = ndma_pinned_mem_find_process_locked(pid); + mutex_unlock(&pinned_mem_htable_lock); + + if (proc) { + mutex_lock(&proc->lock); + entry = ndma_pinned_mem_find_containing_locked(&proc->root, va, size); + if (entry) { + unsigned long va_start = va & PAGE_MASK; + unsigned long pinned_va_start = entry->va & PAGE_MASK; + unsigned long page_offset = (va_start - pinned_va_start) >> PAGE_SHIFT; + int i; + + for (i = 0; i < nr_pages; i++) + page_list[i] = entry->pages[page_offset + i]; + found = true; + } + mutex_unlock(&proc->lock); + if (found) { + *prepin_proc = proc; + } else { // we are holding a ref count for proc, but we did not find/copy any pages + // so we don't need to hold on to the proc + kref_put(&proc->refcount, ndma_pinned_mem_process_release); + } + } + + return found; +} + +void ndma_pinned_mem_cleanup_process(pid_t pid) +{ + struct neuron_pinned_mem_process *proc; + + mutex_lock(&pinned_mem_htable_lock); + proc = ndma_pinned_mem_find_process_locked(pid); + if (proc) + hash_del(&proc->hash_node); /* prevent new lookups */ + mutex_unlock(&pinned_mem_htable_lock); + + if (proc) { + /* Drop the find ref; the hash_del above means no new refs can be taken */ + kref_put(&proc->refcount, ndma_pinned_mem_process_release); + /* Drop the hash table's initial ref — frees proc when last user is done, when ref count is 0 rb tree is deleted and everything is unpinned */ + kref_put(&proc->refcount, ndma_pinned_mem_process_release); + } +} diff --git a/neuron_dma.h b/neuron_dma.h index eeea14f..0661a49 100644 --- a/neuron_dma.h +++ b/neuron_dma.h @@ -262,4 +262,45 @@ int ndma_zerocopy_submit(struct neuron_device *nd, bool direction, u64 
sequence_num); +/** + * Pre-pinned host memory support + * + * Allows userspace to pin host memory once and reuse it for multiple + * DMA transfers without the overhead of pinning/unpinning on each transfer. + * Uses VA as the lookup key - zerocopy operations auto-detect pinned memory. + */ + +/** + * ndma_pinned_mem_destroy() - Cleanup pinned memory tracking subsystem + */ +void ndma_pinned_mem_destroy(void); + +/** + * ndma_pin_host_memory() - Pin host memory for accelerated DMA operations + * @va: User virtual address to pin + * @size: Size of memory to pin + * @pa_out: If non-NULL, receives the physical address when the pinned pages are contiguous, ~0ULL otherwise + * + * Pins host memory so zerocopy operations auto-detect pinned regions and skip + * per-transfer pinning. Uses fast path (pin_user_pages_fast) first, then falls + * back to slow path (pin_user_pages with mmap_lock) if needed. + * + * Return: 0 on success, -EEXIST if the range overlaps an already pinned region, negative errno on failure + */ +int ndma_pin_host_memory(u64 va, u64 size, u64 *pa_out); + +/** + * ndma_unpin_host_memory() - Unpin previously pinned host memory + * @va: VA that was used in ndma_pin_host_memory (exact match required) + * + * Return: 0 on success, -ENOENT if the calling process has no pinned region starting at @va + */ +int ndma_unpin_host_memory(u64 va); + +/** + * ndma_pinned_mem_cleanup_process() - Cleanup all pinned memory for a process + * @pid: Process ID to cleanup + */ +void ndma_pinned_mem_cleanup_process(pid_t pid); + #endif diff --git a/neuron_fw_io.c b/neuron_fw_io.c index dbf9133..186c390 100644 --- a/neuron_fw_io.c +++ b/neuron_fw_io.c @@ -54,6 +54,21 @@ int fw_io_ecc_read(void *bar0, uint64_t ecc_offset, uint32_t *ecc_err_count) return 0; } +int fw_io_misc_ram_reg_read(void *bar0, u64 offset, u32 *val) +{ + if (offset % 4 != 0) { + pr_err("invalid misc ram offset, needs to be 4 byte aligned\n"); + return -EPROTO; + } + void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + offset; + int ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, val, 1, true); + if (ret) { + pr_err("failed to read misc ram reg 
at offset 0x%llx\n", offset); + return -EIO; + } + return 0; +} + int fw_io_hbm_uecc_repair_state_read(void *bar0, uint32_t *hbm_repair_state) { int ret; @@ -133,6 +148,19 @@ int fw_io_api_version_read(void * bar0, u32 *version) return ret; } +int fw_io_fw_build_read(void *bar0, u32 *fw_build) +{ + int ret; + + void *addr = bar0 + ndhal->ndhal_address_map.bar0_misc_ram_offset + FW_IO_REG_FW_BUILD_OFFSET; + ret = ndhal->ndhal_fw_io.fw_io_read_csr_array(&addr, fw_build, 1, true); + if (ret) { + pr_err("failed to get fw build from the device, ret = %d\n", ret); + } + + return ret; +} + int fw_io_server_info_read(void *bar0, int *server_id, int * rack_id) { int ret; @@ -385,7 +413,7 @@ int fw_io_execute_request(struct fw_io_ctx *ctx, u8 command_id, const u8 *req, u goto done; } ctx->fw_io_err_count++; - pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, + pr_err("seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, ctx->response->response_hdr.hdr.error_code); // if we get an unsupported command response, don't retry if (ctx->response->response_hdr.hdr.error_code == FW_IO_UNKNOWN_COMMAND) { @@ -413,13 +441,13 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re ret = fw_io_api_version_read(ctx->bar0, &api_version_num); - if ((ret != 0) || (api_version_num < FW_IO_NEW_READLESS_READ_MIN_API_VERSION)) { + if ((ret != 0) || (api_version_num < ndhal->ndhal_fw_io.new_readless_read_min_api_version)) { pr_info_once("Firmware version %d, using legacy Firmware/Runtime comm framework", api_version_num); return -ENOTSUPP; } mutex_lock(&ctx->lock); - + ret = -EIO; u32 retry_count = (command_id < FW_IO_CMD_MAX) ? 
fw_io_cmd_retry_tbl[command_id] : FW_IO_RD_RETRY; for (i=0; i < retry_count; i++){ if (++ctx->next_seq_num == 0) @@ -463,6 +491,7 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re if (trigger) { if (command_id != FW_IO_CMD_POST_TO_CW) pr_err("seq: %u, cmd: %u timed out\n", ctx->next_seq_num, command_id); + ret = -ETIMEDOUT; continue; } @@ -473,6 +502,7 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re if (resp_header.hdr.sequence_number != ctx->next_seq_num) { if (command_id != FW_IO_CMD_POST_TO_CW) pr_err("seq: %u, cmd: %u seq mismatch\n", ctx->next_seq_num, command_id); + ret = -EPROTO; continue; } @@ -496,8 +526,8 @@ int fw_io_execute_request_new(struct fw_io_ctx *ctx, u8 command_id, const u8 *re } ctx->fw_io_err_count++; - pr_err(KERN_ERR "seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code); - ret = -1; + pr_err("seq: %u, cmd: %u failed %u\n", ctx->next_seq_num, command_id, resp_header.hdr.error_code); + ret = -EIO; if (resp_header.hdr.error_code == FW_IO_UNKNOWN_COMMAND) { break; } @@ -775,7 +805,7 @@ void fw_io_destroy(struct fw_io_ctx *ctx) kfree(ctx); } -static inline uint32_t uncorrectable_ecc_err_count(uint32_t api_version, uint32_t ecc_err_count) { +static inline uint32_t unrepairable_ecc_err_count(uint32_t api_version, uint32_t ecc_err_count) { // API Version<6: bitfield[15:0] Uncorrectable Errors // API Version>=6: bitfield[15:12] Uncorrectable Errors return (api_version >= 6) ? ((ecc_err_count >> 12) & 0xf) : (ecc_err_count & 0xffff); @@ -787,8 +817,8 @@ static inline uint32_t repairable_ecc_err_count(uint32_t api_version, uint32_t e return (api_version >= 6) ? 
(ecc_err_count & 0xfff) : 0; } -void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *ue_ecc_count, uint32_t *repairable_ecc_count) { - uint32_t total_uncorrected_ecc_err_count = 0; +void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *unrepairable_ecc_count, uint32_t *repairable_ecc_count) { + uint32_t total_unrepairable_ecc_err_count = 0; uint32_t total_repairable_ecc_err_count = 0; uint32_t channel = 0; uint32_t ecc_err_count = 0; @@ -804,11 +834,11 @@ void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *ue_ecc_count, uint32_t if (ret) { pr_err("sysfs failed to read ECC HBM%u error from FWIO\n", channel); } else if (ecc_err_count != 0xdeadbeef) { - total_uncorrected_ecc_err_count += uncorrectable_ecc_err_count(api_version, ecc_err_count); + total_unrepairable_ecc_err_count += unrepairable_ecc_err_count(api_version, ecc_err_count); total_repairable_ecc_err_count += repairable_ecc_err_count(api_version, ecc_err_count); } } - *ue_ecc_count = total_uncorrected_ecc_err_count; + *unrepairable_ecc_count = total_unrepairable_ecc_err_count; *repairable_ecc_count = total_repairable_ecc_err_count; } diff --git a/neuron_fw_io.h b/neuron_fw_io.h index 83a5709..ac91b98 100644 --- a/neuron_fw_io.h +++ b/neuron_fw_io.h @@ -174,8 +174,8 @@ enum { // All devices will have the D0 offset. Devices with two dice will also have the D1 offset. 
FW_IO_REG_POWER_UTIL_D0_OFFSET = 0x54, // 21 * 4 bytes FW_IO_REG_POWER_UTIL_D1_OFFSET = 0x58, // 22 * 4 bytes - FW_IO_REG_HBM_REPAIR_STATE_OFFSET = 0x64, // 25 * 4 bytes + FW_IO_REG_FW_BUILD_OFFSET = 0x74, // 29 * 4 bytes // FW_IO_REG_RESERVATION_ID_HI = 0x80, // 32 * 4 bytes @@ -273,10 +273,6 @@ struct fw_io_ctx { // max number of registers can be read in single function call #define FW_IO_MAX_READLESS_READ_REGISTER_COUNT 100 -// Min Firmware API version for new readless read framework -#define FW_IO_NEW_READLESS_READ_MIN_API_VERSION 7 -#define FW_IO_POWER_MIN_API_VERSION 3 - /** * fw_io_register_read_region - Read a BAR region @@ -467,6 +463,14 @@ int fw_io_device_power_read(void *bar0, u32 *power, unsigned die); */ int fw_io_api_version_read(void * bar0, u32 *version); +/** + * fw_io_fw_build_read() - Read the firmware build number + * @param bar - from bar + * @param fw_build - output firmware build number + * @return 0 on success. + */ +int fw_io_fw_build_read(void *bar0, u32 *fw_build); + /** * fw_io_device_id_write() - Read device id * @param bar - to bar @@ -497,6 +501,16 @@ u64 fw_io_get_err_count(struct fw_io_ctx *ctx); */ int fw_io_ecc_read(void *bar0, uint64_t ecc_offset, uint32_t *ecc_err_count); +/** + * fw_io_misc_ram_reg_read() - Read a single 32-bit misc RAM register by byte offset + * + * @param bar0: mapped BAR0 base + * @param offset: byte offset of the register within the misc RAM block (e.g., FW_IO_REG_*_OFFSET) + * @param val: output register value + * @return 0 on success + */ +int fw_io_misc_ram_reg_read(void *bar0, u64 offset, u32 *val); + /** * fw_io_serial_number_read() - Read serial number * @@ -509,10 +523,10 @@ int fw_io_serial_number_read(void *bar0, uint64_t *serial_number); /** * fw_io_get_total_ecc_err_counts() - Get UE ecc error count * @param bar0: from bar - * @param ue_ecc_count: Pointer to the ue counter - * @param repairable_err_count: Pointer to the repairable counter + * @param unrepairable_ecc_count: Pointer to the 
unrepairable ue counter + * @param repairable_ecc_count: Pointer to the repairable ue counter + */ -void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *ue_ecc_count, uint32_t *repairable_ecc_count); +void fw_io_get_total_ecc_err_counts(void *bar0, uint32_t *unrepairable_ecc_count, uint32_t *repairable_ecc_count); /** * fw_io_hbm_uecc_repair_state_read() - Get HBM UE ecc repair state diff --git a/neuron_ioctl.h b/neuron_ioctl.h index b20170b..fc0c342 100644 --- a/neuron_ioctl.h +++ b/neuron_ioctl.h @@ -61,6 +61,24 @@ struct neuron_ioctl_mem_alloc_v2_mem_type64 { __u32 pad; // [dummy] used to descriminate between ioctl version }; +/* + * Extension of neuron_ioctl_mem_alloc_v2_mem_type64 that also returns PA. + * The driver detects this variant by its larger _IOC_SIZE, saving a + * separate NEURON_IOCTL_MEM_GET_PA ioctl call. + */ +struct neuron_ioctl_mem_alloc_v2_mem_type64_pa { + __u64 size; // [in] Allocation size + __u64 align; // [in] alignment + __u32 host_memory; // [in] If true allocates from host memory; else allocates from device memory + __u32 dram_channel; // [in] DRAM channel in device memory + __u32 dram_region; // [in] DRAM region in device memory + __u32 nc_id; // [in] NeuronCore id(valid only if location is device) + __u32 mem_type; // [in] type of allocation + __u64 *mem_handle; // [out] Allocated memory handle would be stored here. + __u32 pad; // [dummy] used to discriminate between ioctl version + __u64 pa; // [out] Physical address of the allocated memory +}; + struct neuron_ioctl_device_init { /* Splits DRAM in the device into smaller regions. * This improves performance of DDR by allowing parallel DMA using different regions. 
@@ -265,7 +283,7 @@ struct neuron_ioctl_dma_queue_init { __u64 tx_handle; // [in] mem handle for the tx ring __u64 rx_handle; // [in] mem handle for the rx ring __u64 rxc_handle; // [in] mem handle for the rxc ring - __u32 axi_port; // [in] axi port + __u32 axi_port_unused; // unused }; #define MAX_DMA_QUEUE_INIT_BATCH 256 @@ -550,7 +568,7 @@ struct neuron_ioctl_host_device_id_to_rid_map { struct neuron_ioctl_hbm_scrub_start { __u32 nc_id; __u32 hbm_index; - __u32 axi_port; + __u32 axi_port_unused; __u32 init_val; }; @@ -652,6 +670,20 @@ struct neuron_ioctl_get_async_h2t_dma_compl_queues { } compl_queue_info[16]; }; +/** + * Pre-pinned host memory support + * Allows pinning host memory once and reusing for multiple DMA transfers. + * Uses VA as the lookup key - no handles exposed to userspace. + */ +struct neuron_ioctl_host_mem_pin { + __u64 va; // [in] User virtual address to pin + __u64 size; // [in] Size of memory to pin + __u64 pa; // [out] Physical address if contiguous, ~0ULL if not +}; + +struct neuron_ioctl_host_mem_unpin { + __u64 va; // [in] VA to unpin (must match exact VA from pin) +}; #define NEURON_IOCTL_BASE 'N' @@ -785,6 +817,7 @@ struct neuron_ioctl_get_async_h2t_dma_compl_queues { #define NEURON_IOCTL_MEM_ALLOC_V2 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2 *) // V2 here refers to neuron 2.x, not arch type #define NEURON_IOCTL_MEM_ALLOC_V2MT _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type) // just V2 with additional field mem_type #define NEURON_IOCTL_MEM_ALLOC_V2MT64 _IOR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64) // V2 + mem_type + pad +#define NEURON_IOCTL_MEM_ALLOC_V2MT64_PA _IOWR(NEURON_IOCTL_BASE, 102, struct neuron_ioctl_mem_alloc_v2_mem_type64_pa) // V2MT64 + pa output /** Resets the requested NC (-1 for full device) */ #define NEURON_IOCTL_NC_RESET _IOR(NEURON_IOCTL_BASE, 103, struct neuron_ioctl_device_reset *) @@ -873,4 +906,8 @@ struct 
neuron_ioctl_get_async_h2t_dma_compl_queues { #define NEURON_IOCTL_GET_ASYNC_H2T_DMA_COMPL_QUEUES _IOWR(NEURON_IOCTL_BASE, 135, struct neuron_ioctl_get_async_h2t_dma_compl_queues) +/** Pre-pinned host memory operations - zerocopy will auto-detect pinned memory */ +#define NEURON_IOCTL_HOST_MEM_PIN _IOWR(NEURON_IOCTL_BASE, 136, struct neuron_ioctl_host_mem_pin) +#define NEURON_IOCTL_HOST_MEM_UNPIN _IOW(NEURON_IOCTL_BASE, 137, struct neuron_ioctl_host_mem_unpin) + #endif diff --git a/neuron_metrics.c b/neuron_metrics.c index 65185fb..a3a8914 100644 --- a/neuron_metrics.c +++ b/neuron_metrics.c @@ -19,6 +19,7 @@ #include "neuron_device.h" #include "neuron_dhal.h" #include "neuron_power.h" +#include "neuron_sysfs_metrics.h" unsigned int nmetric_metric_post_delay = 150000; // milliseconds unsigned int nmetric_metric_sample_delay = 50; // milliseconds. @@ -1058,6 +1059,8 @@ static int nmetric_thread_fn(void *arg) u64 last_metric_post_time; u64 start_jiffies = jiffies; u64 current_slow_tick; + u64 last_health_tick_jiffies = jiffies; + const u64 health_tick_interval_jiffies = msecs_to_jiffies(60 * 1000); // health_status cache refresh cadence u8 tick_budget = 0; // how many ticks can be posted in a certain iteration of the loop // initialize all aggregation buffers @@ -1075,9 +1078,6 @@ static int nmetric_thread_fn(void *arg) post_delay_in_jiffies = msecs_to_jiffies(nmetric_metric_post_delay); last_metric_post_time = jiffies; - pr_info("Starting metrics thread, sample_delay_in_jiffies is %llu, post delay in ms is %u, timer rate = %d, \n", - sample_delay_in_jiffies, nmetric_metric_post_delay, HZ); - // metrics are only sent once at rate specified by module param, new metric data may be saved without being immediately sent while (!kthread_should_stop() && nd->metrics.neuron_aggregation.state != NMETRIC_STATE_STOPPED) { long wait_return; @@ -1098,6 +1098,12 @@ static int nmetric_thread_fn(void *arg) // There are some metrics that we sample at a relatively higher frequency. 
Do that here. nmetric_sample_high_freq(nd); + // Refresh health_status cached sysfs values + if ((jiffies - last_health_tick_jiffies) >= health_tick_interval_jiffies) { + nsysfsmetric_health_status_tick(nd); + last_health_tick_jiffies = jiffies; + } + // For the slower metrics, we want to log once every post_delay_in_jiffies jiffies. // We track this by keeping track of the number of intervals since this thread started // up so that we don't introduce drift due to the latency of other loop operations. diff --git a/neuron_mmap.c b/neuron_mmap.c index 6a7dda8..a4faf63 100644 --- a/neuron_mmap.c +++ b/neuron_mmap.c @@ -8,7 +8,9 @@ #include #include +#include #include "neuron_mmap.h" +#include "neuron_p2p.h" #include "neuron_pci.h" #include "neuron_device.h" #include "neuron_dhal.h" @@ -279,8 +281,10 @@ static struct mem_chunk *nmmap_get_mc(struct neuron_device *nd, struct vm_area_s * memchunk boundaries. */ if (mc->size != size && mc->alloc_type != NEURON_MEMALLOC_TYPE_CONTIGUOUS_SCRATCHPAD_DEVICE) { - pr_err("nd%d: partial mmap of mc not supported(%llx != %llx)\n", nd->device_index, - mc->size, size); + if (nmap_dm_special_resource_addr_valid(offset, size, NULL, NULL, NULL)) { + pr_err("nd%d: partial mmap of mc not supported(%llx != %llx)\n", nd->device_index, + mc->size, size); + } return NULL; } else if (mc->alloc_type == NEURON_MEMALLOC_TYPE_CONTIGUOUS_SCRATCHPAD_DEVICE) { if (mc->pa + size > mc->mp->main_pool_end_addr) { @@ -308,6 +312,7 @@ static const struct vm_operations_struct nmmap_dm_vm_ops = { static int nmmap_dm(struct neuron_device *nd, struct vm_area_struct *vma, u64 *bar4_offset) { + int ret; u64 start, size, offset; if (!nd->npdev.bar4_pa) { @@ -317,7 +322,11 @@ static int nmmap_dm(struct neuron_device *nd, struct vm_area_struct *vma, u64 *b start = vma->vm_pgoff << PAGE_SHIFT; size = vma->vm_end - vma->vm_start; - ndhal->ndhal_mmap.mmap_get_bar4_offset(start, size, &offset); + ret = ndhal->ndhal_mmap.mmap_get_bar4_offset(start, size, &offset); + if 
(unlikely(ret)) { + pr_err("Failed to map address 0x%llx to BAR4\n", start); + return ret; + } if (bar4_offset) *bar4_offset = offset; @@ -509,4 +518,28 @@ int nmmap_get_va_placement(void *va, int *device_index, int *hbm_index) return -ENXIO; } +/** + * nmmap_get_unmapped_area() - Return a huge page aligned VA for device mmaps whose + * offset and size are both huge page aligned. + */ +unsigned long nmmap_get_unmapped_area(struct file *filep, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long offset = pgoff << PAGE_SHIFT; + unsigned long aligned; + + if ((flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) || + (!IS_ALIGNED(offset, NEURON_P2P_HUGE_PAGE_SZ)) || + (!IS_ALIGNED(len, NEURON_P2P_HUGE_PAGE_SZ)) || + (len == 0 || len > ULONG_MAX - NEURON_P2P_HUGE_PAGE_SZ)) { + return nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags); + } + aligned = nmmap_kern_get_unmapped_area(filep, addr, len + NEURON_P2P_HUGE_PAGE_SZ, pgoff, flags); + if (IS_ERR_VALUE(aligned)) { + return nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags); + } + + return ALIGN(aligned, NEURON_P2P_HUGE_PAGE_SZ); +} diff --git a/neuron_mmap.h b/neuron_mmap.h index 190e753..a0cb3e3 100644 --- a/neuron_mmap.h +++ b/neuron_mmap.h @@ -18,6 +18,18 @@ #define RHEL_RELEASE_VERSION(a,b) 1 #endif +/* + * Linux 6.10 removed get_unmapped_area from mm_struct and replaced it + * with the standalone mm_get_unmapped_area() function. 
+ */ +#if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE >= KERNEL_VERSION(6, 10, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 5))) +#define nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags) \ + mm_get_unmapped_area(current->mm, filep, addr, len, pgoff, flags) +#else +#define nmmap_kern_get_unmapped_area(filep, addr, len, pgoff, flags) \ + current->mm->get_unmapped_area(filep, addr, len, pgoff, flags) +#endif + #if (!defined(RHEL_RELEASE_CODE) && (LINUX_VERSION_CODE < KERNEL_VERSION(6, 3, 0))) || (defined(RHEL_RELEASE_CODE) && (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9, 5))) static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { @@ -148,4 +160,21 @@ struct mem_chunk *nmmap_get_mc_from_pa(struct neuron_device *nd, phys_addr_t pa) */ int nmmap_get_va_placement(void *va, int *device_index, int *hbm_index); + +/** + * nmmap_get_unmapped_area() - Return a huge page aligned VA for device mmaps + * whose offset and size are both huge page aligned. This enables EFA P2P MR + * registration to use 2MB pages instead of 4KB pages. 
+ * + * @filep: file pointer + * @addr: address hint from userspace + * @len: mapping length + * @pgoff: page offset (device BAR offset) + * @flags: mmap flags + * + * Return: unmapped area address, or error value + */ +unsigned long nmmap_get_unmapped_area(struct file *filep, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); #endif diff --git a/neuron_module.c b/neuron_module.c index 56713ba..f397aa9 100644 --- a/neuron_module.c +++ b/neuron_module.c @@ -17,14 +17,16 @@ #include "neuron_trace.h" #include "neuron_cdev.h" #include "neuron_pci.h" +#include "neuron_dma.h" +#include "neuron_test.h" -MODULE_DESCRIPTION("Neuron Driver, built from SHA: 1c7ed9bd14936635773b5a01777882804ee8ea6e"); +MODULE_DESCRIPTION("Neuron Driver, built from SHA: 38e99b1fb8df603ee4109868c6d949e30f2c32c8"); MODULE_LICENSE("GPL"); -MODULE_VERSION("2.27.4.0"); +MODULE_VERSION("2.28.0.0"); MODULE_ALIAS("pci:v00001d0fd00007064sv*sd*bc*sc*i*"); -const char driver_version[] = "2.27.4.0"; -const char driver_revision[] = "1c7ed9bd14936635773b5a01777882804ee8ea6e"; +const char driver_version[] = "2.28.0.0"; +const char driver_revision[] = "38e99b1fb8df603ee4109868c6d949e30f2c32c8"; #ifdef CONFIG_FAULT_INJECTION @@ -75,6 +77,7 @@ static int __init neuron_module_init(void) #ifdef CONFIG_FAULT_INJECTION neuron_module_init_debugfs(); #endif + ntest_init(); ret = ncdev_module_init(); if (ret) @@ -92,6 +95,7 @@ static void __exit neuron_module_exit(void) #ifdef CONFIG_FAULT_INJECTION neuron_module_free_debugfs(); #endif + ndma_pinned_mem_destroy(); neuron_pci_module_exit(); ncdev_module_exit(); } diff --git a/neuron_p2p.c b/neuron_p2p.c index be76220..99e3c3f 100644 --- a/neuron_p2p.c +++ b/neuron_p2p.c @@ -17,9 +17,6 @@ #include "neuron_p2p.h" #include "neuron_pci.h" -#define NEURON_P2P_HUGE_PAGE_SZ 0x200000 -#define NEURON_P2P_HUGE_PAGE_SZ_USAGE_THRESHOLD 0x10000000 - /* * Registers the VA with the callback and also returns the PA */ diff --git a/neuron_p2p.h 
b/neuron_p2p.h index ce91df9..915886c 100644 --- a/neuron_p2p.h +++ b/neuron_p2p.h @@ -6,6 +6,9 @@ #ifndef __NEURON_P2P_H__ #define __NEURON_P2P_H__ +#define NEURON_P2P_HUGE_PAGE_SZ 0x200000 +#define NEURON_P2P_HUGE_PAGE_SZ_USAGE_THRESHOLD 0x10000000 + struct neuron_p2p_page_info { u64 physical_address; // PA's that map to the VA (page aligned as defined in va_info) u32 page_count; // page count each page is shift_page_size size diff --git a/neuron_pci.c b/neuron_pci.c index f385b3d..353fa8e 100644 --- a/neuron_pci.c +++ b/neuron_pci.c @@ -199,6 +199,7 @@ static int neuron_pci_device_close(struct neuron_device *nd) fw_io_destroy((struct fw_io_ctx *)nd->fw_io_ctx); nd->fw_io_ctx = NULL; + mutex_destroy(&nd->lock); return 0; } @@ -359,6 +360,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) pci_info(dev, "Can't allocate memory for neuron_device\n"); goto fail_alloc_nd_mem; } + mutex_init(&nd->lock); nmetric_init_driver_metrics(nd); @@ -487,6 +489,7 @@ static int neuron_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) pci_disable_device(dev); fail_dhal_init: fail_enable: + mutex_destroy(&nd->lock); neuron_log_destroy( nd); kvfree(nd); fail_alloc_nd_mem: @@ -502,6 +505,8 @@ static void neuron_pci_remove(struct pci_dev *dev) if (nd == NULL) return; + ndma_h2d_stop_cmpltn_thread(nd); + nr_stop_thread(nd); nmetric_stop_thread(nd); diff --git a/neuron_reset.c b/neuron_reset.c index ff7b3a6..c8a95b4 100644 --- a/neuron_reset.c +++ b/neuron_reset.c @@ -20,6 +20,7 @@ #include "neuron_fw_io.h" #include "neuron_dhal.h" #include "neuron_nq.h" +#include "neuron_test.h" int no_reset = 0; module_param(no_reset, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); @@ -116,6 +117,8 @@ static int nr_reset_thread_fn(void *arg) (nc_map == NEURON_NC_MAP_DEVICE) ? "device" : "TPB", request_iter->request_id);) + ndmar_close_ncs(nd, nc_map); + ret = ndhal->ndhal_reset.nr_initiate_reset(nd, nc_map); if (ret) { char *reason = (ret == -EINTR) ? 
"interrupted by driver unload\n" : "failed\n"; @@ -128,7 +131,7 @@ static int nr_reset_thread_fn(void *arg) // If the reset was successfully initiated the // response we get back is a pass/fail and we don't need to retry. ret = ndhal->ndhal_reset.nr_wait_for_reset_completion(nd); - if (ret) { + if (ret || _ntest_trigger(NEURON_TEST_TRIGGER_RST_FAILURE, nd->device_index)) { nr_call_post_reset_config(nd, nc_map, false); ITER_COAL_REQS(request_iter, first_request, last_request, pr_info("nd%d: reset request %u was initiated, but failed to complete\n", nd->device_index, request_iter->request_id);) @@ -257,9 +260,12 @@ int nr_start_ncs(struct neuron_device *nd, uint32_t nc_map, uint32_t request_id) // perform the driver's reset related activities, then return so // that outside of not resetting HW, everything else will look natural. // - ndmar_init_ncs(nd, NEURON_NC_MAP_DEVICE); + ndmar_close_ncs(nd, nc_map); + ndmar_init_ncs(nd, nc_map); nr_call_post_reset_config(nd, nc_map, true); - nd->device_state = NEURON_DEVICE_STATE_READY; + if (request_id == NEURON_RESET_REQUEST_ALL) { + nd->device_state = NEURON_DEVICE_STATE_READY; + } return 0; } diff --git a/neuron_ring.c b/neuron_ring.c index 280e961..be7e868 100644 --- a/neuron_ring.c +++ b/neuron_ring.c @@ -85,11 +85,9 @@ u32 ndmar_ring_get_desc_count(u32 v) * @eng: dma engine * @qid: dma queue id in the engine for which the mc is being set. 
* @mc: backing memory chunk - * @port: which axi port(0 or 1) to access the DRAM(for performance) * @queue_type: type of the queue(rx, tx, or completion) */ -static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_chunk *mc, u32 port, - enum neuron_dma_queue_type queue_type) +static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_chunk *mc, enum neuron_dma_queue_type queue_type) { struct ndma_queue *queue = ndmar_get_queue(eng, qid); struct ndma_ring *ring = ndmar_get_ring(queue); @@ -102,9 +100,6 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c ring->tx.addr = virt_to_phys(ring->tx.ptr) | ndhal->ndhal_address_map.pci_host_base; } else { ring->tx.addr = mc->pa; - if (port) { - ring->tx.addr |= ndhal->ndhal_address_map.port_1_base; - } } break; case NEURON_DMA_QUEUE_TYPE_RX: @@ -114,9 +109,6 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c ring->rx.addr = virt_to_phys(ring->rx.ptr) | ndhal->ndhal_address_map.pci_host_base; } else { ring->rx.addr = mc->pa; - if (port) { - ring->rx.addr |= ndhal->ndhal_address_map.port_1_base; - } } break; case NEURON_DMA_QUEUE_TYPE_COMPLETION: @@ -127,9 +119,6 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c ring->rxc.addr = virt_to_phys(ring->rxc.ptr) | ndhal->ndhal_address_map.pci_host_base; } else { ring->rxc.addr = mc->pa; - if (port) { - ring->rxc.addr |= ndhal->ndhal_address_map.port_1_base; - } } break; default: @@ -139,7 +128,7 @@ static void ndmar_ring_set_mem_chunk(struct ndma_eng *eng, u32 qid, struct mem_c int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_count, u32 rx_desc_count, struct mem_chunk *tx_mc, struct mem_chunk *rx_mc, - struct mem_chunk *rxc_mc, u32 port, bool allocatable) + struct mem_chunk *rxc_mc, bool allocatable) { int ret = -1; struct ndma_eng *eng; @@ -171,8 +160,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 
eng_id, u32 qid, u32 tx_desc_ ring->qid = qid; ring->h2t_completion_mc = NULL; - trace_dma_queue_init(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc, - port); + trace_dma_queue_init(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc); if (tx_mc) { /* @@ -180,7 +168,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_ ret = -EINVAL; goto done; }*/ - ndmar_ring_set_mem_chunk(eng, qid, tx_mc, port, NEURON_DMA_QUEUE_TYPE_TX); + ndmar_ring_set_mem_chunk(eng, qid, tx_mc, NEURON_DMA_QUEUE_TYPE_TX); } if (rx_mc) { @@ -189,7 +177,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_ ret = -EINVAL; goto done; }*/ - ndmar_ring_set_mem_chunk(eng, qid, rx_mc, port, NEURON_DMA_QUEUE_TYPE_RX); + ndmar_ring_set_mem_chunk(eng, qid, rx_mc, NEURON_DMA_QUEUE_TYPE_RX); } if (rxc_mc) { @@ -197,7 +185,7 @@ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_ ret = -EINVAL; goto done; } - ndmar_ring_set_mem_chunk(eng, qid, rxc_mc, port, NEURON_DMA_QUEUE_TYPE_COMPLETION); + ndmar_ring_set_mem_chunk(eng, qid, rxc_mc, NEURON_DMA_QUEUE_TYPE_COMPLETION); } ret = udma_m2m_init_queue(&eng->udma, qid, eng_id, tx_desc_count, rx_desc_count, allocatable, tx_mc != NULL ? &ring->tx : NULL, @@ -261,7 +249,7 @@ void ndmar_handle_process_exit(struct neuron_device *nd, pid_t pid) continue; } - ret = ndmar_queue_init(nd, eng_id, qid, desc_count, desc_count, mc, mc, NULL, 0, false); + ret = ndmar_queue_init(nd, eng_id, qid, desc_count, desc_count, mc, mc, NULL, false); // ignore the error and continue to reset other queues. 
if (ret) pr_err("nd%d:dma%d:q%d failed to reset (%d)", nd->device_index, eng_id, qid, ret); @@ -380,8 +368,8 @@ static int ndmar_h2t_ring_alloc(struct neuron_device *nd, int nc_id, int qid) goto error; } - ndmar_ring_set_mem_chunk(eng, qid, tx_mc, 0, NEURON_DMA_QUEUE_TYPE_TX); - ndmar_ring_set_mem_chunk(eng, qid, rx_mc, 0, NEURON_DMA_QUEUE_TYPE_RX); + ndmar_ring_set_mem_chunk(eng, qid, tx_mc, NEURON_DMA_QUEUE_TYPE_TX); + ndmar_ring_set_mem_chunk(eng, qid, rx_mc, NEURON_DMA_QUEUE_TYPE_RX); ret = mc_alloc_align(nd, MC_LIFESPAN_DEVICE, sizeof(u32) * 2 * NEURON_DMA_H2T_CTX_HANDLE_CNT, 0, MEM_LOC_HOST, 0, 0, nc_id, NEURON_MEMALLOC_TYPE_NCDEV_HOST, &h2t_completion_mc); if (ret) { diff --git a/neuron_ring.h b/neuron_ring.h index c9d3462..e99ea40 100644 --- a/neuron_ring.h +++ b/neuron_ring.h @@ -6,6 +6,8 @@ #ifndef NEURON_RING_H #define NEURON_RING_H +#include + #include "udma/udma.h" #include "share/neuron_driver_shared.h" @@ -23,6 +25,29 @@ struct neuron_dma_queue_state; struct ndma_eng; struct ndma_ring; +/* + * H2D DMA Completion Thread + * ------------------------- + * one thread per ND. + * It is shared across rings for completion, remote pinning, and submission work. + * + * Async IO only. 
+ * + * @thread: kthread handle + * @wait_queue: wait queue used to sleep/wake the thread + * @nonempty_ctxq_bitmap: bitmap of H2D ctx queues with pending work + * @stop: set to request thread exit + */ +struct ndma_h2d_dma_cmpltn_thread { + struct task_struct *thread; + wait_queue_head_t wait_queue; + atomic64_t nonempty_ctxq_bitmap; + volatile bool stop; +}; + +int ndma_h2d_create_cmpltn_thread(struct neuron_device *nd); +void ndma_h2d_stop_cmpltn_thread(struct neuron_device *nd); + /* * H2D DMA Completion Queue (CQ) * ----------------------------- @@ -284,14 +309,13 @@ int ndmar_eng_set_state(struct neuron_device *nd, int eng_id, u32 state); * @tx_mc: Memory chunk backing TX queue * @rx_mc: Memory chunk backing RX queue * @rxc_mc: Memory chunk backing RX completion queue - * @port: AXI port. * @allocatable: whether new descriptors can be added post queue init * * Return: 0 if queue init succeeds, a negative error code otherwise. */ int ndmar_queue_init(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_count, u32 rx_desc_count, struct mem_chunk *tx_mc, struct mem_chunk *rx_mc, - struct mem_chunk *rxc_mc, u32 port, bool allocatable); + struct mem_chunk *rxc_mc, bool allocatable); /** * ndmar_queue_release() - Release a DMA queue. 
@@ -426,7 +450,7 @@ int ndmar_h2t_ring_release(struct neuron_device *nd, int nc_id, int qid); /** * ndmar_h2t_ring_is_h2t() - return true if this is an h2t ring */ -static inline bool ndmar_h2t_ring_is_h2t(struct ndma_ring *ring) +static inline bool ndmar_h2t_ring_is_h2t(const struct ndma_ring *ring) { return (ring->h2t_completion_mc != NULL); } diff --git a/neuron_sysfs_metrics.c b/neuron_sysfs_metrics.c index fd71ae0..80a4947 100644 --- a/neuron_sysfs_metrics.c +++ b/neuron_sysfs_metrics.c @@ -12,6 +12,7 @@ #include "neuron_device.h" #include "neuron_ds.h" +#include "neuron_fw_io.h" #include "neuron_sysfs_metrics.h" #include "neuron_dhal.h" #include "neuron_power.h" @@ -150,6 +151,31 @@ static const nsysfsmetric_attr_info_t ecc_attrs_info_tbl[] = { }; static const int ecc_attrs_info_tbl_cnt = sizeof(ecc_attrs_info_tbl) / sizeof(nsysfsmetric_attr_info_t); +struct health_status_reg_map { + enum health_status_cache_slot slot; + u64 offset; + bool is_err_metric; +}; + +static const struct health_status_reg_map health_status_reg_tbl[] = { + { HEALTH_STATUS_SLOT_SRAM_ECC, FW_IO_REG_SRAM_ECC_OFFSET, true }, + { HEALTH_STATUS_SLOT_HBM0_ECC, FW_IO_REG_HBM0_ECC_OFFSET, true }, + { HEALTH_STATUS_SLOT_HBM1_ECC, FW_IO_REG_HBM1_ECC_OFFSET, true }, + { HEALTH_STATUS_SLOT_HBM2_ECC, FW_IO_REG_HBM2_ECC_OFFSET, true }, + { HEALTH_STATUS_SLOT_HBM3_ECC, FW_IO_REG_HBM3_ECC_OFFSET, true }, + { HEALTH_STATUS_SLOT_HBM_REPAIR_STATE, FW_IO_REG_HBM_REPAIR_STATE_OFFSET, true }, + { HEALTH_STATUS_SLOT_FW_API_VERSION, FW_IO_REG_API_VERSION_OFFSET, false }, +}; +static const int health_status_reg_tbl_cnt = sizeof(health_status_reg_tbl) / sizeof(health_status_reg_tbl[0]); + +static const nsysfsmetric_attr_info_t health_status_attrs_info_tbl[] = { + ATTR_INFO("hbm_ecc_err_count", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HBM_UE_COUNT), CACHED_VALUES), + ATTR_INFO("repairable_hbm_ecc_err_count", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_REPAIRABLE_HBM_UE_COUNT), CACHED_VALUES), 
+ ATTR_INFO("sram_ecc_err_count", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_SRAM_UE_COUNT), CACHED_VALUES), + ATTR_INFO("hw_error_event", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HW_ERROR_EVENT), CACHED_VALUES), +}; +static const int health_status_attrs_info_tbl_cnt = sizeof(health_status_attrs_info_tbl) / sizeof(nsysfsmetric_attr_info_t); + static const nsysfsmetric_attr_info_t root_arch_node_attrs_info_tbl[] = { ATTR_INFO("arch_type", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_OTHER_NEURON_ARCH_TYPE), OTHER), ATTR_INFO("instance_type", NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_OTHER_NEURON_INSTANCE_TYPE), OTHER), @@ -399,6 +425,46 @@ static ssize_t nsysfsmetric_show_nrt_other_metrics(struct nsysfsmetric_metrics * return len; } +static ssize_t nsysfsmetric_show_cached_values_metrics(struct nsysfsmetric_metrics *sysfs_metrics, + struct metric_attribute *attr, + char *buf) +{ + u32 value = 0; + + switch (attr->metric_id) { + case NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_SRAM_UE_COUNT): + value = READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_SRAM_ECC]); + return nsysfsmetric_sysfs_emit(buf, "%u\n", value & 0xffff); // Lower 16 bits + case NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HBM_UE_COUNT): + // TODO: Use cached HEALTH_STATUS_SLOT_FW_API_VERSION + // For now, safe to assume api_version >= 6 + value = 0; + value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM0_ECC])) >> 12) & 0xf); + value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM1_ECC])) >> 12) & 0xf); + value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM2_ECC])) >> 12) & 0xf); + value += (((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM3_ECC])) >> 12) & 0xf); + if ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM_REPAIR_STATE]) & 0x3) == 0x2) + value +=1; + + return nsysfsmetric_sysfs_emit(buf, "%u\n", value); + case 
NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_REPAIRABLE_HBM_UE_COUNT): + value = 0; + value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM0_ECC])) & 0xfff); + value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM1_ECC])) & 0xfff); + value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM2_ECC])) & 0xfff); + value += ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM3_ECC])) & 0xfff); + if ((READ_ONCE(sysfs_metrics->cached_health_regs[HEALTH_STATUS_SLOT_HBM_REPAIR_STATE]) & 0x3) == 0x1) + value +=1; + + return nsysfsmetric_sysfs_emit(buf, "%u\n", value); + case NON_NDS_ID_TO_SYSFS_METRIC_ID(NON_NDS_HEALTH_STATUS_HW_ERROR_EVENT): + return nsysfsmetric_sysfs_emit(buf, "%u\n", READ_ONCE(sysfs_metrics->hw_error_event_count)); + default: + pr_err("cannot show sysfs metrics for metric_id=%d of attr_type CACHED_VALUES\n", attr->metric_id); + return 0; + } +} + static ssize_t nsysfsmetric_set_nrt_total_metrics(struct nsysfsmetric_metrics *sysfs_metrics, struct metric_attribute *attr, const char *buf, size_t size) @@ -504,6 +570,11 @@ static struct metric_attribute *nsysfsmetric_create_attr(const char *metric_name metric_attr->show = nsysfsmetric_show_nrt_other_metrics; metric_attr->store = nsysfsmetric_set_nrt_other_metrics; break; + case CACHED_VALUES: + metric_attr->attr.mode = VERIFY_OCTAL_PERMISSIONS(S_IRUGO); + metric_attr->show = nsysfsmetric_show_cached_values_metrics; + metric_attr->store = NULL; + break; default: metric_attr->show = NULL; metric_attr->store = NULL; @@ -934,6 +1005,14 @@ int nsysfsmetric_register(struct neuron_device *nd, struct kobject *neuron_devic return ret; } + // neuron{0, 1, ...}/stats/hardware/health_status/ + if (ndhal->ndhal_sysfs_metrics.health_status_enabled && metrics->hardware_node) { + ret = nsysfsmetric_add_health_status_nodes(metrics, metrics->hardware_node); + if (ret) { + return ret; + } + } + // neuron{0, 1, ...}/neuron_core{0, 1, 
...}/ ret = nsysfsmetric_init_and_add_nc_default_nodes(nd, &metrics->root); if (ret) { @@ -963,6 +1042,8 @@ static void nsysfsmetric_destroy_counters(struct nsysfsmetric_metrics *metrics) memset(metrics->nrt_metrics, 0, sizeof(metrics->nrt_metrics)); memset(metrics->nrt_nd_metrics, 0, sizeof(metrics->nrt_nd_metrics)); memset(metrics->dev_metrics, 0, sizeof(metrics->dev_metrics)); + metrics->hardware_node = NULL; + metrics->health_status_node = NULL; } static void nsysfsmetric_destroy_nodes(struct nsysfsmetric_node *node, bool acquire_lock) @@ -997,6 +1078,53 @@ void nsysfsmetric_destroy(struct neuron_device *nd) mutex_unlock(&nd->sysfs_metrics.root.lock); } +/* + * Reads a configured subset of misc RAM registers and updates the cache exposed via + * stats/hardware/health_status/. Bumps hw_error_event_count and issues sysfs_notify on any change. + * Invoked periodically from the metrics thread. + */ +void nsysfsmetric_health_status_tick(struct neuron_device *nd) +{ + struct nsysfsmetric_metrics *metrics = &nd->sysfs_metrics; + int i; + bool changed = false; + + if (!ndhal->ndhal_sysfs_metrics.health_status_enabled) + return; + + for (i = 0; i < health_status_reg_tbl_cnt; i++) { + u32 val; + int ret = fw_io_misc_ram_reg_read(nd->npdev.bar0, health_status_reg_tbl[i].offset, &val); + if (ret) + continue; // TODO: figure out how to communicate to sysfs readers that read failed + + if (val != READ_ONCE(metrics->cached_health_regs[health_status_reg_tbl[i].slot])) { + WRITE_ONCE(metrics->cached_health_regs[health_status_reg_tbl[i].slot], val); + if (health_status_reg_tbl[i].is_err_metric) { + changed = true; + } + } + } + + if (changed && metrics->health_status_node) { + // This function is the only writer, don't need atomic update, just volatile (READ_ONCE/WRITE_ONCE) + WRITE_ONCE(metrics->hw_error_event_count, READ_ONCE(metrics->hw_error_event_count) + 1); + sysfs_notify(&metrics->health_status_node->kobj, NULL, "hw_error_event"); + } +} + +int 
nsysfsmetric_add_health_status_nodes(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *hardware_node) +{ + struct nsysfsmetric_node *node = nsysfsmetric_init_and_add_one_node(metrics, hardware_node, + "health_status", false, -1, health_status_attrs_info_tbl_cnt, health_status_attrs_info_tbl); + if (!node) { + pr_err("failed to add health_status node under stats/hardware\n"); + return -1; + } + metrics->health_status_node = node; + return 0; +} + int nsysfsmetric_init_and_add_dynamic_counter_nodes(struct neuron_device *nd, uint64_t ds_val) { int ret = 0; diff --git a/neuron_sysfs_metrics.h b/neuron_sysfs_metrics.h index 27b30c8..9c2445c 100644 --- a/neuron_sysfs_metrics.h +++ b/neuron_sysfs_metrics.h @@ -26,6 +26,7 @@ enum nsysfsmetric_attr_type { PRESENT, // counter value at the current window PEAK, // max counter value OTHER, // all other types besides TOTAL, PRESENT, and PEAK + CACHED_VALUES, // cached value updated out-of-band (e.g., by a polling thread) }; enum nsysfsmetric_metric_id_category { @@ -73,6 +74,10 @@ enum nsysfsmetric_non_nds_ids { // The metrics needed by sysfs metrics but not s NON_NDS_OTHER_NOTIFY_DELAY, NON_NDS_OTHER_SERIAL_NUMBER, NON_NDS_OTHER_POWER_UTILIZATION, + NON_NDS_HEALTH_STATUS_SRAM_UE_COUNT, + NON_NDS_HEALTH_STATUS_HBM_UE_COUNT, + NON_NDS_HEALTH_STATUS_REPAIRABLE_HBM_UE_COUNT, + NON_NDS_HEALTH_STATUS_HW_ERROR_EVENT, }; struct neuron_device; @@ -83,6 +88,19 @@ struct sysfs_mem_thread { volatile bool stop; // if cleared, thread would exit the loop }; +// Cache slot identifiers for misc RAM registers whose values are exposed under +// stats/hardware/health_status/. Add a new value here when caching a new register. 
+enum health_status_cache_slot { + HEALTH_STATUS_SLOT_SRAM_ECC, + HEALTH_STATUS_SLOT_HBM0_ECC, + HEALTH_STATUS_SLOT_HBM1_ECC, + HEALTH_STATUS_SLOT_HBM2_ECC, + HEALTH_STATUS_SLOT_HBM3_ECC, + HEALTH_STATUS_SLOT_HBM_REPAIR_STATE, + HEALTH_STATUS_SLOT_FW_API_VERSION, + HEALTH_STATUS_SLOT_COUNT, +}; + struct nsysfsmetric_counter { struct nsysfsmetric_node *node; // used for sysfs_notify u64 total; @@ -107,6 +125,14 @@ struct nsysfsmetric_metrics { // per neuron_device // nc_id should be -1 to use nrt_nd_metrics, and should be a valid neuron core ID to use nrt_metrics struct nsysfsmetric_counter dev_metrics[MAX_METRIC_ID]; // TODO: the device metrics uint64_t bitmap; // store the dynamic metrics to be added + + // Cached misc RAM register values for stats/hardware/health_status/. + // Updated periodically from the metrics thread; sysfs reads return cached values. + // Indexed by enum health_status_cache_slot. + u32 cached_health_regs[HEALTH_STATUS_SLOT_COUNT]; + u32 hw_error_event_count; + struct nsysfsmetric_node *hardware_node; // stats/hardware/; cached so health_status can attach under it + struct nsysfsmetric_node *health_status_node; // target for sysfs_notify on hw_error_event }; typedef struct nsysfsmetric_attr_info { @@ -220,5 +246,14 @@ void nsysfsmetric_set_counter(struct neuron_device *nd, int metric_id_category, */ void nsysfsmetric_inc_reset_fail_count(struct neuron_device *nd); +/** + * nsysfsmetric_add_health_status_nodes() - add stats/hardware/health_status/ subtree under the hardware node + */ +int nsysfsmetric_add_health_status_nodes(struct nsysfsmetric_metrics *metrics, struct nsysfsmetric_node *hardware_node); + +/** + * nsysfsmetric_health_status_tick() - perform a single health_status cache refresh; invoked from the metrics thread + */ +void nsysfsmetric_health_status_tick(struct neuron_device *nd); #endif diff --git a/neuron_test.c b/neuron_test.c new file mode 100644 index 0000000..8ed9b5f --- /dev/null +++ b/neuron_test.c @@ -0,0 +1,80 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026, Amazon.com, Inc. or its affiliates. All Rights Reserved + */ + +/** Neuron driver test module + * + * the purpose of this module is to provide error injection functionality + * for testing. It should be lightweight, simple and have little to no + * knowledge of the driver's operation. It requires sysadmin caps + */ + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include "neuron_test.h" + +#define _NEURON_TT_AT_LOAD_VALBITS 16 +#define _NEURON_TT_AT_LOAD_VALSHIFT 0 +#define _NEURON_TT_AT_LOAD_VALMASK ((1 << _NEURON_TT_AT_LOAD_VALBITS)-1) +#define _NEURON_TT_AT_LOAD_VAL(val) (((val) >> _NEURON_TT_AT_LOAD_VALSHIFT) & _NEURON_TT_AT_LOAD_VALMASK) + +#define _NEURON_TT_AT_LOAD_DATABITS 16 +#define _NEURON_TT_AT_LOAD_DATASHIFT 16 +#define _NEURON_TT_AT_LOAD_DATAMASK ((1 << _NEURON_TT_AT_LOAD_DATABITS)-1) +#define _NEURON_TT_AT_LOAD_DATA(data) (((data) >> _NEURON_TT_AT_LOAD_DATASHIFT) & _NEURON_TT_AT_LOAD_DATAMASK) + +int neuron_test_trigger_ena = 0; +int neuron_test_trigger_at_load = 0; // loadtime testing trigger (16 bits of trigger value, 16 bits trigger data) + +module_param(neuron_test_trigger_ena, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(neuron_test_trigger_ena, "test trigger enable"); + +module_param(neuron_test_trigger_at_load, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(neuron_test_trigger_at_load, "test trigger at load time"); + +DECLARE_BITMAP(neuron_test_trigger_bitmap, NEURON_TEST_TRIGGER_MAX); + +// test trigger data, interpreted per trigger. Right now just u64, but we could get more sophisticated +// if needed. 
+// +static u64 neuron_test_trigger_data[NEURON_TEST_TRIGGER_MAX] = {0}; + +void ntest_init(void) +{ + bitmap_zero(neuron_test_trigger_bitmap, NEURON_TEST_TRIGGER_MAX); + + // set any load time test triggers + // + if (neuron_test_trigger_at_load) { + int val = _NEURON_TT_AT_LOAD_VAL(neuron_test_trigger_at_load); + if (val < NEURON_TEST_TRIGGER_MAX) { + bitmap_set(neuron_test_trigger_bitmap, val, 1); + neuron_test_trigger_data[val] = _NEURON_TT_AT_LOAD_DATA(neuron_test_trigger_at_load); + } + } +} + +//inline int _ntest_trigger(enum neuron_test_trigger trigger, void * trigger_data) {} + + +int ntest_trigger(enum neuron_test_trigger trigger, u64 trigger_data) +{ + switch (trigger) { + case NEURON_TEST_TRIGGER_RST_FAILURE: + if (test_bit(NEURON_TEST_TRIGGER_RST_FAILURE, neuron_test_trigger_bitmap) && + (trigger_data == neuron_test_trigger_data[NEURON_TEST_TRIGGER_RST_FAILURE])) { + return 1; + } + break; + default: + break; + } + return 0; +} diff --git a/neuron_test.h b/neuron_test.h new file mode 100644 index 0000000..51bdb31 --- /dev/null +++ b/neuron_test.h @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026, Amazon.com, Inc. or its affiliates. 
All Rights Reserved + */ + +#ifndef NEURON_TEST_H +#define NEURON_TEST_H + +#include + +enum neuron_test_trigger { + NEURON_TEST_TRIGGER_RST_FAILURE = 0, + NEURON_TEST_TRIGGER_MAX = 1, +}; + +extern int neuron_test_trigger_ena; + +void ntest_init(void); + +int ntest_trigger(enum neuron_test_trigger trigger, u64 trigger_data); + +static inline int _ntest_trigger(enum neuron_test_trigger trigger, u64 trigger_data) +{ + if (!neuron_test_trigger_ena) { + return 0; + } + return ntest_trigger(trigger, trigger_data); +} +#endif diff --git a/neuron_trace.h b/neuron_trace.h index e40cb8e..0782f2d 100644 --- a/neuron_trace.h +++ b/neuron_trace.h @@ -33,8 +33,8 @@ TRACE_EVENT(dma_engine_init, TRACE_EVENT(dma_queue_init, TP_PROTO(struct neuron_device *nd, u32 eng_id, u32 qid, u32 tx_desc_count, u32 rx_desc_count, struct mem_chunk *tx_mc, - struct mem_chunk *rx_mc, struct mem_chunk *rxc_mc, u32 port), - TP_ARGS(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc, port), + struct mem_chunk *rx_mc, struct mem_chunk *rxc_mc), + TP_ARGS(nd, eng_id, qid, tx_desc_count, rx_desc_count, tx_mc, rx_mc, rxc_mc), TP_STRUCT__entry( __field(u32, device_index) __field(u32, eng_id) @@ -44,7 +44,6 @@ TRACE_EVENT(dma_queue_init, __field(struct mem_chunk *, tx_mc) __field(struct mem_chunk *, rx_mc) __field(struct mem_chunk *, rxc_mc) - __field(u32, port) ), TP_fast_assign( __entry->device_index = nd->device_index; @@ -55,9 +54,8 @@ TRACE_EVENT(dma_queue_init, __entry->tx_mc = tx_mc; __entry->rx_mc = rx_mc; __entry->rxc_mc = rxc_mc; - __entry->port = port; ), - TP_printk("nd%d eng%d q%d tx_count %d rx_count %d tx %llx rx %llx rxc %llx port %d", + TP_printk("nd%d eng%d q%d tx_count %d rx_count %d tx %llx rx %llx rxc %llx", __entry->device_index, __entry->eng_id, __entry->qid, @@ -65,8 +63,7 @@ TRACE_EVENT(dma_queue_init, __entry->rx_desc_count, __entry->rx_mc->pa, __entry->tx_mc->pa, - __entry->rxc_mc == NULL ? 0 : __entry->rxc_mc->pa, - __entry->port + __entry->rxc_mc == NULL ? 
0 : __entry->rxc_mc->pa )); TRACE_EVENT(dma_queue_release, diff --git a/postinstall b/postinstall index e2f44e5..393ad80 100755 --- a/postinstall +++ b/postinstall @@ -1,4 +1,4 @@ -#!/bin/sh -e +#!/usr/bin/sh -e rm -f "/etc/modules-load.d/neuron.conf" echo "neuron" | tee -a /etc/modules-load.d/neuron.conf diff --git a/postremove b/postremove index 3425aa9..d988ee3 100755 --- a/postremove +++ b/postremove @@ -1,4 +1,4 @@ -#!/bin/sh -e +#!/usr/bin/sh -e NEURON_PRES=$(dkms status | grep neuron) if [ -z "${NEURON_PRES}" ]; then diff --git a/share/neuron_driver_shared.h b/share/neuron_driver_shared.h index 030c19f..314f3a2 100644 --- a/share/neuron_driver_shared.h +++ b/share/neuron_driver_shared.h @@ -18,6 +18,8 @@ enum neuron_driver_feature_flag { NEURON_DRIVER_FEATURE_MEM_ALLOC64 = 1ull << 6, NEURON_DRIVER_FEATURE_CONTIGUOUS_SCRATCHPAD = 1ull << 7, NEURON_DRIVER_FEATURE_ZEROCOPY = 1ull << 8, + NEURON_DRIVER_FEATURE_PINNED_HOST_MEM = 1ull << 9, + NEURON_DRIVER_FEATURE_ALLOC_WITH_PA = 1ull << 10, }; // FIXME this should be more generic - like node type. @@ -189,6 +191,7 @@ struct neuron_ioctl_mem_chunk_info { #define NEURON_NC_MAP_MAX_ENTRIES 128 enum neuron_ioctl_nc_mapping_type { NEURON_IOCTL_NC_MAPPING_TYPE_V0 = 0, // seng swap mapping + NEURON_IOCTL_NC_MAPPING_TYPE_V1 = 1, // seng swap mapping but disable die-id flipping in ultra-server nodes. }; struct neuron_ioctl_nc_map_entry { __u32 device_id; @@ -210,7 +213,7 @@ typedef struct neuron_memcpy_batch { void *context; // [in] TBD. opaque context pointer passed back in completion queue } neuron_memcpy_batch_t; -/* H2D Completion Queue Entry (CQE) */ +/* H2D DMA Completion Queue Entry (CQE) */ typedef struct neuron_h2d_dma_compl_queue_entry { __u64 sequence_num; // Sequence number for the submitted IO request from runtime (0 means empty slot). __s64 compl_ret; // Completion status for the request (0 success; negative errno on failure; positive to be used for future). 
diff --git a/v2/neuron_dhal_v2.c b/v2/neuron_dhal_v2.c index 5fe4e61..f7c8de7 100644 --- a/v2/neuron_dhal_v2.c +++ b/v2/neuron_dhal_v2.c @@ -114,6 +114,18 @@ static int ndhal_register_funcs_inf2(void) { } +/* Device Arch Functions */ +/** + * narch_platform_ready() - return platform ready status + * Certain platform operations require the platform to be in a particular state + * + */ +static int narch_platform_ready_v2(struct neuron_device *nd, enum neuron_platform_operation_type platform_operation) +{ + return 0; +} + + /* Device Reset Functions */ static void nr_get_tpb_reset_map(uint32_t nc_map, uint32_t *tpb_reset_map) { @@ -492,6 +504,32 @@ static int ndmar_get_h2t_def_qid_v2(uint32_t nc_id) return 0; } +/** + * ndmar_ctx_queue_bit_v2() - dummy ctx queue bitmap mapping for v2 + * @h2d_eng_id: ignored + * @qid: ignored + * + * Async IO is not supported on v2, so this hook is unused. 
+ */ +static void ndmar_ctx_queue_from_bit_v2(int bit, uint32_t *h2d_eng_id, uint32_t *qid) +{ + *h2d_eng_id = 0; + *qid = 0; +} + /** * ndmar_is_h2t_def_q() - return true * @@ -751,9 +789,9 @@ static int fw_io_post_metric_v2(struct fw_io_ctx *ctx, u8 *data, u32 size) */ static int mmap_get_bar4_offset_v2(u64 start_addr, u64 size, u64 *offset) { - if (start_addr >= V2_HBM_0_BASE && start_addr + size < V2_HBM_0_BASE + V2_HBM_0_SIZE) + if (start_addr >= V2_HBM_0_BASE && start_addr + size <= V2_HBM_0_BASE + V2_HBM_0_SIZE) *offset = start_addr; - else if (start_addr >= V2_HBM_1_BASE && start_addr + size < V2_HBM_1_BASE + V2_HBM_1_SIZE) + else if (start_addr >= V2_HBM_1_BASE && start_addr + size <= V2_HBM_1_BASE + V2_HBM_1_SIZE) // The 64GB - 80GB range is mapped to 16GB - 32GB on bar4 *offset = start_addr - V2_HBM_1_BASE + V2_HBM_0_SIZE; else @@ -1382,6 +1420,7 @@ int ndhal_register_funcs_v2(void) { return -EINVAL; } + ndhal->ndhal_arch.narch_platform_ready = narch_platform_ready_v2; ndhal->ndhal_address_map.pci_host_base = V2_PCIE_A0_BASE; ndhal->ndhal_address_map.mmap_nc_event_offset = V2_MMAP_NC_EVENT_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V2_MMAP_NC_SEMA_READ_OFFSET; @@ -1389,7 +1428,6 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_address_map.mmap_nc_sema_incr_offset = V2_MMAP_NC_SEMA_INCR_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V2_MMAP_NC_SEMA_DECR_OFFSET; ndhal->ndhal_address_map.bar0_misc_ram_offset = V2_MMAP_BAR0_APB_MISC_RAM_OFFSET; - ndhal->ndhal_address_map.port_1_base = 0ull; ndhal->ndhal_address_map.nc_per_device = V2_NC_PER_DEVICE; ndhal->ndhal_address_map.dev_nc_map = (1 << V2_NC_PER_DEVICE) - 1; ndhal->ndhal_address_map.dice_per_device = V2_NUM_DIE_PER_DEVICE; @@ -1414,10 +1452,13 @@ int ndhal_register_funcs_v2(void) { ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v2; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v2; 
ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v2; + ndhal->ndhal_ndmar.ndmar_ctx_queue_bit = ndmar_ctx_queue_bit_v2; + ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit = ndmar_ctx_queue_from_bit_v2; ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v2; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v2; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v2; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v2; + ndhal->ndhal_fw_io.new_readless_read_min_api_version = U32_MAX; ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v2; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v2; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v2; diff --git a/v3/neuron_dhal_v3.c b/v3/neuron_dhal_v3.c index 40e683e..e70d99d 100644 --- a/v3/neuron_dhal_v3.c +++ b/v3/neuron_dhal_v3.c @@ -40,6 +40,10 @@ int force_die_flip = 0; module_param(force_die_flip, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); MODULE_PARM_DESC(force_die_flip, "Force Neuron Core Mapping APIs to give back DIE flip mappings"); +bool enable_sysfs_health_status_nodes = true; +module_param(enable_sysfs_health_status_nodes, bool, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); +MODULE_PARM_DESC(enable_sysfs_health_status_nodes, "Enable sysfs device health_status nodes"); + // TOP SP addresses are sparse on chip adjust to accommodate the table macro // #define V3_TOP_SP_GRP1_BASE V3_TOP_SP_0_BASE @@ -262,6 +266,23 @@ static enum neuron_platform_type ndhal_platform_type_v3(void) return platform_type; } + +/* Device Arch Functions */ +/** + * narch_platform_ready() - return platform ready status + * Certain platform operations require the platform to be in a particular state + * + */ +static int narch_platform_ready_v3(struct neuron_device *nd, enum neuron_platform_operation_type platform_operation) +{ + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { + return 0; + } else { + return npe_platform_ready(nd, 
platform_operation); + } +} + + /* Device Reset Functions */ /** * nr_get_tpb_reset_map() - generates a the reset map of all resources associated with resetting a particular TPB @@ -332,15 +353,19 @@ static int nr_initiate_reset_v3(struct neuron_device *nd, uint32_t nc_map) static int nr_initiate_reset_v3_qemu(struct neuron_device *nd, uint32_t nc_map) { + uint32_t reset_val = nc_map; uint32_t tpb_reset_map_lo = 0, tpb_reset_map_hi = 0; volatile void *addr; if (no_reset) return 0; - nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi); + if (nc_map != NEURON_NC_MAP_DEVICE) { + nr_get_tpb_reset_map(nc_map, &tpb_reset_map_lo, &tpb_reset_map_hi); + reset_val = tpb_reset_map_lo; + } addr = nd->npdev.bar0 + V3_PCIE_BAR0_APB_IO_0_OFFSET + V3_APB_IO_0_USER_SE_0_RESERVED2_RELBASE + 0x10; - writel(tpb_reset_map_lo, (volatile uint32_t *)addr); + writel(reset_val, (volatile uint32_t *)addr); return 0; } @@ -419,6 +444,7 @@ static int nr_post_reset_config_v3(struct neuron_device *nd, bool reset_successf } else { nd->supports_hbm_7200 = 0; } + nd->current_perf_profile = 0; if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_STD) { return 0; @@ -711,6 +737,30 @@ static int ndmar_get_h2t_def_qid_v3(uint32_t nc_id) return nc_id % V3_NC_PER_SENG; } +/** + * ndmar_ctx_queue_bit_v3() - map a V3 H2D queue to a dense bitmap index + * @h2d_eng_id: DMA engine id in the V3 H2D/D2H engine range (128 to 131) + * @qid: DMA queue id within the engine + * + * Return bitmap bit index for the queue. 
+ */ +static int ndmar_ctx_queue_bit_v3(uint32_t h2d_eng_id, uint32_t qid) +{ + return (h2d_eng_id - V3_D2H_0_IDX) * DMA_MAX_Q_V4 + qid; +} + +/** + * ndmar_ctx_queue_from_bit_v3() - map a dense bitmap index back to a V3 H2D queue + * @bit: bitmap bit index + * @h2d_eng_id: returned DMA engine id + * @qid: returned DMA queue id + */ +static void ndmar_ctx_queue_from_bit_v3(int bit, uint32_t *h2d_eng_id, uint32_t *qid) +{ + *h2d_eng_id = V3_D2H_0_IDX + (bit / DMA_MAX_Q_V4); + *qid = bit % DMA_MAX_Q_V4; +} + /** * ndmar_is_h2t_def_q() - return true * @@ -967,13 +1017,13 @@ static int mmap_get_bar4_offset_v3(u64 start_addr, u64 size, u64 *offset) { u64 hbm_dist = narch_is_qemu() ? (ndhal->ndhal_pci.dram_bar_size / 4) : V3_HBM_SIZE; - if (start_addr >= V3_HBM_0_BASE && start_addr + size < V3_HBM_0_BASE + V3_HBM_ACTIVE_SIZE) + if (start_addr >= V3_HBM_0_BASE && start_addr + size <= V3_HBM_0_BASE + V3_HBM_ACTIVE_SIZE) *offset = start_addr; - else if (start_addr >= V3_HBM_1_BASE && start_addr + size < V3_HBM_1_BASE + V3_HBM_ACTIVE_SIZE) + else if (start_addr >= V3_HBM_1_BASE && start_addr + size <= V3_HBM_1_BASE + V3_HBM_ACTIVE_SIZE) *offset = start_addr - V3_HBM_1_BASE + hbm_dist; - else if (start_addr >= V3_HBM_2_BASE && start_addr + size < V3_HBM_2_BASE + V3_HBM_ACTIVE_SIZE) + else if (start_addr >= V3_HBM_2_BASE && start_addr + size <= V3_HBM_2_BASE + V3_HBM_ACTIVE_SIZE) *offset = start_addr - V3_HBM_2_BASE + hbm_dist * 2; - else if (start_addr >= V3_HBM_3_BASE && start_addr + size < V3_HBM_3_BASE + V3_HBM_ACTIVE_SIZE) + else if (start_addr >= V3_HBM_3_BASE && start_addr + size <= V3_HBM_3_BASE + V3_HBM_ACTIVE_SIZE) *offset = start_addr - V3_HBM_3_BASE + hbm_dist * 3; else return -EINVAL; @@ -1010,6 +1060,7 @@ static int nsysfsmetric_add_ecc_nodes_v3(struct nsysfsmetric_metrics *metrics, pr_err("failed to add hardware node its attributes under stats\n"); return -1; } + metrics->hardware_node = hardware_node; return 0; } @@ -1026,48 +1077,44 @@ static void 
nsysfsmetric_get_hbm_error_count_v3(struct neuron_device *nd, uint32_t *err_count) { int ret; - uint32_t total_uncorrected_ecc_err_count; - uint32_t total_repairable_ecc_err_count; + uint32_t total_unrepairable_ecc_err_count = 0; + uint32_t total_repairable_ecc_err_count = 0; uint32_t ecc_repair_state; *err_count = 0; + // read regs 17-20 + fw_io_get_total_ecc_err_counts(nd->npdev.bar0, &total_unrepairable_ecc_err_count, &total_repairable_ecc_err_count); + + // read reg 25 ret = fw_io_hbm_uecc_repair_state_read(nd->npdev.bar0, &ecc_repair_state); if (ret) { pr_err("sysfs failed to read HBM ECC repair state from FWIO\n"); return; } - fw_io_get_total_ecc_err_counts(nd->npdev.bar0, &total_uncorrected_ecc_err_count, &total_repairable_ecc_err_count); - - /* - * HBM Repair State Bitfield notes: - * 2 bits to represent the state of hbm repair - * 0x0 means no pending repair - * 0x1 means pending repair - * 0x2 means repair failure - */ - if (total_uncorrected_ecc_err_count == 0 && ecc_repair_state != 0) { - // For legacy firmware, there might be the case that (err count > 0 && repair state == 0), so allow this case - // When err count = 0, repair state must be 0x0 - pr_warn_once("[ND %d] Total Uncorrected ecc err count is %d, but repair state is %d which is invalid. Please contact Neuron for support.\n", nd->device_index, total_uncorrected_ecc_err_count, ecc_repair_state); - return; + + if (ecc_repair_state > 0x2) { + pr_warn_once("[ND %d] HBM unexpected ecc_repair_state: 0x%x\n", nd->device_index, ecc_repair_state); + } + + + if (ecc_repair_state == 0x2) { // repair failure + total_unrepairable_ecc_err_count += 1; } - // We did not complete the repair for some reason, in this case we expect that the error count is non-zero since the repairs have - // not gone through yet. If it is zero notify the user since this is unexpected. 
if (ecc_repair_state == 0x1 && total_repairable_ecc_err_count == 0) { - pr_warn_once("[ND %d] HBM repairs were not completed, but no repairable ecc errors were reported, which is invalid. Please contact Neuron for support.\n", nd->device_index); - return; - } + /* Known race condition: it may take up to 5 seconds to have consistent regs, but we can't wait that long + in a sysfs read. Increment repairable UE by 1 */ + pr_warn_once("[ND %d] HBM pending_repair but no repairable errors\n", nd->device_index); + total_repairable_ecc_err_count += 1; + } - // We failed to repair ECC memory but have not encountered a UECC yet. Proactively notify the user of this since the ECC - // will be more susceptible to errors in the future. - if (ecc_repair_state == 0x2 && total_uncorrected_ecc_err_count == 0) { - pr_warn_once("[ND %d] HBM repair failed. No uncorrectable ecc errors detected, however memory will be more suseptible to corruption. Please contact Neuron for support.\n", nd->device_index); - return; + if (ecc_repair_state == 0x0 && total_repairable_ecc_err_count > 0) { + /* Unexpected / unknown race condition */ + pr_warn_once("[ND %d] HBM repairable errors but no pending_repair\n", nd->device_index); } - *err_count = (repairable) ? total_repairable_ecc_err_count : total_uncorrected_ecc_err_count; + *err_count = (repairable) ? total_repairable_ecc_err_count : total_unrepairable_ecc_err_count; } /** @@ -1538,11 +1585,18 @@ static const struct neuron_ioctl_nc_map_entry nc_mapping_v0_seng_swap[] = { static_assert((NC_MAPPING_V0_SENG_SWAP_SIZE == NC_MAPPING_MAX_CORE_COUNT_V3) && (NC_MAPPING_V0_SENG_SWAP_SIZE <= NEURON_NC_MAP_MAX_ENTRIES)); static const uint32_t neuron_nc_map_die_flip_mask = 0x6; -static bool ndhal_die_flipped(void) +static bool ndhal_die_flipped(enum neuron_ioctl_nc_mapping_type version) { u32 state; s8 node_id; + if (version == NEURON_IOCTL_NC_MAPPING_TYPE_V1) { + if (force_die_flip) { + pr_info("Runtime disabled die id flipping. 
overriding driver force mode"); + } + return false; + } + if (force_die_flip) { return true; } @@ -1559,12 +1613,12 @@ static bool ndhal_die_flipped(void) static int ncdev_logical_to_physical_nc_map_v3(struct neuron_ioctl_nc_map *map, uint32_t max_num_entries, enum neuron_ioctl_nc_mapping_type version) { - bool apply_dieflip = ndhal_die_flipped(); + bool apply_dieflip = ndhal_die_flipped(version); uint32_t entry_idx; uint32_t entries_to_copy = (max_num_entries < NC_MAPPING_MAX_CORE_COUNT_V3) ? max_num_entries : NC_MAPPING_MAX_CORE_COUNT_V3; const struct neuron_ioctl_nc_map_entry *mapping; - if (version != NEURON_IOCTL_NC_MAPPING_TYPE_V0) { + if (version != NEURON_IOCTL_NC_MAPPING_TYPE_V0 && version != NEURON_IOCTL_NC_MAPPING_TYPE_V1) { pr_err("Unsupported Neuron Core Mapping verion %u for v3 arch", version); return -EINVAL; } @@ -1705,7 +1759,7 @@ static int perf_set_profile_v3(struct neuron_device *nd, uint32_t profile) if (retval == 0) { nd->current_perf_profile = cur_profile; } else { - nd->current_perf_profile = 0; + nd->current_perf_profile = -1; } } return ret; @@ -1874,6 +1928,7 @@ int ndhal_register_funcs_v3(void) { } ndhal->ndhal_arch.platform_type = ndhal_platform_type_v3(); + ndhal->ndhal_arch.narch_platform_ready = narch_platform_ready_v3; ndhal->ndhal_address_map.pci_host_base = V3_PCIE_A0_BASE; ndhal->ndhal_address_map.mmap_nc_event_offset = V3_MMAP_NC_EVENT_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_read_offset = V3_MMAP_NC_SEMA_READ_OFFSET; @@ -1881,7 +1936,6 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_address_map.mmap_nc_sema_incr_offset = V3_MMAP_NC_SEMA_INCR_OFFSET; ndhal->ndhal_address_map.mmap_nc_sema_decr_offset = V3_MMAP_NC_SEMA_DECR_OFFSET; ndhal->ndhal_address_map.bar0_misc_ram_offset = V3_MMAP_BAR0_APB_IO_0_MISC_RAM_OFFSET; - ndhal->ndhal_address_map.port_1_base = 0ull; ndhal->ndhal_address_map.nc_per_device = V3_NC_PER_DEVICE; ndhal->ndhal_address_map.dev_nc_map = (1 << V3_NC_PER_DEVICE) - 1; 
ndhal->ndhal_address_map.dice_per_device = V3_NUM_DIE_PER_DEVICE; @@ -1906,10 +1960,13 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_eng_id = ndmar_get_h2t_eng_id_v3; ndhal->ndhal_ndmar.ndmar_get_h2t_def_qid = ndmar_get_h2t_def_qid_v3; + ndhal->ndhal_ndmar.ndmar_ctx_queue_bit = ndmar_ctx_queue_bit_v3; + ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit = ndmar_ctx_queue_from_bit_v3; ndhal->ndhal_ndmar.ndmar_is_h2t_def_q = ndmar_is_h2t_def_q_v3; ndhal->ndhal_ndmar.nr_init_h2t_eng = nr_init_h2t_eng_v3; ndhal->ndhal_ndmar.ndmar_is_nx_ring = ndmar_is_nx_ring_v3; ndhal->ndhal_ndmar.ndmar_quiesce_queues = ndmar_quiesce_queues_v3; + ndhal->ndhal_fw_io.new_readless_read_min_api_version = 7; ndhal->ndhal_fw_io.fw_io_topology = fw_io_topology_v3; ndhal->ndhal_fw_io.fw_io_register_readless_read_region = fw_io_register_readless_read_region_v3; ndhal->ndhal_fw_io.fw_io_read_csr_array = fw_io_read_csr_array_v3; @@ -1922,6 +1979,7 @@ int ndhal_register_funcs_v3(void) { ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_ecc_nodes = nsysfsmetric_add_ecc_nodes_v3; ndhal->ndhal_sysfs_metrics.nsysfsmetric_get_hbm_error_count = nsysfsmetric_get_hbm_error_count_v3; ndhal->ndhal_sysfs_metrics.nsysfsmetric_add_tensor_engine_node = nsysfsmetric_add_tensor_engine_node_v3; + ndhal->ndhal_sysfs_metrics.health_status_enabled = enable_sysfs_health_status_nodes; ndhal->ndhal_pci.axi_bar = BAR_UNUSED; ndhal->ndhal_pci.apb_bar = 0; ndhal->ndhal_pci.dram_bar = 4; diff --git a/v3/neuron_pelect.c b/v3/neuron_pelect.c index 26249c9..96b2949 100644 --- a/v3/neuron_pelect.c +++ b/v3/neuron_pelect.c @@ -96,6 +96,9 @@ * - If an election fails due to broken links, we attempt to run the election using only one link pair in an attempt to for two 2-node pairs. * Currently we first attempt this on the right link, then if that fails, attempt the election again on the left link. 
* + * - Impact of device reset failure on election/configuration + * - If any device fails reset, the election is declared a failure and the platform will default to running single instance mode + * * Election Results: * Results of the election are reported in sysfs under /sys/class/neuron_device. * @@ -356,7 +359,7 @@ static int npe_pod_neighbor_io_init(pod_neighbor_io_t* pnio, struct neuron_devic goto done; } - ret = ndmar_queue_init(nd, pnio->eng_id, 0, pnio->ring_size, pnio->ring_size, pnio->tx_mc, pnio->rx_mc, NULL, 0, true); + ret = ndmar_queue_init(nd, pnio->eng_id, 0, pnio->ring_size, pnio->ring_size, pnio->tx_mc, pnio->rx_mc, NULL, true); if (ret) { pr_err("pod election io queue init failed"); goto done; @@ -1150,8 +1153,10 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) int node_cnt; u32 lr_neighbor_mask; u64 pod_serial_number; - + + // Declare election/configuration failed if any device fails reset if (!reset_successful) { + ndhal_pelect_data.pod_state_internal = NEURON_NPE_POD_ST_ELECTION_FAILURE; return 0; } @@ -1201,8 +1206,13 @@ int npe_election_exec_on_rst(struct neuron_device *nd, bool reset_successful) // if we aren't kicking off election on first driver reset (testing) or // if we aren't in init state then we've already made an election decision. + // Since election is happening for ultra server, skipping the election is + // applied for the ultra-server only. In case of PDS, ignore election skip + // ctl. 
// - if ((ndhal_pelect_data.pod_state_internal != NEURON_NPE_POD_ST_INIT) || npe_pod_ctl_is_set(NPE_POD_CTL_RST_SKIP_ELECTION)) { + if ((ndhal_pelect_data.pod_state_internal != NEURON_NPE_POD_ST_INIT) || + (ndhal->ndhal_arch.platform_type != NEURON_PLATFORM_TYPE_PDS && + npe_pod_ctl_is_set(NPE_POD_CTL_RST_SKIP_ELECTION))) { goto done; } @@ -1635,6 +1645,47 @@ int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mod return ret; } +/** + * npe_platform_ready() - check if the platform can support a given operation + * + * UltraServers and PDS have to collect configuration data locally or from neighbors; + * this function determines if the platform supports a particular operation. + * Currently the only thing we wait on is for config data to be available on PDS + * droplets. The time it takes to collect config data is constrained by reset time + * across devices, so we block opens until config is complete on PDS servers. + * + */ +int npe_platform_ready(struct neuron_device *nd, enum neuron_platform_operation_type platform_operation) +{ + if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_PDS) { + switch (platform_operation) { + case NEURON_PLATFORM_OP_TYPE_DEVOPEN: + if (npe_pod_state_busy()) { + return -EBUSY; + } + return 0; + + case NEURON_PLATFORM_OP_TYPE_EXEC: + return 0; + + default: + break; + } + } else if (ndhal->ndhal_arch.platform_type == NEURON_PLATFORM_TYPE_ULTRASERVER) { + switch (platform_operation) { + case NEURON_PLATFORM_OP_TYPE_DEVOPEN: + return 0; + + case NEURON_PLATFORM_OP_TYPE_EXEC: + return 0; + + default: + break; + } + } + return 0; +} + static int npe_election_thread_fn(void *arg) { int ret; diff --git a/v3/neuron_pelect.h b/v3/neuron_pelect.h index 2e9f4a2..06f87ff 100644 --- a/v3/neuron_pelect.h +++ b/v3/neuron_pelect.h @@ -84,6 +84,12 @@ int npe_get_pod_status(u32 *state, u8 *node_id); */ int npe_pod_ctrl(struct neuron_device *nd, u32 ctrl, enum neuron_ultraserver_mode mode, u32 timeout, u32 *state); +/** + 
* npe_platform_ready() - return platform readiness for specified operation + * + */ +int npe_platform_ready(struct neuron_device *nd, enum neuron_platform_operation_type platform_operation); + /** * npe_class_node_id_show_data() - return sysfs class node_id * diff --git a/v4/neuron_dhal_v4.c b/v4/neuron_dhal_v4.c index 72b87c2..9ad23df 100644 --- a/v4/neuron_dhal_v4.c +++ b/v4/neuron_dhal_v4.c @@ -149,8 +149,9 @@ static int ndhal_register_funcs_trn3(void) { /* Instance names */ -#define NEURON_TRN3PDS_INSTANCE_NAME "trn3s.48xlarge" +#define NEURON_TRN3PDS_INSTANCE_NAME "trn3.48xlarge" #define NEURON_TRN3PDS0_INSTANCE_NAME "trn3-dev0.48xlarge" +#define NEURON_TRN3PDS1_INSTANCE_NAME "trn3-dev1.48xlarge" #define NEURON_TRN3P_INSTANCE_NAME "trn3p.48xlarge" static enum neuron_platform_type ndhal_platform_type_v4(void) @@ -161,7 +162,9 @@ static enum neuron_platform_type ndhal_platform_type_v4(void) if (narch_get_instance_type_name(buf, sizeof(buf))) goto done; if ((strncmp(buf, NEURON_TRN3PDS_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) { platform_type = NEURON_PLATFORM_TYPE_PDS; - } else if ((strncmp(buf, NEURON_TRN3PDS0_INSTANCE_NAME, sizeof(NEURON_TRN3PDS_INSTANCE_NAME)-1) == 0)) { + } else if ((strncmp(buf, NEURON_TRN3PDS0_INSTANCE_NAME, sizeof(NEURON_TRN3PDS0_INSTANCE_NAME)-1) == 0)) { + platform_type = NEURON_PLATFORM_TYPE_PDS; + } else if ((strncmp(buf, NEURON_TRN3PDS1_INSTANCE_NAME, sizeof(NEURON_TRN3PDS1_INSTANCE_NAME)-1) == 0)) { platform_type = NEURON_PLATFORM_TYPE_PDS; } else if ((strncmp(buf, NEURON_TRN3P_INSTANCE_NAME, sizeof(NEURON_TRN3P_INSTANCE_NAME)-1) == 0)) { platform_type = NEURON_PLATFORM_TYPE_ULTRASERVER; @@ -169,6 +172,9 @@ static enum neuron_platform_type ndhal_platform_type_v4(void) platform_type = NEURON_PLATFORM_TYPE_STD; } + if (narch_is_qemu() || narch_is_emu()) + platform_type = NEURON_PLATFORM_TYPE_STD; + done: return platform_type; } @@ -187,6 +193,32 @@ static bool ndhal_instance_type_3xl(void) return 
instance_type_is_3xl; } +/** + * ndmar_ctx_queue_bit_v4() - dummy ctx queue bitmap mapping for v4 + * @h2d_eng_id: ignored + * @qid: ignored + * + * Async IO is not supported on v4 yet, so this hook is unused. + */ +static int ndmar_ctx_queue_bit_v4(uint32_t h2d_eng_id, uint32_t qid) +{ + return 0; +} + +/** + * ndmar_ctx_queue_from_bit_v4() - dummy ctx queue bitmap reverse mapping for v4 + * @bit: ignored + * @h2d_eng_id: returned DMA engine id placeholder + * @qid: returned DMA queue id placeholder + * + * Async IO is not supported on v4 yet, so this hook is unused. + */ +static void ndmar_ctx_queue_from_bit_v4(int bit, uint32_t *h2d_eng_id, uint32_t *qid) +{ + *h2d_eng_id = 0; + *qid = 0; +} + /* Memory Pool Functions */ /** @@ -252,13 +284,13 @@ static int mmap_get_bar4_offset_v4(u64 start_addr, u64 size, u64 *offset) { u64 hbm_dist = narch_is_qemu() ? (ndhal->ndhal_pci.dram_bar_size / 4) : V4_HBM_SIZE; - if (start_addr >= V4_HBM_0_BASE && start_addr + size < V4_HBM_0_BASE + V4_HBM_ACTIVE_SIZE) + if (start_addr >= V4_HBM_0_BASE && start_addr + size <= V4_HBM_0_BASE + V4_HBM_ACTIVE_SIZE) *offset = start_addr; - else if (start_addr >= V4_HBM_1_BASE && start_addr + size < V4_HBM_1_BASE + V4_HBM_ACTIVE_SIZE) + else if (start_addr >= V4_HBM_1_BASE && start_addr + size <= V4_HBM_1_BASE + V4_HBM_ACTIVE_SIZE) *offset = start_addr - V4_HBM_1_BASE + hbm_dist; - else if (start_addr >= V4_HBM_2_BASE && start_addr + size < V4_HBM_2_BASE + V4_HBM_ACTIVE_SIZE) + else if (start_addr >= V4_HBM_2_BASE && start_addr + size <= V4_HBM_2_BASE + V4_HBM_ACTIVE_SIZE) *offset = start_addr - V4_HBM_2_BASE + hbm_dist * 2; - else if (start_addr >= V4_HBM_3_BASE && start_addr + size < V4_HBM_3_BASE + V4_HBM_ACTIVE_SIZE) + else if (start_addr >= V4_HBM_3_BASE && start_addr + size <= V4_HBM_3_BASE + V4_HBM_ACTIVE_SIZE) *offset = start_addr - V4_HBM_3_BASE + hbm_dist * 3; else return -EINVAL; @@ -436,11 +468,14 @@ int ndhal_register_funcs_v4(void) { } ndhal->ndhal_arch.platform_type = 
ndhal_platform_type_v4(); + ndhal->ndhal_fw_io.new_readless_read_min_api_version = 6; ndhal->ndhal_pci.neuron_pci_get_device_id = neuron_pci_get_device_id_v4; ndhal->ndhal_npe.npe_neighbor_eng_ids = npe_neighbor_eng_ids_v4; ndhal->ndhal_mpset.mpset_set_dram_and_mpset_info = mpset_set_dram_and_mpset_info_v4; ndhal->ndhal_mmap.dm_mmap_special = dm_mmap_special_v4; ndhal->ndhal_mmap.mmap_get_bar4_offset = mmap_get_bar4_offset_v4; + ndhal->ndhal_ndmar.ndmar_ctx_queue_bit = ndmar_ctx_queue_bit_v4; + ndhal->ndhal_ndmar.ndmar_ctx_queue_from_bit = ndmar_ctx_queue_from_bit_v4; ndhal->ndhal_cdev.ncdev_mem_regions = ncdev_mem_regions_v4; ndhal->ndhal_perf.perf_update_hbm_7200_supported = perf_update_hbm_7200_supported_v4;